ronin-web-spider 0.1.0.beta2 → 0.1.1

This diff compares the contents of two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta2
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-01 00:00:00.000000000 Z
11
+ date: 2024-06-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.0.0.beta1
33
+ version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.0.0.beta1
40
+ version: '1.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,7 @@ files:
66
66
  - ".github/workflows/ruby.yml"
67
67
  - ".gitignore"
68
68
  - ".rspec"
69
+ - ".rubocop.yml"
69
70
  - ".ruby-version"
70
71
  - ".yardopts"
71
72
  - COPYING.txt
@@ -81,20 +82,14 @@ files:
81
82
  - lib/ronin/web/spider/git_archive.rb
82
83
  - lib/ronin/web/spider/version.rb
83
84
  - ronin-web-spider.gemspec
84
- - spec/agent_spec.rb
85
- - spec/archive_spec.rb
86
- - spec/example_app.rb
87
- - spec/git_archive_spec.rb
88
- - spec/spec_helper.rb
89
- - spec/spider_spec.rb
90
85
  homepage: https://ronin-rb.dev/
91
86
  licenses:
92
87
  - LGPL-3.0
93
88
  metadata:
94
- documentation_uri: https://rubydoc.info/gems/ronin-web-spider
89
+ documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
95
90
  source_code_uri: https://github.com/ronin-rb/ronin-web-spider
96
91
  bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
97
- changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
92
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
98
93
  rubygems_mfa_required: 'true'
99
94
  post_install_message:
100
95
  rdoc_options: []
@@ -111,12 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
111
106
  - !ruby/object:Gem::Version
112
107
  version: '0'
113
108
  requirements: []
114
- rubygems_version: 3.3.26
109
+ rubygems_version: 3.3.27
115
110
  signing_key:
116
111
  specification_version: 4
117
- summary: collection of common web spidering routines
118
- test_files:
119
- - spec/agent_spec.rb
120
- - spec/archive_spec.rb
121
- - spec/git_archive_spec.rb
122
- - spec/spider_spec.rb
112
+ summary: A collection of common web spidering routines.
113
+ test_files: []
data/spec/agent_spec.rb DELETED
@@ -1,585 +0,0 @@
1
- require 'spec_helper'
2
- require 'ronin/web/spider/agent'
3
-
4
- require 'webmock/rspec'
5
- require 'sinatra/base'
6
-
7
- describe Ronin::Web::Spider::Agent do
8
- describe "#initialize" do
9
- context "when Ronin::Support::Network::HTTP.proxy is set" do
10
- let(:proxy_host) { 'example.com' }
11
- let(:proxy_port) { 8080 }
12
- let(:proxy_uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
13
-
14
- before { Ronin::Support::Network::HTTP.proxy = proxy_uri }
15
-
16
- it "must parse ENV['RONIN_HTTP_USER_AGENT'] and set #proxy" do
17
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
18
- expect(subject.proxy.host).to eq(proxy_host)
19
- expect(subject.proxy.port).to eq(proxy_port)
20
- end
21
-
22
- after { Ronin::Support::Network::HTTP.proxy = nil }
23
- end
24
-
25
- context "when Ronin::Support::Network::HTTP.user_agent is set" do
26
- let(:user_agent) { 'Foo Bar' }
27
-
28
- before { Ronin::Support::Network::HTTP.user_agent = user_agent }
29
-
30
- it "must default #user_agent to ENV['RONIN_HTTP_USER_AGENT']" do
31
- expect(subject.user_agent).to eq(user_agent)
32
- end
33
-
34
- after { Ronin::Support::Network::HTTP.user_agent = nil }
35
- end
36
-
37
- context "when given the proxy: keyword argument" do
38
- let(:proxy_host) { 'example.com' }
39
- let(:proxy_port) { 8080 }
40
-
41
- context "and it's an Addressable::URI" do
42
- let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) }
43
-
44
- subject { described_class.new(proxy: proxy) }
45
-
46
- it "must convert it to a Spidr::Proxy object" do
47
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
48
- expect(subject.proxy.host).to eq(proxy_host)
49
- expect(subject.proxy.port).to eq(proxy_port)
50
- end
51
- end
52
-
53
- context "and it's an URI::HTTP" do
54
- let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
55
-
56
- subject { described_class.new(proxy: proxy) }
57
-
58
- it "must convert it to a Spidr::Proxy object" do
59
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
60
- expect(subject.proxy.host).to eq(proxy_host)
61
- expect(subject.proxy.port).to eq(proxy_port)
62
- end
63
- end
64
-
65
- context "and it's a Hash" do
66
- let(:proxy) do
67
- {host: proxy_host, port: proxy_port}
68
- end
69
-
70
- subject { described_class.new(proxy: proxy) }
71
-
72
- it "must convert it to a Spidr::Proxy object" do
73
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
74
- expect(subject.proxy.host).to eq(proxy_host)
75
- expect(subject.proxy.port).to eq(proxy_port)
76
- end
77
- end
78
-
79
- context "and it's a String" do
80
- let(:proxy) { "http://#{proxy_host}:#{proxy_port}" }
81
-
82
- subject { described_class.new(proxy: proxy) }
83
-
84
- it "must convert it to a Spidr::Proxy object" do
85
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
86
- expect(subject.proxy.host).to eq(proxy_host)
87
- expect(subject.proxy.port).to eq(proxy_port)
88
- end
89
- end
90
- end
91
-
92
- context "when given the user_agent: keyword argument" do
93
- context "and it's a String" do
94
- let(:user_agent) { "test user-agent" }
95
-
96
- subject { described_class.new(user_agent: user_agent) }
97
-
98
- it "must set the #user_agent" do
99
- expect(subject.user_agent).to eq(user_agent)
100
- end
101
- end
102
-
103
- context "and it's a Symbol" do
104
- let(:user_agent) { :chrome_linux }
105
- let(:expected_user_agent) do
106
- Ronin::Support::Network::HTTP::UserAgents[user_agent]
107
- end
108
-
109
- subject { described_class.new(user_agent: user_agent) }
110
-
111
- it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do
112
- expect(subject.user_agent).to eq(expected_user_agent)
113
- end
114
- end
115
- end
116
-
117
- it "must default #visited_hosts to nil" do
118
- expect(subject.visited_hosts).to be(nil)
119
- end
120
- end
121
-
122
- describe "#every_host" do
123
- module TestAgentEveryHost
124
- class Host1 < Sinatra::Base
125
-
126
- set :host, 'host1.example.com'
127
- set :port, 80
128
-
129
- get '/' do
130
- <<~HTML
131
- <html>
132
- <body>
133
- <a href="/link1">link1</a>
134
- <a href="http://host2.example.com/offsite-link">offsite link</a>
135
- <a href="/link2">link2</a>
136
- </body>
137
- </html>
138
- HTML
139
- end
140
-
141
- get '/link1' do
142
- '<html><body>got here</body></html>'
143
- end
144
-
145
- get '/link2' do
146
- '<html><body>got here</body></html>'
147
- end
148
- end
149
-
150
- class Host2 < Sinatra::Base
151
-
152
- set :host, 'host2.example.com'
153
- set :port, 80
154
-
155
- get '/offsite-link' do
156
- '<html><body>should not get here</body></html>'
157
- end
158
-
159
- end
160
- end
161
-
162
- let(:host1) { 'host1.example.com' }
163
- let(:host2) { 'host2.example.com' }
164
-
165
- let(:host1_app) { TestAgentEveryHost::Host1 }
166
- let(:host2_app) { TestAgentEveryHost::Host2 }
167
-
168
- before do
169
- stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app)
170
- stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app)
171
- end
172
-
173
- it "must yield every newly discovered hostname while spidering" do
174
- yielded_hosts = []
175
-
176
- subject.every_host do |host|
177
- yielded_hosts << host
178
- end
179
-
180
- subject.start_at("http://#{host1}/")
181
-
182
- expect(yielded_hosts).to eq([host1, host2])
183
- end
184
-
185
- it "must popualte #visited_hosts" do
186
- subject.every_host { |host| }
187
- subject.start_at("http://#{host1}/")
188
-
189
- expect(subject.visited_hosts).to be_kind_of(Set)
190
- expect(subject.visited_hosts.entries).to eq([host1, host2])
191
- end
192
- end
193
-
194
- # TODO: need to figure out how to test #every_cert using webmock.
195
- describe "#every_cert"
196
-
197
- describe "#every_favicon" do
198
- module TestAgentEveryHost
199
- class TestApp < Sinatra::Base
200
-
201
- set :host, 'example.com'
202
- set :port, 80
203
-
204
- get '/' do
205
- <<~HTML
206
- <html>
207
- <head>
208
- <link rel="favicon" href="/favicon1.ico" type="image/x-icon"/>
209
- </head>
210
- <body>
211
- <a href="/link1">link1</a>
212
- <a href="http://host2.example.com/offsite-link">offsite link</a>
213
- <a href="/link2">link2</a>
214
- </body>
215
- </html>
216
- HTML
217
- end
218
-
219
- get '/favicon1.ico' do
220
- content_type 'image/x-icon'
221
-
222
- "favicon1"
223
- end
224
-
225
- get '/favicon2.ico' do
226
- content_type 'image/vnd.microsoft.icon'
227
-
228
- "favicon2"
229
- end
230
-
231
- get '/link1' do
232
- '<html><body>got here</body></html>'
233
- end
234
-
235
- get '/link2' do
236
- <<~HTML
237
- <html>
238
- <head>
239
- <link rel="favicon" href="/favicon2.ico" type="image/x-icon"/>
240
- </head>
241
- <body>got here</body>
242
- </html>
243
- HTML
244
- end
245
- end
246
- end
247
-
248
- let(:host) { 'example.com' }
249
-
250
- let(:test_app) { TestAgentEveryHost::TestApp }
251
-
252
- before do
253
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
254
- end
255
-
256
- it "must yield Spidr::Page objects for each encountered .ico file" do
257
- yielded_favicons = []
258
-
259
- subject.every_favicon do |favicon|
260
- yielded_favicons << favicon
261
- end
262
-
263
- subject.start_at("http://#{host}/")
264
-
265
- expect(yielded_favicons).to_not be_empty
266
-
267
- expect(yielded_favicons[0]).to be_kind_of(Spidr::Page)
268
- expect(yielded_favicons[0].content_type).to eq('image/x-icon')
269
- expect(yielded_favicons[0].url).to eq(URI("http://#{host}/favicon1.ico"))
270
-
271
- expect(yielded_favicons[1]).to be_kind_of(Spidr::Page)
272
- expect(yielded_favicons[1].content_type).to eq('image/vnd.microsoft.icon')
273
- expect(yielded_favicons[1].url).to eq(URI("http://#{host}/favicon2.ico"))
274
- end
275
- end
276
-
277
- describe "#every_html_comment" do
278
- module TestAgentEveryHTMLComment
279
- class TestApp < Sinatra::Base
280
-
281
- set :host, 'example.com'
282
- set :port, 80
283
-
284
- get '/' do
285
- <<~HTML
286
- <html>
287
- <head>
288
- <!-- comment 1 -->
289
- </head>
290
- <!-- -->
291
- <body>
292
- <!-- comment 2 -->
293
- </body>
294
- </html>
295
- HTML
296
- end
297
- end
298
- end
299
-
300
- let(:host) { 'example.com' }
301
-
302
- let(:test_app) { TestAgentEveryHTMLComment::TestApp }
303
-
304
- before do
305
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
306
- end
307
-
308
- it "must yield every non-empty/non-whitespace HTML comment String" do
309
- yielded_comments = []
310
-
311
- subject.every_html_comment do |comment|
312
- yielded_comments << comment
313
- end
314
-
315
- subject.start_at("http://#{host}/")
316
-
317
- expect(yielded_comments).to match_array(
318
- [
319
- 'comment 1',
320
- 'comment 2'
321
- ]
322
- )
323
- end
324
- end
325
-
326
- describe "#every_javascript" do
327
- module TestAgentEveryJavaScript
328
- class TestApp < Sinatra::Base
329
-
330
- set :host, 'example.com'
331
- set :port, 80
332
-
333
- get '/' do
334
- <<~HTML
335
- <html>
336
- <head>
337
- <script type="text/javascript" src="/javascript1.js"></script>
338
- <script type="text/javascript">javascript2</script>
339
- </head>
340
- <body>
341
- <a href="/link1">link1</a>
342
- <a href="http://host2.example.com/offsite-link">offsite link</a>
343
- <a href="/link2">link2</a>
344
- </body>
345
- </html>
346
- HTML
347
- end
348
-
349
- get '/javascript1.js' do
350
- content_type 'text/javascript'
351
- "javascript1"
352
- end
353
- end
354
- end
355
-
356
- let(:host) { 'example.com' }
357
-
358
- let(:test_app) { TestAgentEveryJavaScript::TestApp }
359
-
360
- before do
361
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
362
- end
363
-
364
- it "must yield both the contents of .js files and inline <script> tags" do
365
- yielded_javascripts = []
366
-
367
- subject.every_javascript do |js|
368
- yielded_javascripts << js
369
- end
370
-
371
- subject.start_at("http://#{host}/")
372
-
373
- expect(yielded_javascripts).to match_array(%w[javascript1 javascript2])
374
- end
375
- end
376
-
377
- describe "#every_javascript_string" do
378
- module TestAgentEveryJavaScriptString
379
- class TestApp < Sinatra::Base
380
-
381
- set :host, 'example.com'
382
- set :port, 80
383
-
384
- get '/' do
385
- <<~HTML
386
- <html>
387
- <head>
388
- <script type="text/javascript" src="/javascript1.js"></script>
389
- <script type="text/javascript">
390
- var str3 = "string #3";
391
- var str4 = 'string #4';
392
- </script>
393
- </head>
394
- <body>
395
- <a href="/link1">link1</a>
396
- <a href="http://host2.example.com/offsite-link">offsite link</a>
397
- <a href="/link2">link2</a>
398
- </body>
399
- </html>
400
- HTML
401
- end
402
-
403
- get '/javascript1.js' do
404
- content_type 'text/javascript'
405
- <<~JS
406
- var str1 = "string #1";
407
- var str2 = 'string #2';
408
- JS
409
- end
410
- end
411
- end
412
-
413
- let(:host) { 'example.com' }
414
-
415
- let(:test_app) { TestAgentEveryJavaScriptString::TestApp }
416
-
417
- before do
418
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
419
- end
420
-
421
- it "must yield every JavaScript string from any <script> tag" do
422
- yielded_javascript_strings = []
423
-
424
- subject.every_javascript_string do |string|
425
- yielded_javascript_strings << string
426
- end
427
-
428
- subject.start_at("http://#{host}/")
429
-
430
- expect(yielded_javascript_strings).to match_array(
431
- [
432
- 'string #1',
433
- 'string #2',
434
- 'string #3',
435
- 'string #4'
436
- ]
437
- )
438
- end
439
- end
440
-
441
- describe "#every_javascript_comment" do
442
- module TestAgentEveryJavaScriptComment
443
- class TestApp < Sinatra::Base
444
-
445
- set :host, 'example.com'
446
- set :port, 80
447
-
448
- get '/' do
449
- <<~HTML
450
- <html>
451
- <head>
452
- <script type="text/javascript" src="/javascript1.js"></script>
453
- <script type="text/javascript">
454
- // comment 3
455
- var str3 = "string #3";
456
- /*
457
- comment 4
458
- */
459
- var str4 = 'string #4';
460
- </script>
461
- </head>
462
- <body>
463
- <a href="/link1">link1</a>
464
- <a href="http://host2.example.com/offsite-link">offsite link</a>
465
- <a href="/link2">link2</a>
466
- </body>
467
- </html>
468
- HTML
469
- end
470
-
471
- get '/javascript1.js' do
472
- content_type 'text/javascript'
473
- <<~JS
474
- // comment 1
475
- var str1 = "string #1";
476
- /* comment 2 */
477
- var str2 = 'string #2';
478
- JS
479
- end
480
- end
481
- end
482
-
483
- let(:host) { 'example.com' }
484
-
485
- let(:test_app) { TestAgentEveryJavaScriptComment::TestApp }
486
-
487
- before do
488
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
489
- end
490
-
491
- it "must yield every JavaScript comment from any <script> tag" do
492
- yielded_javascript_comments = []
493
-
494
- subject.every_javascript_comment do |comment|
495
- yielded_javascript_comments << comment
496
- end
497
-
498
- subject.start_at("http://#{host}/")
499
-
500
- expect(yielded_javascript_comments).to match_array(
501
- [
502
- "// comment 1\n",
503
- "/* comment 2 */",
504
- "// comment 3\n",
505
- "/*\n comment 4\n */"
506
- ]
507
- )
508
- end
509
- end
510
-
511
- describe "#every_comment" do
512
- module TestAgentEveryComment
513
- class TestApp < Sinatra::Base
514
-
515
- set :host, 'example.com'
516
- set :port, 80
517
-
518
- get '/' do
519
- <<~HTML
520
- <html>
521
- <head>
522
- <!-- HTML comment 1 -->
523
- <script type="text/javascript" src="/javascript1.js"></script>
524
- <script type="text/javascript">
525
- // JavaScript comment 3
526
- var str3 = "string #3";
527
- /*
528
- JavaScript comment 4
529
- */
530
- var str4 = 'string #4';
531
- </script>
532
- </head>
533
- <!-- -->
534
- <body>
535
- <!-- HTML comment 2 -->
536
- <a href="/link1">link1</a>
537
- <a href="http://host2.example.com/offsite-link">offsite link</a>
538
- <a href="/link2">link2</a>
539
- </body>
540
- </html>
541
- HTML
542
- end
543
-
544
- get '/javascript1.js' do
545
- content_type 'text/javascript'
546
- <<~JS
547
- // JavaScript comment 1
548
- var str1 = "string #1";
549
- /* JavaScript comment 2 */
550
- var str2 = 'string #2';
551
- JS
552
- end
553
- end
554
- end
555
-
556
- let(:host) { 'example.com' }
557
-
558
- let(:test_app) { TestAgentEveryComment::TestApp }
559
-
560
- before do
561
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
562
- end
563
-
564
- it "must yield every HTML and JavaScript comment from any <script> tag" do
565
- yielded_comments = []
566
-
567
- subject.every_comment do |comment|
568
- yielded_comments << comment
569
- end
570
-
571
- subject.start_at("http://#{host}/")
572
-
573
- expect(yielded_comments).to match_array(
574
- [
575
- "HTML comment 1",
576
- "// JavaScript comment 1\n",
577
- "/* JavaScript comment 2 */",
578
- "// JavaScript comment 3\n",
579
- "/*\n JavaScript comment 4\n */",
580
- "HTML comment 2"
581
- ]
582
- )
583
- end
584
- end
585
- end