ronin-web-spider 0.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,137 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/git_archive'
3
+
4
+ require 'tmpdir'
5
+
6
+ describe Ronin::Web::Spider::GitArchive do
7
+ let(:root) { Dir.mktmpdir('ronin-web-spider') } # fresh temporary archive root, created when first referenced in an example
8
+
9
+ describe ".open" do
10
+ subject { described_class }
11
+
12
+ context "when the root directory does not already exist" do
13
+ let(:root) { File.join(Dir.tmpdir,'ronin-web-spider-new-dir') }
14
+
15
+ it "must run `git init` on the new archive directory" do
16
+ subject.open(root)
17
+
18
+ expect(File.directory?(File.join(root,'.git'))).to be(true)
19
+ end
20
+
21
+ after { FileUtils.rm_rf(root) } # rm_rf: do not raise (and mask the real failure) when .open never created the directory
22
+ end
23
+
24
+ context "when the root directory already exists" do
25
+ context "but does not contain a .git directory" do
26
+ it "must run `git init` within the root directory" do
27
+ subject.open(root)
28
+
29
+ expect(File.directory?(File.join(root,'.git'))).to be(true)
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ subject { described_class.open(root) }
36
+
37
+ describe "#git?" do
38
+ subject { described_class.new(root) }
39
+
40
+ context "when the archive directory contains a .git directory" do
41
+ before do
42
+ FileUtils.mkdir(File.join(root,'.git'))
43
+ end
44
+
45
+ it "must return true" do
46
+ expect(subject.git?).to be(true)
47
+ end
48
+ end
49
+
50
+ context "when the archive directory does not contains a .git directory" do
51
+ it "must return false" do
52
+ expect(subject.git?).to be(false)
53
+ end
54
+ end
55
+ end
56
+
57
+ describe "#init" do
58
+ it "must run the 'git init' command" do
59
+ expect(subject).to receive(:system).with('git','-C',root,'init').and_return(true)
60
+
61
+ subject.init
62
+ end
63
+
64
+ context "when the 'git init' command fails" do
65
+ it do
66
+ allow(subject).to receive(:system).with('git','-C',root,'init').and_return(false)
67
+
68
+ expect {
69
+ subject.init
70
+ }.to raise_error(Ronin::Web::Spider::GitError,"git command failed: git -C #{root} init")
71
+ end
72
+ end
73
+
74
+ context "when 'git' is not installed" do
75
+ it do
76
+ allow(subject).to receive(:system).with('git','-C',root,'init').and_return(nil)
77
+
78
+ expect {
79
+ subject.init
80
+ }.to raise_error(Ronin::Web::Spider::GitError,"the git command was not found")
81
+ end
82
+ end
83
+ end
84
+
85
+ describe "#write" do
86
+ let(:url) { URI('https://example.com/foo/bar.html') }
87
+ let(:body) { 'test file' }
88
+
89
+ it "must automatically create parent directory" do
90
+ subject.write(url,body)
91
+
92
+ expect(File.directory?(File.join(root,'foo'))).to be(true)
93
+ end
94
+
95
+ it "must write the body into the file" do
96
+ subject.write(url,body)
97
+
98
+ expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
99
+ end
100
+
101
+ it "must add the file using `git add`" do
102
+ absolute_path = File.join(root,'foo','bar.html')
103
+
104
+ expect(subject).to receive(:system).with(
105
+ 'git', '-C', root, 'add', absolute_path
106
+ ).and_return(true)
107
+
108
+ subject.write(url,body)
109
+ end
110
+ end
111
+
112
+ describe "#commit" do
113
+ let(:message) { 'commit message' }
114
+
115
+ context "when a block is given" do
116
+ it "must yield control before calling `git commit -m ...` with the commit message" do
117
+ expect(subject).to receive(:system).with(
118
+ 'git', '-C', root, 'commit', '-m', message
119
+ ).and_return(true)
120
+
121
+ expect { |b|
122
+ subject.commit(message,&b)
123
+ }.to yield_with_args(subject)
124
+ end
125
+ end
126
+
127
+ context "when no block is given" do
128
+ it "must not yield and call `git commit -m ...` with the commit message" do
129
+ expect(subject).to receive(:system).with(
130
+ 'git', '-C', root, 'commit', '-m', message
131
+ ).and_return(true)
132
+
133
+ subject.commit(message)
134
+ end
135
+ end
136
+ end
137
+ end
@@ -0,0 +1,4 @@
1
+ require 'rspec'
2
+ require 'simplecov'
3
+
4
+ SimpleCov.start
@@ -0,0 +1,252 @@
1
+ require 'spec_helper'
2
+ require 'example_app'
3
+
4
+ require 'ronin/web/spider'
5
+
6
+ describe Ronin::Web::Spider do
7
+ include_context "example App"
8
+
9
+ describe ".start_at" do
10
+ module TestAgentStartAt
11
+ class ExampleApp < Sinatra::Base
12
+
13
+ set :host, 'example.com'
14
+ set :port, 80
15
+
16
+ get '/' do
17
+ '<html><body>should not get here</body></html>'
18
+ end
19
+
20
+ get '/entry-point' do
21
+ <<~HTML
22
+ <html>
23
+ <body>
24
+ <a href="/link1">link1</a>
25
+ <a href="http://other.com/offsite-link">offsite link</a>
26
+ <a href="/link2">link2</a>
27
+ </body>
28
+ </html>
29
+ HTML
30
+ end
31
+
32
+ get '/link1' do
33
+ '<html><body>got here</body></html>'
34
+ end
35
+
36
+ get '/link2' do
37
+ '<html><body>got here</body></html>'
38
+ end
39
+ end
40
+
41
+ class OtherApp < Sinatra::Base
42
+
43
+ set :host, 'other.com'
44
+ set :port, 80
45
+
46
+ get '/offsite-link' do
47
+ '<html><body>should not get here</body></html>'
48
+ end
49
+
50
+ end
51
+ end
52
+
53
+ subject { described_class }
54
+
55
+ let(:host) { 'example.com' }
56
+ let(:other_host) { 'other.com' }
57
+ let(:url) { URI("http://#{host}/entry-point") }
58
+
59
+ let(:app) { TestAgentStartAt::ExampleApp }
60
+ let(:other_app) { TestAgentStartAt::OtherApp }
61
+
62
+ before do
63
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
64
+ stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
65
+ end
66
+
67
+ it "must spider the website starting at the given URL" do
68
+ agent = subject.start_at(url)
69
+
70
+ expect(agent.history).to be == Set[
71
+ URI("http://#{host}/entry-point"),
72
+ URI("http://#{host}/link1"),
73
+ URI("http://#{other_host}/offsite-link"),
74
+ URI("http://#{host}/link2")
75
+ ]
76
+ end
77
+ end
78
+
79
+ describe ".site" do
80
+ module TestAgentSite
81
+ class ExampleApp < Sinatra::Base
82
+
83
+ set :host, 'example.com'
84
+ set :port, 80
85
+
86
+ get '/' do
87
+ '<html><body>should not get here</body></html>'
88
+ end
89
+
90
+ get '/entry-point' do
91
+ <<~HTML
92
+ <html>
93
+ <body>
94
+ <a href="/link1">link1</a>
95
+ <a href="http://other.com/offsite-link">offsite link</a>
96
+ <a href="/link2">link2</a>
97
+ </body>
98
+ </html>
99
+ HTML
100
+ end
101
+
102
+ get '/link1' do
103
+ '<html><body>got here</body></html>'
104
+ end
105
+
106
+ get '/link2' do
107
+ '<html><body>got here</body></html>'
108
+ end
109
+
110
+ end
111
+ end
112
+
113
+ subject { described_class }
114
+
115
+ let(:host) { 'example.com' }
116
+ let(:url) { URI("http://#{host}/entry-point") }
117
+
118
+ let(:app) { TestAgentSite::ExampleApp }
119
+
120
+ before do
121
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
122
+ end
123
+
124
+ it "must spider the website starting at the given URL" do
125
+ agent = subject.site(url)
126
+
127
+ expect(agent.history).to be == Set[
128
+ URI("http://#{host}/entry-point"),
129
+ URI("http://#{host}/link1"),
130
+ URI("http://#{host}/link2")
131
+ ]
132
+ end
133
+ end
134
+
135
+ describe ".host" do
136
+ module TestAgentHost
137
+ class ExampleApp < Sinatra::Base
138
+
139
+ set :host, 'example.com'
140
+ set :port, 80
141
+
142
+ get '/' do
143
+ <<~HTML
144
+ <html>
145
+ <body>
146
+ <a href="/link1">link1</a>
147
+ <a href="http://other.com/offsite-link">offsite link</a>
148
+ <a href="/link2">link2</a>
149
+ </body>
150
+ </html>
151
+ HTML
152
+ end
153
+
154
+ get '/link1' do
155
+ '<html><body>got here</body></html>'
156
+ end
157
+
158
+ get '/link2' do
159
+ '<html><body>got here</body></html>'
160
+ end
161
+
162
+ end
163
+ end
164
+
165
+ subject { described_class }
166
+
167
+ let(:host) { 'example.com' }
168
+ let(:app) { TestAgentHost::ExampleApp }
169
+
170
+ before do
171
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
172
+ end
173
+
174
+ it "must spider the website starting at the given URL" do
175
+ agent = subject.host(host)
176
+
177
+ # XXX: for some reason Set#== was returning false, so convert to an Array
178
+ expect(agent.history.to_a).to be == [
179
+ URI("http://#{host}/"),
180
+ URI("http://#{host}/link1"),
181
+ URI("http://#{host}/link2")
182
+ ]
183
+ end
184
+ end
185
+
186
+ describe ".domain" do
187
+ module TestAgentDomain
188
+ class ExampleApp < Sinatra::Base
189
+
190
+ set :host, 'example.com'
191
+ set :port, 80
192
+
193
+ get '/' do
194
+ <<~HTML
195
+ <html>
196
+ <body>
197
+ <a href="/link1">link1</a>
198
+ <a href="http://sub.example.com/subdomain-link">subdomain link</a>
199
+ <a href="/link2">link2</a>
200
+ </body>
201
+ </html>
202
+ HTML
203
+ end
204
+
205
+ get '/link1' do
206
+ '<html><body>got here</body></html>'
207
+ end
208
+
209
+ get '/link2' do
210
+ '<html><body>got here</body></html>'
211
+ end
212
+
213
+ end
214
+
215
+ class SubDomainApp < Sinatra::Base
216
+
217
+ set :host, 'sub.example.com'
218
+ set :port, 80
219
+
220
+ get '/subdomain-link' do
221
+ '<html><body>should get here</body></html>'
222
+ end
223
+
224
+ end
225
+ end
226
+
227
+ subject { described_class }
228
+
229
+ let(:domain) { 'example.com' }
230
+ let(:domain_app) { TestAgentDomain::ExampleApp }
231
+
232
+ let(:subdomain) { 'sub.example.com' }
233
+ let(:subdomain_app) { TestAgentDomain::SubDomainApp }
234
+
235
+ before do
236
+ stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
237
+ stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
238
+ end
239
+
240
+ it "must spider the domain and subdomains starting at the given domain" do
241
+ agent = subject.domain(domain)
242
+
243
+ # XXX: for some reason Set#== was returning false, so convert to an Array
244
+ expect(agent.history.to_a).to be == [
245
+ URI("http://#{domain}/"),
246
+ URI("http://#{domain}/link1"),
247
+ URI("http://#{subdomain}/subdomain-link"),
248
+ URI("http://#{domain}/link2")
249
+ ]
250
+ end
251
+ end
252
+ end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ronin-web-spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.beta1
5
+ platform: ruby
6
+ authors:
7
+ - Postmodern
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-01-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: spidr
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ronin-support
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.0'
55
+ description: ronin-web-spider is a collection of common web spidering routines using
56
+ the spidr gem.
57
+ email: postmodern.mod3@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files:
61
+ - COPYING.txt
62
+ - ChangeLog.md
63
+ - README.md
64
+ files:
65
+ - ".document"
66
+ - ".github/workflows/ruby.yml"
67
+ - ".gitignore"
68
+ - ".rspec"
69
+ - ".ruby-version"
70
+ - ".yardopts"
71
+ - COPYING.txt
72
+ - ChangeLog.md
73
+ - Gemfile
74
+ - README.md
75
+ - Rakefile
76
+ - gemspec.yml
77
+ - lib/ronin/web/spider.rb
78
+ - lib/ronin/web/spider/agent.rb
79
+ - lib/ronin/web/spider/archive.rb
80
+ - lib/ronin/web/spider/exceptions.rb
81
+ - lib/ronin/web/spider/git_archive.rb
82
+ - lib/ronin/web/spider/version.rb
83
+ - ronin-web-spider.gemspec
84
+ - spec/agent_spec.rb
85
+ - spec/archive_spec.rb
86
+ - spec/example_app.rb
87
+ - spec/git_archive_spec.rb
88
+ - spec/spec_helper.rb
89
+ - spec/spider_spec.rb
90
+ homepage: https://ronin-rb.dev/
91
+ licenses:
92
+ - LGPL-3.0
93
+ metadata:
94
+ documentation_uri: https://rubydoc.info/gems/ronin-web-spider
95
+ source_code_uri: https://github.com/ronin-rb/ronin-web-spider
96
+ bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
97
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
98
+ rubygems_mfa_required: 'true'
99
+ post_install_message:
100
+ rdoc_options: []
101
+ require_paths:
102
+ - lib
103
+ required_ruby_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: 3.0.0
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ requirements: []
114
+ rubygems_version: 3.3.26
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: collection of common web spidering routines
118
+ test_files:
119
+ - spec/agent_spec.rb
120
+ - spec/archive_spec.rb
121
+ - spec/git_archive_spec.rb
122
+ - spec/spider_spec.rb