ronin-web-spider 0.1.0.beta2 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +17 -5
- data/.rubocop.yml +11 -0
- data/.yardopts +1 -1
- data/ChangeLog.md +23 -1
- data/Gemfile +3 -0
- data/README.md +303 -32
- data/Rakefile +2 -2
- data/gemspec.yml +4 -4
- data/lib/ronin/web/spider/agent.rb +123 -7
- data/lib/ronin/web/spider/archive.rb +4 -0
- data/lib/ronin/web/spider/exceptions.rb +2 -1
- data/lib/ronin/web/spider/git_archive.rb +3 -2
- data/lib/ronin/web/spider/version.rb +3 -2
- data/lib/ronin/web/spider.rb +290 -1
- data/ronin-web-spider.gemspec +5 -4
- metadata +10 -19
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
data/spec/archive_spec.rb
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'ronin/web/spider/archive'
|
3
|
-
|
4
|
-
require 'tmpdir'
|
5
|
-
|
6
|
-
describe Ronin::Web::Spider::Archive do
|
7
|
-
let(:root) { File.join(Dir.mktmpdir('ronin-web-spider')) }
|
8
|
-
|
9
|
-
subject { described_class.new(root) }
|
10
|
-
|
11
|
-
describe "#initialize" do
|
12
|
-
it "must set #root" do
|
13
|
-
expect(subject.root).to eq(root)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
describe ".open" do
|
18
|
-
subject { described_class.open(root) }
|
19
|
-
|
20
|
-
it "must return a new #{described_class}" do
|
21
|
-
expect(subject).to be_kind_of(described_class)
|
22
|
-
end
|
23
|
-
|
24
|
-
context "when given a block" do
|
25
|
-
it "must yield the new #{described_class}" do
|
26
|
-
expect { |b|
|
27
|
-
described_class.open(root,&b)
|
28
|
-
}.to yield_with_args(described_class)
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
context "when the root directory does not exist" do
|
33
|
-
let(:root) { File.join(super(),'does-not-exist-yet') }
|
34
|
-
|
35
|
-
it "must create the given root directory" do
|
36
|
-
described_class.open(root)
|
37
|
-
|
38
|
-
expect(File.directory?(root)).to be(true)
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
context "when the root directory does exist" do
|
43
|
-
let(:root) { File.join(super(),'does-not-exist-yet') }
|
44
|
-
|
45
|
-
before { FileUtils.mkdir(root) }
|
46
|
-
|
47
|
-
it "must not raise an error" do
|
48
|
-
expect {
|
49
|
-
described_class.open(root)
|
50
|
-
}.to_not raise_error
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
describe "#write" do
|
56
|
-
let(:url) { URI('https://example.com/foo/bar.html') }
|
57
|
-
let(:body) { 'test file' }
|
58
|
-
|
59
|
-
before { subject.write(url,body) }
|
60
|
-
|
61
|
-
it "must automatically create parent directory" do
|
62
|
-
expect(File.directory?(File.join(root,'foo'))).to be(true)
|
63
|
-
end
|
64
|
-
|
65
|
-
it "must write the body into the file" do
|
66
|
-
expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
|
67
|
-
end
|
68
|
-
|
69
|
-
context "when the URL has a query string" do
|
70
|
-
let(:url) { URI('https://example.com/foo/bar.php?q=1') }
|
71
|
-
|
72
|
-
it "must include the query string as part of the file name" do
|
73
|
-
expect(File.read(File.join(root,'foo','bar.php?q=1'))).to eq(body)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
context "when the URL path ends with a '/'" do
|
78
|
-
let(:url) { URI('https://example.com/foo/bar/') }
|
79
|
-
|
80
|
-
it "must write the body to an index.html file within the URL's path" do
|
81
|
-
expect(File.read(File.join(root,'foo','bar','index.html'))).to eq(body)
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
describe "#to_s" do
|
87
|
-
it "must return the root directory" do
|
88
|
-
expect(subject.to_s).to eq(root)
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
data/spec/example_app.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'rspec'
|
2
|
-
require 'sinatra/base'
|
3
|
-
require 'webmock/rspec'
|
4
|
-
|
5
|
-
require 'ronin/web/spider/agent'
|
6
|
-
|
7
|
-
RSpec.shared_context "example App" do
|
8
|
-
let(:host) { 'example.com' }
|
9
|
-
|
10
|
-
subject { Ronin::Web::Spider::Agent.new(host: host) }
|
11
|
-
|
12
|
-
def self.app(&block)
|
13
|
-
let(:app) do
|
14
|
-
klass = Class.new(Sinatra::Base)
|
15
|
-
klass.set :host, host
|
16
|
-
klass.set :port, 80
|
17
|
-
klass.class_eval(&block)
|
18
|
-
return klass
|
19
|
-
end
|
20
|
-
|
21
|
-
before do
|
22
|
-
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
23
|
-
|
24
|
-
subject.start_at("http://#{host}/")
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
data/spec/git_archive_spec.rb
DELETED
@@ -1,137 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'ronin/web/spider/git_archive'
|
3
|
-
|
4
|
-
require 'tmpdir'
|
5
|
-
|
6
|
-
describe Ronin::Web::Spider::GitArchive do
|
7
|
-
let(:root) { File.join(Dir.mktmpdir('ronin-web-spider')) }
|
8
|
-
|
9
|
-
describe ".open" do
|
10
|
-
subject { described_class }
|
11
|
-
|
12
|
-
context "when the root directory does not already exist" do
|
13
|
-
let(:root) { File.join(Dir.tmpdir,'ronin-web-spider-new-dir') }
|
14
|
-
|
15
|
-
it "must run `git init` on the new archive directory" do
|
16
|
-
subject.open(root)
|
17
|
-
|
18
|
-
expect(File.directory?(File.join(root,'.git'))).to be(true)
|
19
|
-
end
|
20
|
-
|
21
|
-
after { FileUtils.rm_r(root) }
|
22
|
-
end
|
23
|
-
|
24
|
-
context "when the root directory already exists" do
|
25
|
-
context "but does not contain a .git directory" do
|
26
|
-
it "must run `git init` within the root directory" do
|
27
|
-
subject.open(root)
|
28
|
-
|
29
|
-
expect(File.directory?(File.join(root,'.git'))).to be(true)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
subject { described_class.open(root) }
|
36
|
-
|
37
|
-
describe "#git?" do
|
38
|
-
subject { described_class.new(root) }
|
39
|
-
|
40
|
-
context "when the archive directory contains a .git directory" do
|
41
|
-
before do
|
42
|
-
FileUtils.mkdir(File.join(root,'.git'))
|
43
|
-
end
|
44
|
-
|
45
|
-
it "must return true" do
|
46
|
-
expect(subject.git?).to be(true)
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
context "when the archive directory does not contains a .git directory" do
|
51
|
-
it "must return false" do
|
52
|
-
expect(subject.git?).to be(false)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
describe "#init" do
|
58
|
-
it "must run the 'git init' command" do
|
59
|
-
expect(subject).to receive(:system).with('git','-C',root,'init').and_return(true)
|
60
|
-
|
61
|
-
subject.init
|
62
|
-
end
|
63
|
-
|
64
|
-
context "when the 'git init' command fails" do
|
65
|
-
it do
|
66
|
-
allow(subject).to receive(:system).with('git','-C',root,'init').and_return(false)
|
67
|
-
|
68
|
-
expect {
|
69
|
-
subject.init
|
70
|
-
}.to raise_error(Ronin::Web::Spider::GitError,"git command failed: git -C #{root} init")
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
context "when 'git' is not installed" do
|
75
|
-
it do
|
76
|
-
allow(subject).to receive(:system).with('git','-C',root,'init').and_return(nil)
|
77
|
-
|
78
|
-
expect {
|
79
|
-
subject.init
|
80
|
-
}.to raise_error(Ronin::Web::Spider::GitError,"the git command was not found")
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
describe "#write" do
|
86
|
-
let(:url) { URI('https://example.com/foo/bar.html') }
|
87
|
-
let(:body) { 'test file' }
|
88
|
-
|
89
|
-
it "must automatically create parent directory" do
|
90
|
-
subject.write(url,body)
|
91
|
-
|
92
|
-
expect(File.directory?(File.join(root,'foo'))).to be(true)
|
93
|
-
end
|
94
|
-
|
95
|
-
it "must write the body into the file" do
|
96
|
-
subject.write(url,body)
|
97
|
-
|
98
|
-
expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
|
99
|
-
end
|
100
|
-
|
101
|
-
it "must add the file using `git add`" do
|
102
|
-
absolute_path = File.join(root,'foo','bar.html')
|
103
|
-
|
104
|
-
expect(subject).to receive(:system).with(
|
105
|
-
'git', '-C', root, 'add', absolute_path
|
106
|
-
).and_return(true)
|
107
|
-
|
108
|
-
subject.write(url,body)
|
109
|
-
end
|
110
|
-
end
|
111
|
-
|
112
|
-
describe "#commit" do
|
113
|
-
let(:message) { 'commit message' }
|
114
|
-
|
115
|
-
context "when a block is given" do
|
116
|
-
it "must yield control before calling `git commit -m ...` with the commit message" do
|
117
|
-
expect(subject).to receive(:system).with(
|
118
|
-
'git', '-C', root, 'commit', '-m', message
|
119
|
-
).and_return(true)
|
120
|
-
|
121
|
-
expect { |b|
|
122
|
-
subject.commit(message,&b)
|
123
|
-
}.to yield_with_args(subject)
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
context "when no block is given" do
|
128
|
-
it "must not yield and call `git commit -m ...` with the commit message" do
|
129
|
-
expect(subject).to receive(:system).with(
|
130
|
-
'git', '-C', root, 'commit', '-m', message
|
131
|
-
).and_return(true)
|
132
|
-
|
133
|
-
subject.commit(message)
|
134
|
-
end
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
data/spec/spec_helper.rb
DELETED
data/spec/spider_spec.rb
DELETED
@@ -1,252 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'example_app'
|
3
|
-
|
4
|
-
require 'ronin/web/spider'
|
5
|
-
|
6
|
-
describe Ronin::Web::Spider do
|
7
|
-
include_context "example App"
|
8
|
-
|
9
|
-
describe ".start_at" do
|
10
|
-
module TestAgentStartAt
|
11
|
-
class ExampleApp < Sinatra::Base
|
12
|
-
|
13
|
-
set :host, 'example.com'
|
14
|
-
set :port, 80
|
15
|
-
|
16
|
-
get '/' do
|
17
|
-
'<html><body>should not get here</body></html>'
|
18
|
-
end
|
19
|
-
|
20
|
-
get '/entry-point' do
|
21
|
-
<<~HTML
|
22
|
-
<html>
|
23
|
-
<body>
|
24
|
-
<a href="/link1">link1</a>
|
25
|
-
<a href="http://other.com/offsite-link">offsite link</a>
|
26
|
-
<a href="/link2">link2</a>
|
27
|
-
</body>
|
28
|
-
</html>
|
29
|
-
HTML
|
30
|
-
end
|
31
|
-
|
32
|
-
get '/link1' do
|
33
|
-
'<html><body>got here</body></html>'
|
34
|
-
end
|
35
|
-
|
36
|
-
get '/link2' do
|
37
|
-
'<html><body>got here</body></html>'
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
class OtherApp < Sinatra::Base
|
42
|
-
|
43
|
-
set :host, 'other.com'
|
44
|
-
set :port, 80
|
45
|
-
|
46
|
-
get '/offsite-link' do
|
47
|
-
'<html><body>should not get here</body></html>'
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
subject { described_class }
|
54
|
-
|
55
|
-
let(:host) { 'example.com' }
|
56
|
-
let(:other_host) { 'other.com' }
|
57
|
-
let(:url) { URI("http://#{host}/entry-point") }
|
58
|
-
|
59
|
-
let(:app) { TestAgentStartAt::ExampleApp }
|
60
|
-
let(:other_app) { TestAgentStartAt::OtherApp }
|
61
|
-
|
62
|
-
before do
|
63
|
-
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
64
|
-
stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
|
65
|
-
end
|
66
|
-
|
67
|
-
it "must spider the website starting at the given URL" do
|
68
|
-
agent = subject.start_at(url)
|
69
|
-
|
70
|
-
expect(agent.history).to be == Set[
|
71
|
-
URI("http://#{host}/entry-point"),
|
72
|
-
URI("http://#{host}/link1"),
|
73
|
-
URI("http://#{other_host}/offsite-link"),
|
74
|
-
URI("http://#{host}/link2")
|
75
|
-
]
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
describe ".site" do
|
80
|
-
module TestAgentSite
|
81
|
-
class ExampleApp < Sinatra::Base
|
82
|
-
|
83
|
-
set :host, 'example.com'
|
84
|
-
set :port, 80
|
85
|
-
|
86
|
-
get '/' do
|
87
|
-
'<html><body>should not get here</body></html>'
|
88
|
-
end
|
89
|
-
|
90
|
-
get '/entry-point' do
|
91
|
-
<<~HTML
|
92
|
-
<html>
|
93
|
-
<body>
|
94
|
-
<a href="/link1">link1</a>
|
95
|
-
<a href="http://other.com/offsite-link">offsite link</a>
|
96
|
-
<a href="/link2">link2</a>
|
97
|
-
</body>
|
98
|
-
</html>
|
99
|
-
HTML
|
100
|
-
end
|
101
|
-
|
102
|
-
get '/link1' do
|
103
|
-
'<html><body>got here</body></html>'
|
104
|
-
end
|
105
|
-
|
106
|
-
get '/link2' do
|
107
|
-
'<html><body>got here</body></html>'
|
108
|
-
end
|
109
|
-
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
subject { described_class }
|
114
|
-
|
115
|
-
let(:host) { 'example.com' }
|
116
|
-
let(:url) { URI("http://#{host}/entry-point") }
|
117
|
-
|
118
|
-
let(:app) { TestAgentSite::ExampleApp }
|
119
|
-
|
120
|
-
before do
|
121
|
-
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
122
|
-
end
|
123
|
-
|
124
|
-
it "must spider the website starting at the given URL" do
|
125
|
-
agent = subject.site(url)
|
126
|
-
|
127
|
-
expect(agent.history).to be == Set[
|
128
|
-
URI("http://#{host}/entry-point"),
|
129
|
-
URI("http://#{host}/link1"),
|
130
|
-
URI("http://#{host}/link2")
|
131
|
-
]
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
describe ".host" do
|
136
|
-
module TestAgentHost
|
137
|
-
class ExampleApp < Sinatra::Base
|
138
|
-
|
139
|
-
set :host, 'example.com'
|
140
|
-
set :port, 80
|
141
|
-
|
142
|
-
get '/' do
|
143
|
-
<<~HTML
|
144
|
-
<html>
|
145
|
-
<body>
|
146
|
-
<a href="/link1">link1</a>
|
147
|
-
<a href="http://other.com/offsite-link">offsite link</a>
|
148
|
-
<a href="/link2">link2</a>
|
149
|
-
</body>
|
150
|
-
</html>
|
151
|
-
HTML
|
152
|
-
end
|
153
|
-
|
154
|
-
get '/link1' do
|
155
|
-
'<html><body>got here</body></html>'
|
156
|
-
end
|
157
|
-
|
158
|
-
get '/link2' do
|
159
|
-
'<html><body>got here</body></html>'
|
160
|
-
end
|
161
|
-
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
subject { described_class }
|
166
|
-
|
167
|
-
let(:host) { 'example.com' }
|
168
|
-
let(:app) { TestAgentHost::ExampleApp }
|
169
|
-
|
170
|
-
before do
|
171
|
-
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
172
|
-
end
|
173
|
-
|
174
|
-
it "must spider the website starting at the given URL" do
|
175
|
-
agent = subject.host(host)
|
176
|
-
|
177
|
-
# XXX: for some reason Set#== was returning false, so convert to an Array
|
178
|
-
expect(agent.history.to_a).to be == [
|
179
|
-
URI("http://#{host}/"),
|
180
|
-
URI("http://#{host}/link1"),
|
181
|
-
URI("http://#{host}/link2")
|
182
|
-
]
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
describe ".domain" do
|
187
|
-
module TestAgentDomain
|
188
|
-
class ExampleApp < Sinatra::Base
|
189
|
-
|
190
|
-
set :host, 'example.com'
|
191
|
-
set :port, 80
|
192
|
-
|
193
|
-
get '/' do
|
194
|
-
<<~HTML
|
195
|
-
<html>
|
196
|
-
<body>
|
197
|
-
<a href="/link1">link1</a>
|
198
|
-
<a href="http://sub.example.com/subdomain-link">subdomain link</a>
|
199
|
-
<a href="/link2">link2</a>
|
200
|
-
</body>
|
201
|
-
</html>
|
202
|
-
HTML
|
203
|
-
end
|
204
|
-
|
205
|
-
get '/link1' do
|
206
|
-
'<html><body>got here</body></html>'
|
207
|
-
end
|
208
|
-
|
209
|
-
get '/link2' do
|
210
|
-
'<html><body>got here</body></html>'
|
211
|
-
end
|
212
|
-
|
213
|
-
end
|
214
|
-
|
215
|
-
class SubDomainApp < Sinatra::Base
|
216
|
-
|
217
|
-
set :host, 'sub.example.com'
|
218
|
-
set :port, 80
|
219
|
-
|
220
|
-
get '/subdomain-link' do
|
221
|
-
'<html><body>should get here</body></html>'
|
222
|
-
end
|
223
|
-
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
subject { described_class }
|
228
|
-
|
229
|
-
let(:domain) { 'example.com' }
|
230
|
-
let(:domain_app) { TestAgentDomain::ExampleApp }
|
231
|
-
|
232
|
-
let(:subdomain) { 'sub.example.com' }
|
233
|
-
let(:subdomain_app) { TestAgentDomain::SubDomainApp }
|
234
|
-
|
235
|
-
before do
|
236
|
-
stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
|
237
|
-
stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
|
238
|
-
end
|
239
|
-
|
240
|
-
it "must spider the domain and subdomains starting at the given domain" do
|
241
|
-
agent = subject.domain(domain)
|
242
|
-
|
243
|
-
# XXX: for some reason Set#== was returning false, so convert to an Array
|
244
|
-
expect(agent.history.to_a).to be == [
|
245
|
-
URI("http://#{domain}/"),
|
246
|
-
URI("http://#{domain}/link1"),
|
247
|
-
URI("http://#{subdomain}/subdomain-link"),
|
248
|
-
URI("http://#{domain}/link2")
|
249
|
-
]
|
250
|
-
end
|
251
|
-
end
|
252
|
-
end
|