ronin-web-spider 0.1.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.github/workflows/ruby.yml +31 -0
- data/.gitignore +13 -0
- data/.rspec +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/COPYING.txt +165 -0
- data/ChangeLog.md +19 -0
- data/Gemfile +31 -0
- data/README.md +139 -0
- data/Rakefile +31 -0
- data/gemspec.yml +27 -0
- data/lib/ronin/web/spider/agent.rb +302 -0
- data/lib/ronin/web/spider/archive.rb +116 -0
- data/lib/ronin/web/spider/exceptions.rb +36 -0
- data/lib/ronin/web/spider/git_archive.rb +194 -0
- data/lib/ronin/web/spider/version.rb +27 -0
- data/lib/ronin/web/spider.rb +115 -0
- data/ronin-web-spider.gemspec +61 -0
- data/spec/agent_spec.rb +585 -0
- data/spec/archive_spec.rb +91 -0
- data/spec/example_app.rb +27 -0
- data/spec/git_archive_spec.rb +137 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spider_spec.rb +252 -0
- metadata +122 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'ronin/web/spider/git_archive'
|
3
|
+
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
describe Ronin::Web::Spider::GitArchive do
|
7
|
+
# Temporary directory used as the archive root by the examples below.
# Dir.mktmpdir already returns the full path, so wrapping it in a
# single-argument File.join added nothing.
let(:root) { Dir.mktmpdir('ronin-web-spider') }

# Dir.mktmpdir called without a block does not remove the directory itself,
# so clean it up after every example. rm_rf is used so that contexts which
# already removed (or never created) the directory do not raise here.
after { FileUtils.rm_rf(root) }
|
8
|
+
|
9
|
+
describe ".open" do
|
10
|
+
subject { described_class }
|
11
|
+
|
12
|
+
# Verifies that .open initializes a git repository when it is pointed at a
# path that does not exist yet.
context "when the root directory does not already exist" do
  let(:root) { File.join(Dir.tmpdir,'ronin-web-spider-new-dir') }

  # The path is fixed, so a previous aborted run may have left it behind;
  # remove it first to guarantee the "does not already exist" precondition.
  before { FileUtils.rm_rf(root) }

  it "must run `git init` on the new archive directory" do
    subject.open(root)

    expect(File.directory?(File.join(root,'.git'))).to be(true)
  end

  # rm_rf (unlike rm_r) does not raise when the directory was never created,
  # so a failing example is not masked by an ENOENT from the cleanup hook.
  after { FileUtils.rm_rf(root) }
end
|
23
|
+
|
24
|
+
context "when the root directory already exists" do
|
25
|
+
context "but does not contain a .git directory" do
|
26
|
+
it "must run `git init` within the root directory" do
|
27
|
+
subject.open(root)
|
28
|
+
|
29
|
+
expect(File.directory?(File.join(root,'.git'))).to be(true)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
subject { described_class.open(root) }
|
36
|
+
|
37
|
+
describe "#git?" do
|
38
|
+
subject { described_class.new(root) }
|
39
|
+
|
40
|
+
context "when the archive directory contains a .git directory" do
|
41
|
+
before do
|
42
|
+
FileUtils.mkdir(File.join(root,'.git'))
|
43
|
+
end
|
44
|
+
|
45
|
+
it "must return true" do
|
46
|
+
expect(subject.git?).to be(true)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# No .git directory is created in this context, so #git? must report false.
context "when the archive directory does not contain a .git directory" do
  it "must return false" do
    expect(subject.git?).to be(false)
  end
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe "#init" do
|
58
|
+
it "must run the 'git init' command" do
|
59
|
+
expect(subject).to receive(:system).with('git','-C',root,'init').and_return(true)
|
60
|
+
|
61
|
+
subject.init
|
62
|
+
end
|
63
|
+
|
64
|
+
context "when the 'git init' command fails" do
|
65
|
+
it do
|
66
|
+
allow(subject).to receive(:system).with('git','-C',root,'init').and_return(false)
|
67
|
+
|
68
|
+
expect {
|
69
|
+
subject.init
|
70
|
+
}.to raise_error(Ronin::Web::Spider::GitError,"git command failed: git -C #{root} init")
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
context "when 'git' is not installed" do
|
75
|
+
it do
|
76
|
+
allow(subject).to receive(:system).with('git','-C',root,'init').and_return(nil)
|
77
|
+
|
78
|
+
expect {
|
79
|
+
subject.init
|
80
|
+
}.to raise_error(Ronin::Web::Spider::GitError,"the git command was not found")
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
describe "#write" do
|
86
|
+
let(:url) { URI('https://example.com/foo/bar.html') }
|
87
|
+
let(:body) { 'test file' }
|
88
|
+
|
89
|
+
it "must automatically create parent directory" do
|
90
|
+
subject.write(url,body)
|
91
|
+
|
92
|
+
expect(File.directory?(File.join(root,'foo'))).to be(true)
|
93
|
+
end
|
94
|
+
|
95
|
+
it "must write the body into the file" do
|
96
|
+
subject.write(url,body)
|
97
|
+
|
98
|
+
expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
|
99
|
+
end
|
100
|
+
|
101
|
+
it "must add the file using `git add`" do
|
102
|
+
absolute_path = File.join(root,'foo','bar.html')
|
103
|
+
|
104
|
+
expect(subject).to receive(:system).with(
|
105
|
+
'git', '-C', root, 'add', absolute_path
|
106
|
+
).and_return(true)
|
107
|
+
|
108
|
+
subject.write(url,body)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
describe "#commit" do
|
113
|
+
let(:message) { 'commit message' }
|
114
|
+
|
115
|
+
context "when a block is given" do
|
116
|
+
it "must yield control before calling `git commit -m ...` with the commit message" do
|
117
|
+
expect(subject).to receive(:system).with(
|
118
|
+
'git', '-C', root, 'commit', '-m', message
|
119
|
+
).and_return(true)
|
120
|
+
|
121
|
+
expect { |b|
|
122
|
+
subject.commit(message,&b)
|
123
|
+
}.to yield_with_args(subject)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
context "when no block is given" do
|
128
|
+
it "must not yield and call `git commit -m ...` with the commit message" do
|
129
|
+
expect(subject).to receive(:system).with(
|
130
|
+
'git', '-C', root, 'commit', '-m', message
|
131
|
+
).and_return(true)
|
132
|
+
|
133
|
+
subject.commit(message)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/spider_spec.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_app'
|
3
|
+
|
4
|
+
require 'ronin/web/spider'
|
5
|
+
|
6
|
+
describe Ronin::Web::Spider do
|
7
|
+
include_context "example App"
|
8
|
+
|
9
|
+
describe ".start_at" do
|
10
|
+
module TestAgentStartAt
|
11
|
+
class ExampleApp < Sinatra::Base
|
12
|
+
|
13
|
+
set :host, 'example.com'
|
14
|
+
set :port, 80
|
15
|
+
|
16
|
+
get '/' do
|
17
|
+
'<html><body>should not get here</body></html>'
|
18
|
+
end
|
19
|
+
|
20
|
+
get '/entry-point' do
|
21
|
+
<<~HTML
|
22
|
+
<html>
|
23
|
+
<body>
|
24
|
+
<a href="/link1">link1</a>
|
25
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
26
|
+
<a href="/link2">link2</a>
|
27
|
+
</body>
|
28
|
+
</html>
|
29
|
+
HTML
|
30
|
+
end
|
31
|
+
|
32
|
+
get '/link1' do
|
33
|
+
'<html><body>got here</body></html>'
|
34
|
+
end
|
35
|
+
|
36
|
+
get '/link2' do
|
37
|
+
'<html><body>got here</body></html>'
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
class OtherApp < Sinatra::Base
|
42
|
+
|
43
|
+
set :host, 'other.com'
|
44
|
+
set :port, 80
|
45
|
+
|
46
|
+
get '/offsite-link' do
|
47
|
+
'<html><body>should not get here</body></html>'
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
subject { described_class }
|
54
|
+
|
55
|
+
let(:host) { 'example.com' }
|
56
|
+
let(:other_host) { 'other.com' }
|
57
|
+
let(:url) { URI("http://#{host}/entry-point") }
|
58
|
+
|
59
|
+
let(:app) { TestAgentStartAt::ExampleApp }
|
60
|
+
let(:other_app) { TestAgentStartAt::OtherApp }
|
61
|
+
|
62
|
+
before do
|
63
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
64
|
+
stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
|
65
|
+
end
|
66
|
+
|
67
|
+
it "must spider the website starting at the given URL" do
|
68
|
+
agent = subject.start_at(url)
|
69
|
+
|
70
|
+
expect(agent.history).to be == Set[
|
71
|
+
URI("http://#{host}/entry-point"),
|
72
|
+
URI("http://#{host}/link1"),
|
73
|
+
URI("http://#{other_host}/offsite-link"),
|
74
|
+
URI("http://#{host}/link2")
|
75
|
+
]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe ".site" do
|
80
|
+
module TestAgentSite
|
81
|
+
class ExampleApp < Sinatra::Base
|
82
|
+
|
83
|
+
set :host, 'example.com'
|
84
|
+
set :port, 80
|
85
|
+
|
86
|
+
get '/' do
|
87
|
+
'<html><body>should not get here</body></html>'
|
88
|
+
end
|
89
|
+
|
90
|
+
get '/entry-point' do
|
91
|
+
<<~HTML
|
92
|
+
<html>
|
93
|
+
<body>
|
94
|
+
<a href="/link1">link1</a>
|
95
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
96
|
+
<a href="/link2">link2</a>
|
97
|
+
</body>
|
98
|
+
</html>
|
99
|
+
HTML
|
100
|
+
end
|
101
|
+
|
102
|
+
get '/link1' do
|
103
|
+
'<html><body>got here</body></html>'
|
104
|
+
end
|
105
|
+
|
106
|
+
get '/link2' do
|
107
|
+
'<html><body>got here</body></html>'
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
subject { described_class }
|
114
|
+
|
115
|
+
let(:host) { 'example.com' }
|
116
|
+
let(:url) { URI("http://#{host}/entry-point") }
|
117
|
+
|
118
|
+
let(:app) { TestAgentSite::ExampleApp }
|
119
|
+
|
120
|
+
before do
|
121
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
122
|
+
end
|
123
|
+
|
124
|
+
it "must spider the website starting at the given URL" do
|
125
|
+
agent = subject.site(url)
|
126
|
+
|
127
|
+
expect(agent.history).to be == Set[
|
128
|
+
URI("http://#{host}/entry-point"),
|
129
|
+
URI("http://#{host}/link1"),
|
130
|
+
URI("http://#{host}/link2")
|
131
|
+
]
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
describe ".host" do
|
136
|
+
module TestAgentHost
|
137
|
+
class ExampleApp < Sinatra::Base
|
138
|
+
|
139
|
+
set :host, 'example.com'
|
140
|
+
set :port, 80
|
141
|
+
|
142
|
+
get '/' do
|
143
|
+
<<~HTML
|
144
|
+
<html>
|
145
|
+
<body>
|
146
|
+
<a href="/link1">link1</a>
|
147
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
148
|
+
<a href="/link2">link2</a>
|
149
|
+
</body>
|
150
|
+
</html>
|
151
|
+
HTML
|
152
|
+
end
|
153
|
+
|
154
|
+
get '/link1' do
|
155
|
+
'<html><body>got here</body></html>'
|
156
|
+
end
|
157
|
+
|
158
|
+
get '/link2' do
|
159
|
+
'<html><body>got here</body></html>'
|
160
|
+
end
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
subject { described_class }
|
166
|
+
|
167
|
+
let(:host) { 'example.com' }
|
168
|
+
let(:app) { TestAgentHost::ExampleApp }
|
169
|
+
|
170
|
+
before do
|
171
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
172
|
+
end
|
173
|
+
|
174
|
+
# .host is given a host name rather than a URL; the expected history below
# starts at that host's root URL ("/").
it "must spider the given host starting at its root URL" do
  agent = subject.host(host)

  # XXX: for some reason Set#== was returning false, so convert to an Array
  expect(agent.history.to_a).to be == [
    URI("http://#{host}/"),
    URI("http://#{host}/link1"),
    URI("http://#{host}/link2")
  ]
end
|
184
|
+
end
|
185
|
+
|
186
|
+
describe ".domain" do
|
187
|
+
module TestAgentDomain
|
188
|
+
class ExampleApp < Sinatra::Base
|
189
|
+
|
190
|
+
set :host, 'example.com'
|
191
|
+
set :port, 80
|
192
|
+
|
193
|
+
get '/' do
|
194
|
+
<<~HTML
|
195
|
+
<html>
|
196
|
+
<body>
|
197
|
+
<a href="/link1">link1</a>
|
198
|
+
<a href="http://sub.example.com/subdomain-link">subdomain link</a>
|
199
|
+
<a href="/link2">link2</a>
|
200
|
+
</body>
|
201
|
+
</html>
|
202
|
+
HTML
|
203
|
+
end
|
204
|
+
|
205
|
+
get '/link1' do
|
206
|
+
'<html><body>got here</body></html>'
|
207
|
+
end
|
208
|
+
|
209
|
+
get '/link2' do
|
210
|
+
'<html><body>got here</body></html>'
|
211
|
+
end
|
212
|
+
|
213
|
+
end
|
214
|
+
|
215
|
+
class SubDomainApp < Sinatra::Base
|
216
|
+
|
217
|
+
set :host, 'sub.example.com'
|
218
|
+
set :port, 80
|
219
|
+
|
220
|
+
get '/subdomain-link' do
|
221
|
+
'<html><body>should get here</body></html>'
|
222
|
+
end
|
223
|
+
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
subject { described_class }
|
228
|
+
|
229
|
+
let(:domain) { 'example.com' }
|
230
|
+
let(:domain_app) { TestAgentDomain::ExampleApp }
|
231
|
+
|
232
|
+
let(:subdomain) { 'sub.example.com' }
|
233
|
+
let(:subdomain_app) { TestAgentDomain::SubDomainApp }
|
234
|
+
|
235
|
+
before do
|
236
|
+
stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
|
237
|
+
stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
|
238
|
+
end
|
239
|
+
|
240
|
+
it "must spider the domain and subdomains starting at the given domain" do
|
241
|
+
agent = subject.domain(domain)
|
242
|
+
|
243
|
+
# XXX: for some reason Set#== was returning false, so convert to an Array
|
244
|
+
expect(agent.history.to_a).to be == [
|
245
|
+
URI("http://#{domain}/"),
|
246
|
+
URI("http://#{domain}/link1"),
|
247
|
+
URI("http://#{subdomain}/subdomain-link"),
|
248
|
+
URI("http://#{domain}/link2")
|
249
|
+
]
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ronin-web-spider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.beta1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Postmodern
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-01-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: spidr
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ronin-support
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.0'
|
55
|
+
description: ronin-web-spider is a collection of common web spidering routines using
|
56
|
+
the spidr gem.
|
57
|
+
email: postmodern.mod3@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files:
|
61
|
+
- COPYING.txt
|
62
|
+
- ChangeLog.md
|
63
|
+
- README.md
|
64
|
+
files:
|
65
|
+
- ".document"
|
66
|
+
- ".github/workflows/ruby.yml"
|
67
|
+
- ".gitignore"
|
68
|
+
- ".rspec"
|
69
|
+
- ".ruby-version"
|
70
|
+
- ".yardopts"
|
71
|
+
- COPYING.txt
|
72
|
+
- ChangeLog.md
|
73
|
+
- Gemfile
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- gemspec.yml
|
77
|
+
- lib/ronin/web/spider.rb
|
78
|
+
- lib/ronin/web/spider/agent.rb
|
79
|
+
- lib/ronin/web/spider/archive.rb
|
80
|
+
- lib/ronin/web/spider/exceptions.rb
|
81
|
+
- lib/ronin/web/spider/git_archive.rb
|
82
|
+
- lib/ronin/web/spider/version.rb
|
83
|
+
- ronin-web-spider.gemspec
|
84
|
+
- spec/agent_spec.rb
|
85
|
+
- spec/archive_spec.rb
|
86
|
+
- spec/example_app.rb
|
87
|
+
- spec/git_archive_spec.rb
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
- spec/spider_spec.rb
|
90
|
+
homepage: https://ronin-rb.dev/
|
91
|
+
licenses:
|
92
|
+
- LGPL-3.0
|
93
|
+
metadata:
|
94
|
+
documentation_uri: https://rubydoc.info/gems/ronin-web-spider
|
95
|
+
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
96
|
+
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
97
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
|
98
|
+
rubygems_mfa_required: 'true'
|
99
|
+
post_install_message:
|
100
|
+
rdoc_options: []
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 3.0.0
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
requirements: []
|
114
|
+
rubygems_version: 3.3.26
|
115
|
+
signing_key:
|
116
|
+
specification_version: 4
|
117
|
+
summary: collection of common web spidering routines
|
118
|
+
test_files:
|
119
|
+
- spec/agent_spec.rb
|
120
|
+
- spec/archive_spec.rb
|
121
|
+
- spec/git_archive_spec.rb
|
122
|
+
- spec/spider_spec.rb
|