ronin-web-spider 0.1.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.github/workflows/ruby.yml +31 -0
- data/.gitignore +13 -0
- data/.rspec +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/COPYING.txt +165 -0
- data/ChangeLog.md +19 -0
- data/Gemfile +31 -0
- data/README.md +139 -0
- data/Rakefile +31 -0
- data/gemspec.yml +27 -0
- data/lib/ronin/web/spider/agent.rb +302 -0
- data/lib/ronin/web/spider/archive.rb +116 -0
- data/lib/ronin/web/spider/exceptions.rb +36 -0
- data/lib/ronin/web/spider/git_archive.rb +194 -0
- data/lib/ronin/web/spider/version.rb +27 -0
- data/lib/ronin/web/spider.rb +115 -0
- data/ronin-web-spider.gemspec +61 -0
- data/spec/agent_spec.rb +585 -0
- data/spec/archive_spec.rb +91 -0
- data/spec/example_app.rb +27 -0
- data/spec/git_archive_spec.rb +137 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spider_spec.rb +252 -0
- metadata +122 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'ronin/web/spider/git_archive'
|
3
|
+
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
describe Ronin::Web::Spider::GitArchive do
|
7
|
+
# Scratch directory used as the archive root by each example.
# Dir.mktmpdir already returns the path as a String, so it is used directly.
let(:root) { Dir.mktmpdir('ronin-web-spider') }

# Remove the scratch directory once the example has finished, so repeated
# runs do not pile up directories under the system temp dir. rm_rf is used
# because a nested context may already have removed its own root.
after { FileUtils.rm_rf(root) }
|
8
|
+
|
9
|
+
describe ".open" do
  # .open is a class method, so the class itself is the subject here.
  subject { described_class }

  context "when the root directory does not already exist" do
    # A fixed path under the system temp dir (rather than a freshly created
    # one), so that the directory is absent when .open is called.
    let(:root) { File.join(Dir.tmpdir,'ronin-web-spider-new-dir') }

    # Enforce the precondition even if an earlier, interrupted run left the
    # directory behind.
    before { FileUtils.rm_rf(root) }

    # rm_rf instead of rm_r: if .open fails before creating the directory,
    # cleanup must not raise Errno::ENOENT on top of the real failure.
    after { FileUtils.rm_rf(root) }

    it "must run `git init` on the new archive directory" do
      subject.open(root)

      expect(File.directory?(File.join(root,'.git'))).to be(true)
    end
  end

  context "when the root directory already exists" do
    context "but does not contain a .git directory" do
      it "must run `git init` within the root directory" do
        subject.open(root)

        expect(File.directory?(File.join(root,'.git'))).to be(true)
      end
    end
  end
end
|
34
|
+
|
35
|
+
# Default subject for the examples below: an archive opened at root.
# Individual describe blocks may override it.
subject { described_class.open(root) }
|
36
|
+
|
37
|
+
describe "#git?" do
  # Built with .new rather than .open, so each example decides for itself
  # whether a .git directory is present inside root.
  subject { described_class.new(root) }

  context "when the archive directory contains a .git directory" do
    before do
      FileUtils.mkdir(File.join(root,'.git'))
    end

    it "must return true" do
      expect(subject.git?).to be(true)
    end
  end

  context "when the archive directory does not contain a .git directory" do
    it "must return false" do
      expect(subject.git?).to be(false)
    end
  end
end
|
56
|
+
|
57
|
+
describe "#init" do
  it "must run the 'git init' command" do
    expect(subject).to receive(:system).with('git','-C',root,'init').and_return(true)

    subject.init
  end

  context "when the 'git init' command fails" do
    it "must raise a GitError naming the failed command" do
      # Kernel#system returns false when the command exits with a non-zero status.
      allow(subject).to receive(:system).with('git','-C',root,'init').and_return(false)

      expect {
        subject.init
      }.to raise_error(Ronin::Web::Spider::GitError,"git command failed: git -C #{root} init")
    end
  end

  context "when 'git' is not installed" do
    it "must raise a GitError stating that git was not found" do
      # Kernel#system returns nil when the command could not be executed at all.
      allow(subject).to receive(:system).with('git','-C',root,'init').and_return(nil)

      expect {
        subject.init
      }.to raise_error(Ronin::Web::Spider::GitError,"the git command was not found")
    end
  end
end
|
84
|
+
|
85
|
+
describe "#write" do
  let(:body) { 'test file' }
  let(:url)  { URI('https://example.com/foo/bar.html') }

  # Where the URL's path is expected to end up underneath the archive root.
  let(:archived_dir)  { File.join(root,'foo') }
  let(:archived_file) { File.join(archived_dir,'bar.html') }

  it "must automatically create parent directory" do
    subject.write(url,body)

    expect(File.directory?(archived_dir)).to be(true)
  end

  it "must write the body into the file" do
    subject.write(url,body)

    expect(File.read(archived_file)).to eq(body)
  end

  it "must add the file using `git add`" do
    # The git invocation is stubbed; only its arguments are checked.
    expect(subject).to receive(:system).with(
      'git', '-C', root, 'add', archived_file
    ).and_return(true)

    subject.write(url,body)
  end
end
|
111
|
+
|
112
|
+
describe "#commit" do
  let(:message) { 'commit message' }

  # Argument list of the `git commit` invocation that both contexts expect.
  let(:git_commit_args) { ['git', '-C', root, 'commit', '-m', message] }

  context "when a block is given" do
    it "must yield control before calling `git commit -m ...` with the commit message" do
      expect(subject).to receive(:system).with(*git_commit_args).and_return(true)

      expect { |probe| subject.commit(message,&probe) }.to yield_with_args(subject)
    end
  end

  context "when no block is given" do
    it "must not yield and call `git commit -m ...` with the commit message" do
      expect(subject).to receive(:system).with(*git_commit_args).and_return(true)

      subject.commit(message)
    end
  end
end
|
137
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/spider_spec.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_app'
|
3
|
+
|
4
|
+
require 'ronin/web/spider'
|
5
|
+
|
6
|
+
describe Ronin::Web::Spider do
|
7
|
+
include_context "example App"
|
8
|
+
|
9
|
+
describe ".start_at" do
  # NOTE(review): a module opened inside a describe block is still defined as
  # a top-level constant, so these fixture names are visible to other specs.
  module TestAgentStartAt
    # Fixture site served for example.com.
    class ExampleApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      # Not linked from /entry-point and absent from the expected history.
      get '/' do
        '<html><body>should not get here</body></html>'
      end

      get '/entry-point' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://other.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        '<html><body>got here</body></html>'
      end
    end

    # Fixture site served for other.com.
    class OtherApp < Sinatra::Base

      set :host, 'other.com'
      set :port, 80

      # The example below expects this off-site URL in the history, so the
      # page body says so (it previously read "should not get here").
      get '/offsite-link' do
        '<html><body>should get here</body></html>'
      end

    end
  end

  subject { described_class }

  let(:host)       { 'example.com' }
  let(:other_host) { 'other.com' }
  let(:url)        { URI("http://#{host}/entry-point") }

  let(:app)       { TestAgentStartAt::ExampleApp }
  let(:other_app) { TestAgentStartAt::OtherApp }

  before do
    # Route HTTP requests for each host to its Rack fixture app.
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
    stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
  end

  it "must spider the website starting at the given URL" do
    agent = subject.start_at(url)

    expect(agent.history).to be == Set[
      URI("http://#{host}/entry-point"),
      URI("http://#{host}/link1"),
      URI("http://#{other_host}/offsite-link"),
      URI("http://#{host}/link2")
    ]
  end
end
|
78
|
+
|
79
|
+
describe ".site" do
  module TestAgentSite
    # Fixture site served for example.com; its entry point also links off-site.
    class ExampleApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        '<html><body>should not get here</body></html>'
      end

      get '/entry-point' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://other.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      # Both on-site link targets return the same page.
      %w[/link1 /link2].each do |route|
        get route do
          '<html><body>got here</body></html>'
        end
      end

    end
  end

  subject { described_class }

  let(:app)  { TestAgentSite::ExampleApp }
  let(:host) { 'example.com' }
  let(:url)  { URI("http://#{host}/entry-point") }

  before do
    # Only example.com is stubbed; no stub is registered for other.com.
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
  end

  it "must spider the website starting at the given URL" do
    spider = subject.site(url)

    visited = %w[/entry-point /link1 /link2].map do |path|
      URI("http://#{host}#{path}")
    end

    expect(spider.history).to eq(Set.new(visited))
  end
end
|
134
|
+
|
135
|
+
describe ".host" do
  module TestAgentHost
    # Fixture site served for example.com; its index page also links off-site.
    class ExampleApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://other.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        '<html><body>got here</body></html>'
      end

    end
  end

  subject { described_class }

  let(:host) { 'example.com' }
  let(:app)  { TestAgentHost::ExampleApp }

  before do
    # Only the target host is stubbed; no stub is registered for other.com.
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
  end

  # .host is given a host name rather than a URL, so the description says so.
  it "must spider the website on the given host" do
    agent = subject.host(host)

    # XXX: for some reason Set#== was returning false, so convert to an Array
    # NOTE(review): possibly because Set compares members with #eql?/#hash,
    # while Array#== uses URI#==, which compares normalized URIs -- confirm.
    expect(agent.history.to_a).to be == [
      URI("http://#{host}/"),
      URI("http://#{host}/link1"),
      URI("http://#{host}/link2")
    ]
  end
end
|
185
|
+
|
186
|
+
describe ".domain" do
  module TestAgentDomain
    # Fixture site served for the bare domain; its index links to the subdomain.
    class ExampleApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://sub.example.com/subdomain-link">subdomain link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        '<html><body>got here</body></html>'
      end

    end

    # Fixture site served for sub.example.com.
    class SubDomainApp < Sinatra::Base

      set :host, 'sub.example.com'
      set :port, 80

      get '/subdomain-link' do
        '<html><body>should get here</body></html>'
      end

    end
  end

  subject { described_class }

  let(:domain)     { 'example.com' }
  let(:domain_app) { TestAgentDomain::ExampleApp }

  let(:subdomain)     { 'sub.example.com' }
  let(:subdomain_app) { TestAgentDomain::SubDomainApp }

  before do
    # The domain pattern also matches "sub.example.com" URLs, and WebMock
    # applies the last declared stub that matches a request. The subdomain
    # stub is therefore registered last, so SubDomainApp answers its own
    # requests instead of being shadowed by the domain app.
    stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
    stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
  end

  it "must spider the domain and subdomains starting at the given domain" do
    agent = subject.domain(domain)

    # XXX: for some reason Set#== was returning false, so convert to an Array
    # NOTE(review): possibly because Set compares members with #eql?/#hash,
    # while Array#== uses URI#==, which compares normalized URIs -- confirm.
    expect(agent.history.to_a).to be == [
      URI("http://#{domain}/"),
      URI("http://#{domain}/link1"),
      URI("http://#{subdomain}/subdomain-link"),
      URI("http://#{domain}/link2")
    ]
  end
end
|
252
|
+
end
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ronin-web-spider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.beta1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Postmodern
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-01-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: spidr
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ronin-support
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.0'
|
55
|
+
description: ronin-web-spider is a collection of common web spidering routines using
|
56
|
+
the spidr gem.
|
57
|
+
email: postmodern.mod3@gmail.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files:
|
61
|
+
- COPYING.txt
|
62
|
+
- ChangeLog.md
|
63
|
+
- README.md
|
64
|
+
files:
|
65
|
+
- ".document"
|
66
|
+
- ".github/workflows/ruby.yml"
|
67
|
+
- ".gitignore"
|
68
|
+
- ".rspec"
|
69
|
+
- ".ruby-version"
|
70
|
+
- ".yardopts"
|
71
|
+
- COPYING.txt
|
72
|
+
- ChangeLog.md
|
73
|
+
- Gemfile
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- gemspec.yml
|
77
|
+
- lib/ronin/web/spider.rb
|
78
|
+
- lib/ronin/web/spider/agent.rb
|
79
|
+
- lib/ronin/web/spider/archive.rb
|
80
|
+
- lib/ronin/web/spider/exceptions.rb
|
81
|
+
- lib/ronin/web/spider/git_archive.rb
|
82
|
+
- lib/ronin/web/spider/version.rb
|
83
|
+
- ronin-web-spider.gemspec
|
84
|
+
- spec/agent_spec.rb
|
85
|
+
- spec/archive_spec.rb
|
86
|
+
- spec/example_app.rb
|
87
|
+
- spec/git_archive_spec.rb
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
- spec/spider_spec.rb
|
90
|
+
homepage: https://ronin-rb.dev/
|
91
|
+
licenses:
|
92
|
+
- LGPL-3.0
|
93
|
+
metadata:
|
94
|
+
documentation_uri: https://rubydoc.info/gems/ronin-web-spider
|
95
|
+
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
96
|
+
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
97
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
|
98
|
+
rubygems_mfa_required: 'true'
|
99
|
+
post_install_message:
|
100
|
+
rdoc_options: []
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 3.0.0
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
requirements: []
|
114
|
+
rubygems_version: 3.3.26
|
115
|
+
signing_key:
|
116
|
+
specification_version: 4
|
117
|
+
summary: collection of common web spidering routines
|
118
|
+
test_files:
|
119
|
+
- spec/agent_spec.rb
|
120
|
+
- spec/archive_spec.rb
|
121
|
+
- spec/git_archive_spec.rb
|
122
|
+
- spec/spider_spec.rb
|