ronin-web-spider 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/git_archive'
3
+
4
+ require 'tmpdir'
5
+
6
+ describe Ronin::Web::Spider::GitArchive do
7
+ let(:root) { File.join(Dir.mktmpdir('ronin-web-spider')) }
8
+
9
+ describe ".open" do
10
+ subject { described_class }
11
+
12
+ context "when the root directory does not already exist" do
13
+ let(:root) { File.join(Dir.tmpdir,'ronin-web-spider-new-dir') }
14
+
15
+ it "must run `git init` on the new archive directory" do
16
+ subject.open(root)
17
+
18
+ expect(File.directory?(File.join(root,'.git'))).to be(true)
19
+ end
20
+
21
+ after { FileUtils.rm_r(root) }
22
+ end
23
+
24
+ context "when the root directory already exists" do
25
+ context "but does not contain a .git directory" do
26
+ it "must run `git init` within the root directory" do
27
+ subject.open(root)
28
+
29
+ expect(File.directory?(File.join(root,'.git'))).to be(true)
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ subject { described_class.open(root) }
36
+
37
+ describe "#git?" do
38
+ subject { described_class.new(root) }
39
+
40
+ context "when the archive directory contains a .git directory" do
41
+ before do
42
+ FileUtils.mkdir(File.join(root,'.git'))
43
+ end
44
+
45
+ it "must return true" do
46
+ expect(subject.git?).to be(true)
47
+ end
48
+ end
49
+
50
+ context "when the archive directory does not contain a .git directory" do
51
+ it "must return false" do
52
+ expect(subject.git?).to be(false)
53
+ end
54
+ end
55
+ end
56
+
57
+ describe "#init" do
58
+ it "must run the 'git init' command" do
59
+ expect(subject).to receive(:system).with('git','-C',root,'init').and_return(true)
60
+
61
+ subject.init
62
+ end
63
+
64
+ context "when the 'git init' command fails" do
65
+ it do
66
+ allow(subject).to receive(:system).with('git','-C',root,'init').and_return(false)
67
+
68
+ expect {
69
+ subject.init
70
+ }.to raise_error(Ronin::Web::Spider::GitError,"git command failed: git -C #{root} init")
71
+ end
72
+ end
73
+
74
+ context "when 'git' is not installed" do
75
+ it do
76
+ allow(subject).to receive(:system).with('git','-C',root,'init').and_return(nil)
77
+
78
+ expect {
79
+ subject.init
80
+ }.to raise_error(Ronin::Web::Spider::GitError,"the git command was not found")
81
+ end
82
+ end
83
+ end
84
+
85
+ describe "#write" do
86
+ let(:url) { URI('https://example.com/foo/bar.html') }
87
+ let(:body) { 'test file' }
88
+
89
+ it "must automatically create parent directory" do
90
+ subject.write(url,body)
91
+
92
+ expect(File.directory?(File.join(root,'foo'))).to be(true)
93
+ end
94
+
95
+ it "must write the body into the file" do
96
+ subject.write(url,body)
97
+
98
+ expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
99
+ end
100
+
101
+ it "must add the file using `git add`" do
102
+ absolute_path = File.join(root,'foo','bar.html')
103
+
104
+ expect(subject).to receive(:system).with(
105
+ 'git', '-C', root, 'add', absolute_path
106
+ ).and_return(true)
107
+
108
+ subject.write(url,body)
109
+ end
110
+ end
111
+
112
+ describe "#commit" do
113
+ let(:message) { 'commit message' }
114
+
115
+ context "when a block is given" do
116
+ it "must yield control before calling `git commit -m ...` with the commit message" do
117
+ expect(subject).to receive(:system).with(
118
+ 'git', '-C', root, 'commit', '-m', message
119
+ ).and_return(true)
120
+
121
+ expect { |b|
122
+ subject.commit(message,&b)
123
+ }.to yield_with_args(subject)
124
+ end
125
+ end
126
+
127
+ context "when no block is given" do
128
+ it "must not yield and call `git commit -m ...` with the commit message" do
129
+ expect(subject).to receive(:system).with(
130
+ 'git', '-C', root, 'commit', '-m', message
131
+ ).and_return(true)
132
+
133
+ subject.commit(message)
134
+ end
135
+ end
136
+ end
137
+ end
@@ -0,0 +1,4 @@
1
+ require 'rspec'
2
+ require 'simplecov'
3
+
4
+ SimpleCov.start
@@ -0,0 +1,252 @@
1
+ require 'spec_helper'
2
+ require 'example_app'
3
+
4
+ require 'ronin/web/spider'
5
+
6
+ describe Ronin::Web::Spider do
7
+ include_context "example App"
8
+
9
+ describe ".start_at" do
10
+ module TestAgentStartAt
11
+ class ExampleApp < Sinatra::Base
12
+
13
+ set :host, 'example.com'
14
+ set :port, 80
15
+
16
+ get '/' do
17
+ '<html><body>should not get here</body></html>'
18
+ end
19
+
20
+ get '/entry-point' do
21
+ <<~HTML
22
+ <html>
23
+ <body>
24
+ <a href="/link1">link1</a>
25
+ <a href="http://other.com/offsite-link">offsite link</a>
26
+ <a href="/link2">link2</a>
27
+ </body>
28
+ </html>
29
+ HTML
30
+ end
31
+
32
+ get '/link1' do
33
+ '<html><body>got here</body></html>'
34
+ end
35
+
36
+ get '/link2' do
37
+ '<html><body>got here</body></html>'
38
+ end
39
+ end
40
+
41
+ class OtherApp < Sinatra::Base
42
+
43
+ set :host, 'other.com'
44
+ set :port, 80
45
+
46
+ get '/offsite-link' do
47
+ '<html><body>should not get here</body></html>'
48
+ end
49
+
50
+ end
51
+ end
52
+
53
+ subject { described_class }
54
+
55
+ let(:host) { 'example.com' }
56
+ let(:other_host) { 'other.com' }
57
+ let(:url) { URI("http://#{host}/entry-point") }
58
+
59
+ let(:app) { TestAgentStartAt::ExampleApp }
60
+ let(:other_app) { TestAgentStartAt::OtherApp }
61
+
62
+ before do
63
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
64
+ stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
65
+ end
66
+
67
+ it "must spider the website starting at the given URL" do
68
+ agent = subject.start_at(url)
69
+
70
+ expect(agent.history).to be == Set[
71
+ URI("http://#{host}/entry-point"),
72
+ URI("http://#{host}/link1"),
73
+ URI("http://#{other_host}/offsite-link"),
74
+ URI("http://#{host}/link2")
75
+ ]
76
+ end
77
+ end
78
+
79
+ describe ".site" do
80
+ module TestAgentSite
81
+ class ExampleApp < Sinatra::Base
82
+
83
+ set :host, 'example.com'
84
+ set :port, 80
85
+
86
+ get '/' do
87
+ '<html><body>should not get here</body></html>'
88
+ end
89
+
90
+ get '/entry-point' do
91
+ <<~HTML
92
+ <html>
93
+ <body>
94
+ <a href="/link1">link1</a>
95
+ <a href="http://other.com/offsite-link">offsite link</a>
96
+ <a href="/link2">link2</a>
97
+ </body>
98
+ </html>
99
+ HTML
100
+ end
101
+
102
+ get '/link1' do
103
+ '<html><body>got here</body></html>'
104
+ end
105
+
106
+ get '/link2' do
107
+ '<html><body>got here</body></html>'
108
+ end
109
+
110
+ end
111
+ end
112
+
113
+ subject { described_class }
114
+
115
+ let(:host) { 'example.com' }
116
+ let(:url) { URI("http://#{host}/entry-point") }
117
+
118
+ let(:app) { TestAgentSite::ExampleApp }
119
+
120
+ before do
121
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
122
+ end
123
+
124
+ it "must spider the website starting at the given URL" do
125
+ agent = subject.site(url)
126
+
127
+ expect(agent.history).to be == Set[
128
+ URI("http://#{host}/entry-point"),
129
+ URI("http://#{host}/link1"),
130
+ URI("http://#{host}/link2")
131
+ ]
132
+ end
133
+ end
134
+
135
+ describe ".host" do
136
+ module TestAgentHost
137
+ class ExampleApp < Sinatra::Base
138
+
139
+ set :host, 'example.com'
140
+ set :port, 80
141
+
142
+ get '/' do
143
+ <<~HTML
144
+ <html>
145
+ <body>
146
+ <a href="/link1">link1</a>
147
+ <a href="http://other.com/offsite-link">offsite link</a>
148
+ <a href="/link2">link2</a>
149
+ </body>
150
+ </html>
151
+ HTML
152
+ end
153
+
154
+ get '/link1' do
155
+ '<html><body>got here</body></html>'
156
+ end
157
+
158
+ get '/link2' do
159
+ '<html><body>got here</body></html>'
160
+ end
161
+
162
+ end
163
+ end
164
+
165
+ subject { described_class }
166
+
167
+ let(:host) { 'example.com' }
168
+ let(:app) { TestAgentHost::ExampleApp }
169
+
170
+ before do
171
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
172
+ end
173
+
174
+ it "must spider the website starting at the given host" do
175
+ agent = subject.host(host)
176
+
177
+ # XXX: for some reason Set#== was returning false here, so compare as an Array instead (note: this also makes the assertion order-sensitive)
178
+ expect(agent.history.to_a).to be == [
179
+ URI("http://#{host}/"),
180
+ URI("http://#{host}/link1"),
181
+ URI("http://#{host}/link2")
182
+ ]
183
+ end
184
+ end
185
+
186
+ describe ".domain" do
187
+ module TestAgentDomain
188
+ class ExampleApp < Sinatra::Base
189
+
190
+ set :host, 'example.com'
191
+ set :port, 80
192
+
193
+ get '/' do
194
+ <<~HTML
195
+ <html>
196
+ <body>
197
+ <a href="/link1">link1</a>
198
+ <a href="http://sub.example.com/subdomain-link">subdomain link</a>
199
+ <a href="/link2">link2</a>
200
+ </body>
201
+ </html>
202
+ HTML
203
+ end
204
+
205
+ get '/link1' do
206
+ '<html><body>got here</body></html>'
207
+ end
208
+
209
+ get '/link2' do
210
+ '<html><body>got here</body></html>'
211
+ end
212
+
213
+ end
214
+
215
+ class SubDomainApp < Sinatra::Base
216
+
217
+ set :host, 'sub.example.com'
218
+ set :port, 80
219
+
220
+ get '/subdomain-link' do
221
+ '<html><body>should get here</body></html>'
222
+ end
223
+
224
+ end
225
+ end
226
+
227
+ subject { described_class }
228
+
229
+ let(:domain) { 'example.com' }
230
+ let(:domain_app) { TestAgentDomain::ExampleApp }
231
+
232
+ let(:subdomain) { 'sub.example.com' }
233
+ let(:subdomain_app) { TestAgentDomain::SubDomainApp }
234
+
235
+ before do
236
+ stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
237
+ stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
238
+ end
239
+
240
+ it "must spider the domain and subdomains starting at the given domain" do
241
+ agent = subject.domain(domain)
242
+
243
+ # XXX: for some reason Set#== was returning false here, so compare as an Array instead (note: this also makes the assertion order-sensitive)
244
+ expect(agent.history.to_a).to be == [
245
+ URI("http://#{domain}/"),
246
+ URI("http://#{domain}/link1"),
247
+ URI("http://#{subdomain}/subdomain-link"),
248
+ URI("http://#{domain}/link2")
249
+ ]
250
+ end
251
+ end
252
+ end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ronin-web-spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.beta1
5
+ platform: ruby
6
+ authors:
7
+ - Postmodern
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-01-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: spidr
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ronin-support
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.0'
55
+ description: ronin-web-spider is a collection of common web spidering routines using
56
+ the spidr gem.
57
+ email: postmodern.mod3@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files:
61
+ - COPYING.txt
62
+ - ChangeLog.md
63
+ - README.md
64
+ files:
65
+ - ".document"
66
+ - ".github/workflows/ruby.yml"
67
+ - ".gitignore"
68
+ - ".rspec"
69
+ - ".ruby-version"
70
+ - ".yardopts"
71
+ - COPYING.txt
72
+ - ChangeLog.md
73
+ - Gemfile
74
+ - README.md
75
+ - Rakefile
76
+ - gemspec.yml
77
+ - lib/ronin/web/spider.rb
78
+ - lib/ronin/web/spider/agent.rb
79
+ - lib/ronin/web/spider/archive.rb
80
+ - lib/ronin/web/spider/exceptions.rb
81
+ - lib/ronin/web/spider/git_archive.rb
82
+ - lib/ronin/web/spider/version.rb
83
+ - ronin-web-spider.gemspec
84
+ - spec/agent_spec.rb
85
+ - spec/archive_spec.rb
86
+ - spec/example_app.rb
87
+ - spec/git_archive_spec.rb
88
+ - spec/spec_helper.rb
89
+ - spec/spider_spec.rb
90
+ homepage: https://ronin-rb.dev/
91
+ licenses:
92
+ - LGPL-3.0
93
+ metadata:
94
+ documentation_uri: https://rubydoc.info/gems/ronin-web-spider
95
+ source_code_uri: https://github.com/ronin-rb/ronin-web-spider
96
+ bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
97
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
98
+ rubygems_mfa_required: 'true'
99
+ post_install_message:
100
+ rdoc_options: []
101
+ require_paths:
102
+ - lib
103
+ required_ruby_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: 3.0.0
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ requirements: []
114
+ rubygems_version: 3.3.26
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: collection of common web spidering routines
118
+ test_files:
119
+ - spec/agent_spec.rb
120
+ - spec/archive_spec.rb
121
+ - spec/git_archive_spec.rb
122
+ - spec/spider_spec.rb