ronin-web-spider 0.1.0.beta2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -4
- data/.yardopts +1 -1
- data/ChangeLog.md +3 -0
- data/README.md +302 -30
- data/gemspec.yml +3 -3
- data/lib/ronin/web/spider/agent.rb +62 -2
- data/lib/ronin/web/spider/archive.rb +3 -0
- data/lib/ronin/web/spider/exceptions.rb +1 -1
- data/lib/ronin/web/spider/git_archive.rb +1 -1
- data/lib/ronin/web/spider/version.rb +2 -2
- data/lib/ronin/web/spider.rb +289 -1
- data/ronin-web-spider.gemspec +2 -1
- metadata +7 -17
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +22,286 @@ require 'ronin/web/spider/version'
|
|
22
22
|
|
23
23
|
module Ronin
|
24
24
|
module Web
|
25
|
+
#
|
26
|
+
# A collection of common web spidering routines using the [spidr] gem.
|
27
|
+
#
|
28
|
+
# [spidr]: https://github.com/postmodern/spidr#readme
|
29
|
+
#
|
30
|
+
# ## Examples
|
31
|
+
#
|
32
|
+
# Start spidering from a URL:
|
33
|
+
#
|
34
|
+
# ```ruby
|
35
|
+
# require 'ronin/web/spider'
|
36
|
+
#
|
37
|
+
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
38
|
+
# # ...
|
39
|
+
# end
|
40
|
+
# ```
|
41
|
+
#
|
42
|
+
# Spider a host:
|
43
|
+
#
|
44
|
+
# ```ruby
|
45
|
+
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
46
|
+
# # ...
|
47
|
+
# end
|
48
|
+
# ```
|
49
|
+
#
|
50
|
+
# Spider a domain (and any sub-domains):
|
51
|
+
#
|
52
|
+
# ```ruby
|
53
|
+
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
54
|
+
# # ...
|
55
|
+
# end
|
56
|
+
# ```
|
57
|
+
#
|
58
|
+
# Spider a site:
|
59
|
+
#
|
60
|
+
# ```ruby
|
61
|
+
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
62
|
+
# # ...
|
63
|
+
# end
|
64
|
+
# ```
|
65
|
+
#
|
66
|
+
# Spider multiple hosts:
|
67
|
+
#
|
68
|
+
# ```ruby
|
69
|
+
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
70
|
+
# # ...
|
71
|
+
# end
|
72
|
+
# ```
|
73
|
+
#
|
74
|
+
# Do not spider certain links:
|
75
|
+
#
|
76
|
+
# ```ruby
|
77
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
78
|
+
# # ...
|
79
|
+
# end
|
80
|
+
# ```
|
81
|
+
#
|
82
|
+
# Do not spider links on certain ports:
|
83
|
+
#
|
84
|
+
# ```ruby
|
85
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
86
|
+
# # ...
|
87
|
+
# end
|
88
|
+
# ```
|
89
|
+
#
|
90
|
+
# Do not spider links blacklisted in robots.txt:
|
91
|
+
#
|
92
|
+
# ```ruby
|
93
|
+
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
94
|
+
# # ...
|
95
|
+
# end
|
96
|
+
# ```
|
97
|
+
#
|
98
|
+
# Print out visited URLs:
|
99
|
+
#
|
100
|
+
# ```ruby
|
101
|
+
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
102
|
+
# spider.every_url { |url| puts url }
|
103
|
+
# end
|
104
|
+
# ```
|
105
|
+
#
|
106
|
+
# Build a URL map of a site:
|
107
|
+
#
|
108
|
+
# ```ruby
|
109
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
110
|
+
#
|
111
|
+
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
112
|
+
# spider.every_link do |origin,dest|
|
113
|
+
# url_map[dest] << origin
|
114
|
+
# end
|
115
|
+
# end
|
116
|
+
# ```
|
117
|
+
#
|
118
|
+
# Print out the URLs that could not be requested:
|
119
|
+
#
|
120
|
+
# ```ruby
|
121
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
122
|
+
# spider.every_failed_url { |url| puts url }
|
123
|
+
# end
|
124
|
+
# ```
|
125
|
+
#
|
126
|
+
# Find all pages which have broken links:
|
127
|
+
#
|
128
|
+
# ```ruby
|
129
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
130
|
+
#
|
131
|
+
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
132
|
+
# spider.every_link do |origin,dest|
|
133
|
+
# url_map[dest] << origin
|
134
|
+
# end
|
135
|
+
# end
|
136
|
+
#
|
137
|
+
# spider.failures.each do |url|
|
138
|
+
# puts "Broken link #{url} found in:"
|
139
|
+
#
|
140
|
+
# url_map[url].each { |page| puts " #{page}" }
|
141
|
+
# end
|
142
|
+
# ```
|
143
|
+
#
|
144
|
+
# Search HTML and XML pages:
|
145
|
+
#
|
146
|
+
# ```ruby
|
147
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
148
|
+
# spider.every_page do |page|
|
149
|
+
# puts ">>> #{page.url}"
|
150
|
+
#
|
151
|
+
# page.search('//meta').each do |meta|
|
152
|
+
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
153
|
+
# value = meta.attributes['content']
|
154
|
+
#
|
155
|
+
# puts " #{name} = #{value}"
|
156
|
+
# end
|
157
|
+
# end
|
158
|
+
# end
|
159
|
+
# ```
|
160
|
+
#
|
161
|
+
# Print out the titles from every page:
|
162
|
+
#
|
163
|
+
# ```ruby
|
164
|
+
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
165
|
+
# spider.every_html_page do |page|
|
166
|
+
# puts page.title
|
167
|
+
# end
|
168
|
+
# end
|
169
|
+
# ```
|
170
|
+
#
|
171
|
+
# Print out every HTTP redirect:
|
172
|
+
#
|
173
|
+
# ```ruby
|
174
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
175
|
+
# spider.every_redirect_page do |page|
|
176
|
+
# puts "#{page.url} -> #{page.headers['Location']}"
|
177
|
+
# end
|
178
|
+
# end
|
179
|
+
# ```
|
180
|
+
#
|
181
|
+
# Find what kinds of web servers a host is using, by accessing the headers:
|
182
|
+
#
|
183
|
+
# ```ruby
|
184
|
+
# servers = Set[]
|
185
|
+
#
|
186
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
187
|
+
# spider.all_headers do |headers|
|
188
|
+
# servers << headers['server']
|
189
|
+
# end
|
190
|
+
# end
|
191
|
+
# ```
|
192
|
+
#
|
193
|
+
# Pause the spider on a forbidden page:
|
194
|
+
#
|
195
|
+
# ```ruby
|
196
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
197
|
+
# spider.every_forbidden_page do |page|
|
198
|
+
# spider.pause!
|
199
|
+
# end
|
200
|
+
# end
|
201
|
+
# ```
|
202
|
+
#
|
203
|
+
# Skip the processing of a page:
|
204
|
+
#
|
205
|
+
# ```ruby
|
206
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
207
|
+
# spider.every_missing_page do |page|
|
208
|
+
# spider.skip_page!
|
209
|
+
# end
|
210
|
+
# end
|
211
|
+
# ```
|
212
|
+
#
|
213
|
+
# Skip the processing of links:
|
214
|
+
#
|
215
|
+
# ```ruby
|
216
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
217
|
+
# spider.every_url do |url|
|
218
|
+
# if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
219
|
+
# spider.skip_link!
|
220
|
+
# end
|
221
|
+
# end
|
222
|
+
# end
|
223
|
+
# ```
|
224
|
+
#
|
225
|
+
# Detect when a new host name is spidered:
|
226
|
+
#
|
227
|
+
# ```ruby
|
228
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
229
|
+
# spider.every_host do |host|
|
230
|
+
# puts "Spidering #{host} ..."
|
231
|
+
# end
|
232
|
+
# end
|
233
|
+
# ```
|
234
|
+
#
|
235
|
+
# Detect when a new SSL/TLS certificate is encountered:
|
236
|
+
#
|
237
|
+
# ```ruby
|
238
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
239
|
+
# spider.every_cert do |cert|
|
240
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
241
|
+
# end
|
242
|
+
# end
|
243
|
+
# ```
|
244
|
+
#
|
245
|
+
# Print the MD5 checksum of every `favicon.ico` file:
|
246
|
+
#
|
247
|
+
# ```ruby
|
248
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
249
|
+
# spider.every_favicon do |page|
|
250
|
+
# puts "#{page.url}: #{page.body.md5}"
|
251
|
+
# end
|
252
|
+
# end
|
253
|
+
# ```
|
254
|
+
#
|
255
|
+
# Print every HTML comment:
|
256
|
+
#
|
257
|
+
# ```ruby
|
258
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
259
|
+
# spider.every_html_comment do |comment|
|
260
|
+
# puts comment
|
261
|
+
# end
|
262
|
+
# end
|
263
|
+
# ```
|
264
|
+
#
|
265
|
+
# Print all JavaScript source code:
|
266
|
+
#
|
267
|
+
# ```ruby
|
268
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
269
|
+
# spider.every_javascript do |js|
|
270
|
+
# puts js
|
271
|
+
# end
|
272
|
+
# end
|
273
|
+
# ```
|
274
|
+
#
|
275
|
+
# Print every JavaScript string literal:
|
276
|
+
#
|
277
|
+
# ```ruby
|
278
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
279
|
+
# spider.every_javascript_string do |str|
|
280
|
+
# puts str
|
281
|
+
# end
|
282
|
+
# end
|
283
|
+
# ```
|
284
|
+
#
|
285
|
+
# Print every JavaScript comment:
|
286
|
+
#
|
287
|
+
# ```ruby
|
288
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
289
|
+
# spider.every_javascript_comment do |comment|
|
290
|
+
# puts comment
|
291
|
+
# end
|
292
|
+
# end
|
293
|
+
# ```
|
294
|
+
#
|
295
|
+
# Print every HTML and JavaScript comment:
|
296
|
+
#
|
297
|
+
# ```ruby
|
298
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
299
|
+
# spider.every_comment do |comment|
|
300
|
+
# puts comment
|
301
|
+
# end
|
302
|
+
# end
|
303
|
+
# ```
|
304
|
+
#
|
25
305
|
module Spider
|
26
306
|
#
|
27
307
|
# Creates a new agent and begin spidering at the given URL.
|
@@ -41,6 +321,8 @@ module Ronin
|
|
41
321
|
#
|
42
322
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
|
43
323
|
#
|
324
|
+
# @api public
|
325
|
+
#
|
44
326
|
def self.start_at(url,**kwargs,&block)
|
45
327
|
Agent.start_at(url,**kwargs,&block)
|
46
328
|
end
|
@@ -63,6 +345,8 @@ module Ronin
|
|
63
345
|
#
|
64
346
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
|
65
347
|
#
|
348
|
+
# @api public
|
349
|
+
#
|
66
350
|
def self.host(name,**kwargs,&block)
|
67
351
|
Agent.host(name,**kwargs,&block)
|
68
352
|
end
|
@@ -85,6 +369,8 @@ module Ronin
|
|
85
369
|
#
|
86
370
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
|
87
371
|
#
|
372
|
+
# @api public
|
373
|
+
#
|
88
374
|
def self.site(url,**kwargs,&block)
|
89
375
|
Agent.site(url,**kwargs,&block)
|
90
376
|
end
|
@@ -107,6 +393,8 @@ module Ronin
|
|
107
393
|
#
|
108
394
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
|
109
395
|
#
|
396
|
+
# @api public
|
397
|
+
#
|
110
398
|
def self.domain(name,**kwargs,&block)
|
111
399
|
Agent.domain(name,**kwargs,&block)
|
112
400
|
end
|
data/ronin-web-spider.gemspec
CHANGED
@@ -27,13 +27,14 @@ Gem::Specification.new do |gem|
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
29
29
|
gem.files += Array(gemspec['generated_files'])
|
30
|
+
# exclude test files from the packaged gem
|
31
|
+
gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
|
30
32
|
|
31
33
|
gem.executables = gemspec.fetch('executables') do
|
32
34
|
glob['bin/*'].map { |path| File.basename(path) }
|
33
35
|
end
|
34
36
|
|
35
37
|
gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
|
36
|
-
gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
|
37
38
|
gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
|
38
39
|
|
39
40
|
gem.require_paths = Array(gemspec.fetch('require_paths') {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ronin-web-spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.beta2
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.0
|
33
|
+
version: '1.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.0
|
40
|
+
version: '1.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: bundler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -81,20 +81,14 @@ files:
|
|
81
81
|
- lib/ronin/web/spider/git_archive.rb
|
82
82
|
- lib/ronin/web/spider/version.rb
|
83
83
|
- ronin-web-spider.gemspec
|
84
|
-
- spec/agent_spec.rb
|
85
|
-
- spec/archive_spec.rb
|
86
|
-
- spec/example_app.rb
|
87
|
-
- spec/git_archive_spec.rb
|
88
|
-
- spec/spec_helper.rb
|
89
|
-
- spec/spider_spec.rb
|
90
84
|
homepage: https://ronin-rb.dev/
|
91
85
|
licenses:
|
92
86
|
- LGPL-3.0
|
93
87
|
metadata:
|
94
|
-
documentation_uri: https://
|
88
|
+
documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
|
95
89
|
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
96
90
|
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
97
|
-
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/
|
91
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
|
98
92
|
rubygems_mfa_required: 'true'
|
99
93
|
post_install_message:
|
100
94
|
rdoc_options: []
|
@@ -115,8 +109,4 @@ rubygems_version: 3.3.26
|
|
115
109
|
signing_key:
|
116
110
|
specification_version: 4
|
117
111
|
summary: collection of common web spidering routines
|
118
|
-
test_files:
|
119
|
-
- spec/agent_spec.rb
|
120
|
-
- spec/archive_spec.rb
|
121
|
-
- spec/git_archive_spec.rb
|
122
|
-
- spec/spider_spec.rb
|
112
|
+
test_files: []
|