ronin-web-spider 0.1.0.beta1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -4
- data/.yardopts +1 -1
- data/ChangeLog.md +3 -0
- data/Gemfile +2 -2
- data/README.md +302 -30
- data/gemspec.yml +2 -2
- data/lib/ronin/web/spider/agent.rb +62 -2
- data/lib/ronin/web/spider/archive.rb +3 -0
- data/lib/ronin/web/spider/exceptions.rb +1 -1
- data/lib/ronin/web/spider/git_archive.rb +1 -1
- data/lib/ronin/web/spider/version.rb +2 -2
- data/lib/ronin/web/spider.rb +289 -1
- data/ronin-web-spider.gemspec +2 -1
- metadata +5 -15
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +22,286 @@ require 'ronin/web/spider/version'
|
|
22
22
|
|
23
23
|
module Ronin
|
24
24
|
module Web
|
25
|
+
#
|
26
|
+
# A collection of common web spidering routines using the [spidr] gem.
|
27
|
+
#
|
28
|
+
# [spidr]: https://github.com/postmodern/spidr#readme
|
29
|
+
#
|
30
|
+
# ## Examples
|
31
|
+
#
|
32
|
+
# Start spidering from a URL:
|
33
|
+
#
|
34
|
+
# ```ruby
|
35
|
+
# require 'ronin/web/spider'
|
36
|
+
#
|
37
|
+
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
38
|
+
# # ...
|
39
|
+
# end
|
40
|
+
# ```
|
41
|
+
#
|
42
|
+
# Spider a host:
|
43
|
+
#
|
44
|
+
# ```ruby
|
45
|
+
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
46
|
+
# # ...
|
47
|
+
# end
|
48
|
+
# ```
|
49
|
+
#
|
50
|
+
# Spider a domain (and any sub-domains):
|
51
|
+
#
|
52
|
+
# ```ruby
|
53
|
+
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
54
|
+
# # ...
|
55
|
+
# end
|
56
|
+
# ```
|
57
|
+
#
|
58
|
+
# Spider a site:
|
59
|
+
#
|
60
|
+
# ```ruby
|
61
|
+
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
62
|
+
# # ...
|
63
|
+
# end
|
64
|
+
# ```
|
65
|
+
#
|
66
|
+
# Spider multiple hosts:
|
67
|
+
#
|
68
|
+
# ```ruby
|
69
|
+
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
70
|
+
# # ...
|
71
|
+
# end
|
72
|
+
# ```
|
73
|
+
#
|
74
|
+
# Do not spider certain links:
|
75
|
+
#
|
76
|
+
# ```ruby
|
77
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
78
|
+
# # ...
|
79
|
+
# end
|
80
|
+
# ```
|
81
|
+
#
|
82
|
+
# Do not spider links on certain ports:
|
83
|
+
#
|
84
|
+
# ```ruby
|
85
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
86
|
+
# # ...
|
87
|
+
# end
|
88
|
+
# ```
|
89
|
+
#
|
90
|
+
# Do not spider links blacklisted in robots.txt:
|
91
|
+
#
|
92
|
+
# ```ruby
|
93
|
+
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
94
|
+
# # ...
|
95
|
+
# end
|
96
|
+
# ```
|
97
|
+
#
|
98
|
+
# Print out visited URLs:
|
99
|
+
#
|
100
|
+
# ```ruby
|
101
|
+
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
102
|
+
# spider.every_url { |url| puts url }
|
103
|
+
# end
|
104
|
+
# ```
|
105
|
+
#
|
106
|
+
# Build a URL map of a site:
|
107
|
+
#
|
108
|
+
# ```ruby
|
109
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
110
|
+
#
|
111
|
+
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
112
|
+
# spider.every_link do |origin,dest|
|
113
|
+
# url_map[dest] << origin
|
114
|
+
# end
|
115
|
+
# end
|
116
|
+
# ```
|
117
|
+
#
|
118
|
+
# Print out the URLs that could not be requested:
|
119
|
+
#
|
120
|
+
# ```ruby
|
121
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
122
|
+
# spider.every_failed_url { |url| puts url }
|
123
|
+
# end
|
124
|
+
# ```
|
125
|
+
#
|
126
|
+
# Find all pages which have broken links:
|
127
|
+
#
|
128
|
+
# ```ruby
|
129
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
130
|
+
#
|
131
|
+
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
132
|
+
# spider.every_link do |origin,dest|
|
133
|
+
# url_map[dest] << origin
|
134
|
+
# end
|
135
|
+
# end
|
136
|
+
#
|
137
|
+
# spider.failures.each do |url|
|
138
|
+
# puts "Broken link #{url} found in:"
|
139
|
+
#
|
140
|
+
# url_map[url].each { |page| puts " #{page}" }
|
141
|
+
# end
|
142
|
+
# ```
|
143
|
+
#
|
144
|
+
# Search HTML and XML pages:
|
145
|
+
#
|
146
|
+
# ```ruby
|
147
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
148
|
+
# spider.every_page do |page|
|
149
|
+
# puts ">>> #{page.url}"
|
150
|
+
#
|
151
|
+
# page.search('//meta').each do |meta|
|
152
|
+
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
153
|
+
# value = meta.attributes['content']
|
154
|
+
#
|
155
|
+
# puts " #{name} = #{value}"
|
156
|
+
# end
|
157
|
+
# end
|
158
|
+
# end
|
159
|
+
# ```
|
160
|
+
#
|
161
|
+
# Print out the titles from every page:
|
162
|
+
#
|
163
|
+
# ```ruby
|
164
|
+
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
165
|
+
# spider.every_html_page do |page|
|
166
|
+
# puts page.title
|
167
|
+
# end
|
168
|
+
# end
|
169
|
+
# ```
|
170
|
+
#
|
171
|
+
# Print out every HTTP redirect:
|
172
|
+
#
|
173
|
+
# ```ruby
|
174
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
175
|
+
# spider.every_redirect_page do |page|
|
176
|
+
# puts "#{page.url} -> #{page.headers['Location']}"
|
177
|
+
# end
|
178
|
+
# end
|
179
|
+
# ```
|
180
|
+
#
|
181
|
+
# Find what kinds of web servers a host is using, by accessing the headers:
|
182
|
+
#
|
183
|
+
# ```ruby
|
184
|
+
# servers = Set[]
|
185
|
+
#
|
186
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
187
|
+
# spider.all_headers do |headers|
|
188
|
+
# servers << headers['server']
|
189
|
+
# end
|
190
|
+
# end
|
191
|
+
# ```
|
192
|
+
#
|
193
|
+
# Pause the spider on a forbidden page:
|
194
|
+
#
|
195
|
+
# ```ruby
|
196
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
197
|
+
# spider.every_forbidden_page do |page|
|
198
|
+
# spider.pause!
|
199
|
+
# end
|
200
|
+
# end
|
201
|
+
# ```
|
202
|
+
#
|
203
|
+
# Skip the processing of a page:
|
204
|
+
#
|
205
|
+
# ```ruby
|
206
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
207
|
+
# spider.every_missing_page do |page|
|
208
|
+
# spider.skip_page!
|
209
|
+
# end
|
210
|
+
# end
|
211
|
+
# ```
|
212
|
+
#
|
213
|
+
# Skip the processing of links:
|
214
|
+
#
|
215
|
+
# ```ruby
|
216
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
217
|
+
# spider.every_url do |url|
|
218
|
+
# if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
219
|
+
# spider.skip_link!
|
220
|
+
# end
|
221
|
+
# end
|
222
|
+
# end
|
223
|
+
# ```
|
224
|
+
#
|
225
|
+
# Detect when a new host name is spidered:
|
226
|
+
#
|
227
|
+
# ```ruby
|
228
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
229
|
+
# spider.every_host do |host|
|
230
|
+
# puts "Spidering #{host} ..."
|
231
|
+
# end
|
232
|
+
# end
|
233
|
+
# ```
|
234
|
+
#
|
235
|
+
# Detect when a new SSL/TLS certificate is encountered:
|
236
|
+
#
|
237
|
+
# ```ruby
|
238
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
239
|
+
# spider.every_cert do |cert|
|
240
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
241
|
+
# end
|
242
|
+
# end
|
243
|
+
# ```
|
244
|
+
#
|
245
|
+
# Print the MD5 checksum of every `favicon.ico` file:
|
246
|
+
#
|
247
|
+
# ```ruby
|
248
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
249
|
+
# spider.every_favicon do |page|
|
250
|
+
# puts "#{page.url}: #{page.body.md5}"
|
251
|
+
# end
|
252
|
+
# end
|
253
|
+
# ```
|
254
|
+
#
|
255
|
+
# Print every HTML comment:
|
256
|
+
#
|
257
|
+
# ```ruby
|
258
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
259
|
+
# spider.every_html_comment do |comment|
|
260
|
+
# puts comment
|
261
|
+
# end
|
262
|
+
# end
|
263
|
+
# ```
|
264
|
+
#
|
265
|
+
# Print all JavaScript source code:
|
266
|
+
#
|
267
|
+
# ```ruby
|
268
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
269
|
+
# spider.every_javascript do |js|
|
270
|
+
# puts js
|
271
|
+
# end
|
272
|
+
# end
|
273
|
+
# ```
|
274
|
+
#
|
275
|
+
# Print every JavaScript string literal:
|
276
|
+
#
|
277
|
+
# ```ruby
|
278
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
279
|
+
# spider.every_javascript_string do |str|
|
280
|
+
# puts str
|
281
|
+
# end
|
282
|
+
# end
|
283
|
+
# ```
|
284
|
+
#
|
285
|
+
# Print every JavaScript comment:
|
286
|
+
#
|
287
|
+
# ```ruby
|
288
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
289
|
+
# spider.every_javascript_comment do |comment|
|
290
|
+
# puts comment
|
291
|
+
# end
|
292
|
+
# end
|
293
|
+
# ```
|
294
|
+
#
|
295
|
+
# Print every HTML and JavaScript comment:
|
296
|
+
#
|
297
|
+
# ```ruby
|
298
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
299
|
+
# spider.every_comment do |comment|
|
300
|
+
# puts comment
|
301
|
+
# end
|
302
|
+
# end
|
303
|
+
# ```
|
304
|
+
#
|
25
305
|
module Spider
|
26
306
|
#
|
27
307
|
# Creates a new agent and begins spidering at the given URL.
|
@@ -41,6 +321,8 @@ module Ronin
|
|
41
321
|
#
|
42
322
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
|
43
323
|
#
|
324
|
+
# @api public
|
325
|
+
#
|
44
326
|
def self.start_at(url,**kwargs,&block)
|
45
327
|
Agent.start_at(url,**kwargs,&block)
|
46
328
|
end
|
@@ -63,6 +345,8 @@ module Ronin
|
|
63
345
|
#
|
64
346
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
|
65
347
|
#
|
348
|
+
# @api public
|
349
|
+
#
|
66
350
|
def self.host(name,**kwargs,&block)
|
67
351
|
Agent.host(name,**kwargs,&block)
|
68
352
|
end
|
@@ -85,6 +369,8 @@ module Ronin
|
|
85
369
|
#
|
86
370
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
|
87
371
|
#
|
372
|
+
# @api public
|
373
|
+
#
|
88
374
|
def self.site(url,**kwargs,&block)
|
89
375
|
Agent.site(url,**kwargs,&block)
|
90
376
|
end
|
@@ -107,6 +393,8 @@ module Ronin
|
|
107
393
|
#
|
108
394
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
|
109
395
|
#
|
396
|
+
# @api public
|
397
|
+
#
|
110
398
|
def self.domain(name,**kwargs,&block)
|
111
399
|
Agent.domain(name,**kwargs,&block)
|
112
400
|
end
|
data/ronin-web-spider.gemspec
CHANGED
@@ -27,13 +27,14 @@ Gem::Specification.new do |gem|
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
29
29
|
gem.files += Array(gemspec['generated_files'])
|
30
|
+
# exclude test files from the packaged gem
|
31
|
+
gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
|
30
32
|
|
31
33
|
gem.executables = gemspec.fetch('executables') do
|
32
34
|
glob['bin/*'].map { |path| File.basename(path) }
|
33
35
|
end
|
34
36
|
|
35
37
|
gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
|
36
|
-
gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
|
37
38
|
gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
|
38
39
|
|
39
40
|
gem.require_paths = Array(gemspec.fetch('require_paths') {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ronin-web-spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -81,20 +81,14 @@ files:
|
|
81
81
|
- lib/ronin/web/spider/git_archive.rb
|
82
82
|
- lib/ronin/web/spider/version.rb
|
83
83
|
- ronin-web-spider.gemspec
|
84
|
-
- spec/agent_spec.rb
|
85
|
-
- spec/archive_spec.rb
|
86
|
-
- spec/example_app.rb
|
87
|
-
- spec/git_archive_spec.rb
|
88
|
-
- spec/spec_helper.rb
|
89
|
-
- spec/spider_spec.rb
|
90
84
|
homepage: https://ronin-rb.dev/
|
91
85
|
licenses:
|
92
86
|
- LGPL-3.0
|
93
87
|
metadata:
|
94
|
-
documentation_uri: https://
|
88
|
+
documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
|
95
89
|
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
96
90
|
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
97
|
-
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/
|
91
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
|
98
92
|
rubygems_mfa_required: 'true'
|
99
93
|
post_install_message:
|
100
94
|
rdoc_options: []
|
@@ -115,8 +109,4 @@ rubygems_version: 3.3.26
|
|
115
109
|
signing_key:
|
116
110
|
specification_version: 4
|
117
111
|
summary: collection of common web spidering routines
|
118
|
-
test_files:
|
119
|
-
- spec/agent_spec.rb
|
120
|
-
- spec/archive_spec.rb
|
121
|
-
- spec/git_archive_spec.rb
|
122
|
-
- spec/spider_spec.rb
|
112
|
+
test_files: []
|