ronin-web-spider 0.1.0.beta2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # ronin-web-spider - A collection of common web spidering routines.
3
3
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
7
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +22,286 @@ require 'ronin/web/spider/version'
22
22
 
23
23
  module Ronin
24
24
  module Web
25
+ #
26
+ # A collection of common web spidering routines using the [spidr] gem.
27
+ #
28
+ # [spidr]: https://github.com/postmodern/spidr#readme
29
+ #
30
+ # ## Examples
31
+ #
32
+ # Start spidering from a URL:
33
+ #
34
+ # ```ruby
35
+ # require 'ronin/web/spider'
36
+ #
37
+ # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
38
+ # # ...
39
+ # end
40
+ # ```
41
+ #
42
+ # Spider a host:
43
+ #
44
+ # ```ruby
45
+ # Ronin::Web::Spider.host('solnic.eu') do |agent|
46
+ # # ...
47
+ # end
48
+ # ```
49
+ #
50
+ # Spider a domain (and any sub-domains):
51
+ #
52
+ # ```ruby
53
+ # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
54
+ # # ...
55
+ # end
56
+ # ```
57
+ #
58
+ # Spider a site:
59
+ #
60
+ # ```ruby
61
+ # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
62
+ # # ...
63
+ # end
64
+ # ```
65
+ #
66
+ # Spider multiple hosts:
67
+ #
68
+ # ```ruby
69
+ # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
70
+ # # ...
71
+ # end
72
+ # ```
73
+ #
74
+ # Do not spider certain links:
75
+ #
76
+ # ```ruby
77
+ # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
78
+ # # ...
79
+ # end
80
+ # ```
81
+ #
82
+ # Do not spider links on certain ports:
83
+ #
84
+ # ```ruby
85
+ # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
86
+ # # ...
87
+ # end
88
+ # ```
89
+ #
90
+ # Do not spider links blacklisted in robots.txt:
91
+ #
92
+ # ```ruby
93
+ # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
94
+ # # ...
95
+ # end
96
+ # ```
97
+ #
98
+ # Print out visited URLs:
99
+ #
100
+ # ```ruby
101
+ # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
102
+ # spider.every_url { |url| puts url }
103
+ # end
104
+ # ```
105
+ #
106
+ # Build a URL map of a site:
107
+ #
108
+ # ```ruby
109
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
110
+ #
111
+ # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
112
+ # spider.every_link do |origin,dest|
113
+ # url_map[dest] << origin
114
+ # end
115
+ # end
116
+ # ```
117
+ #
118
+ # Print out the URLs that could not be requested:
119
+ #
120
+ # ```ruby
121
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
122
+ # spider.every_failed_url { |url| puts url }
123
+ # end
124
+ # ```
125
+ #
126
+ # Find all pages which have broken links:
127
+ #
128
+ # ```ruby
129
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
130
+ #
131
+ # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
132
+ # spider.every_link do |origin,dest|
133
+ # url_map[dest] << origin
134
+ # end
135
+ # end
136
+ #
137
+ # spider.failures.each do |url|
138
+ # puts "Broken link #{url} found in:"
139
+ #
140
+ # url_map[url].each { |page| puts " #{page}" }
141
+ # end
142
+ # ```
143
+ #
144
+ # Search HTML and XML pages:
145
+ #
146
+ # ```ruby
147
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
148
+ # spider.every_page do |page|
149
+ # puts ">>> #{page.url}"
150
+ #
151
+ # page.search('//meta').each do |meta|
152
+ # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
153
+ # value = meta.attributes['content']
154
+ #
155
+ # puts " #{name} = #{value}"
156
+ # end
157
+ # end
158
+ # end
159
+ # ```
160
+ #
161
+ # Print out the titles from every page:
162
+ #
163
+ # ```ruby
164
+ # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
165
+ # spider.every_html_page do |page|
166
+ # puts page.title
167
+ # end
168
+ # end
169
+ # ```
170
+ #
171
+ # Print out every HTTP redirect:
172
+ #
173
+ # ```ruby
174
+ # Ronin::Web::Spider.host('company.com') do |spider|
175
+ # spider.every_redirect_page do |page|
176
+ # puts "#{page.url} -> #{page.headers['Location']}"
177
+ # end
178
+ # end
179
+ # ```
180
+ #
181
+ # Find what kinds of web servers a host is using, by accessing the headers:
182
+ #
183
+ # ```ruby
184
+ # servers = Set[]
185
+ #
186
+ # Ronin::Web::Spider.host('company.com') do |spider|
187
+ # spider.all_headers do |headers|
188
+ # servers << headers['server']
189
+ # end
190
+ # end
191
+ # ```
192
+ #
193
+ # Pause the spider on a forbidden page:
194
+ #
195
+ # ```ruby
196
+ # Ronin::Web::Spider.host('company.com') do |spider|
197
+ # spider.every_forbidden_page do |page|
198
+ # spider.pause!
199
+ # end
200
+ # end
201
+ # ```
202
+ #
203
+ # Skip the processing of a page:
204
+ #
205
+ # ```ruby
206
+ # Ronin::Web::Spider.host('company.com') do |spider|
207
+ # spider.every_missing_page do |page|
208
+ # spider.skip_page!
209
+ # end
210
+ # end
211
+ # ```
212
+ #
213
+ # Skip the processing of links:
214
+ #
215
+ # ```ruby
216
+ # Ronin::Web::Spider.host('company.com') do |spider|
217
+ # spider.every_url do |url|
218
+ # if url.path.split('/').find { |dir| dir.to_i > 1000 }
219
+ # spider.skip_link!
220
+ # end
221
+ # end
222
+ # end
223
+ # ```
224
+ #
225
+ # Detect when a new host name is spidered:
226
+ #
227
+ # ```ruby
228
+ # Ronin::Web::Spider.domain('example.com') do |spider|
229
+ # spider.every_host do |host|
230
+ # puts "Spidering #{host} ..."
231
+ # end
232
+ # end
233
+ # ```
234
+ #
235
+ # Detect when a new SSL/TLS certificate is encountered:
236
+ #
237
+ # ```ruby
238
+ # Ronin::Web::Spider.domain('example.com') do |spider|
239
+ # spider.every_cert do |cert|
240
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
241
+ # end
242
+ # end
243
+ # ```
244
+ #
245
+ # Print the MD5 checksum of every `favicon.ico` file:
246
+ #
247
+ # ```ruby
248
+ # Ronin::Web::Spider.domain('example.com') do |spider|
249
+ # spider.every_favicon do |page|
250
+ # puts "#{page.url}: #{page.body.md5}"
251
+ # end
252
+ # end
253
+ # ```
254
+ #
255
+ # Print every HTML comment:
256
+ #
257
+ # ```ruby
258
+ # Ronin::Web::Spider.domain('example.com') do |spider|
259
+ # spider.every_html_comment do |comment|
260
+ # puts comment
261
+ # end
262
+ # end
263
+ # ```
264
+ #
265
+ # Print all JavaScript source code:
266
+ #
267
+ # ```ruby
268
+ # Ronin::Web::Spider.domain('example.com') do |spider|
269
+ # spider.every_javascript do |js|
270
+ # puts js
271
+ # end
272
+ # end
273
+ # ```
274
+ #
275
+ # Print every JavaScript string literal:
276
+ #
277
+ # ```ruby
278
+ # Ronin::Web::Spider.domain('example.com') do |spider|
279
+ # spider.every_javascript_string do |str|
280
+ # puts str
281
+ # end
282
+ # end
283
+ # ```
284
+ #
285
+ # Print every JavaScript comment:
286
+ #
287
+ # ```ruby
288
+ # Ronin::Web::Spider.domain('example.com') do |spider|
289
+ # spider.every_javascript_comment do |comment|
290
+ # puts comment
291
+ # end
292
+ # end
293
+ # ```
294
+ #
295
+ # Print every HTML and JavaScript comment:
296
+ #
297
+ # ```ruby
298
+ # Ronin::Web::Spider.domain('example.com') do |spider|
299
+ # spider.every_comment do |comment|
300
+ # puts comment
301
+ # end
302
+ # end
303
+ # ```
304
+ #
25
305
  module Spider
26
306
  #
27
307
  # Creates a new agent and begin spidering at the given URL.
@@ -41,6 +321,8 @@ module Ronin
41
321
  #
42
322
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
43
323
  #
324
+ # @api public
325
+ #
44
326
  def self.start_at(url,**kwargs,&block)
45
327
  Agent.start_at(url,**kwargs,&block)
46
328
  end
@@ -63,6 +345,8 @@ module Ronin
63
345
  #
64
346
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
65
347
  #
348
+ # @api public
349
+ #
66
350
  def self.host(name,**kwargs,&block)
67
351
  Agent.host(name,**kwargs,&block)
68
352
  end
@@ -85,6 +369,8 @@ module Ronin
85
369
  #
86
370
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
87
371
  #
372
+ # @api public
373
+ #
88
374
  def self.site(url,**kwargs,&block)
89
375
  Agent.site(url,**kwargs,&block)
90
376
  end
@@ -107,6 +393,8 @@ module Ronin
107
393
  #
108
394
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
109
395
  #
396
+ # @api public
397
+ #
110
398
  def self.domain(name,**kwargs,&block)
111
399
  Agent.domain(name,**kwargs,&block)
112
400
  end
@@ -27,13 +27,14 @@ Gem::Specification.new do |gem|
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
29
29
  gem.files += Array(gemspec['generated_files'])
30
+ # exclude test files from the packaged gem
31
+ gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
30
32
 
31
33
  gem.executables = gemspec.fetch('executables') do
32
34
  glob['bin/*'].map { |path| File.basename(path) }
33
35
  end
34
36
 
35
37
  gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
36
- gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
37
38
  gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
38
39
 
39
40
  gem.require_paths = Array(gemspec.fetch('require_paths') {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta2
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-01 00:00:00.000000000 Z
11
+ date: 2023-02-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.0.0.beta1
33
+ version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.0.0.beta1
40
+ version: '1.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -81,20 +81,14 @@ files:
81
81
  - lib/ronin/web/spider/git_archive.rb
82
82
  - lib/ronin/web/spider/version.rb
83
83
  - ronin-web-spider.gemspec
84
- - spec/agent_spec.rb
85
- - spec/archive_spec.rb
86
- - spec/example_app.rb
87
- - spec/git_archive_spec.rb
88
- - spec/spec_helper.rb
89
- - spec/spider_spec.rb
90
84
  homepage: https://ronin-rb.dev/
91
85
  licenses:
92
86
  - LGPL-3.0
93
87
  metadata:
94
- documentation_uri: https://rubydoc.info/gems/ronin-web-spider
88
+ documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
95
89
  source_code_uri: https://github.com/ronin-rb/ronin-web-spider
96
90
  bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
97
- changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
91
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
98
92
  rubygems_mfa_required: 'true'
99
93
  post_install_message:
100
94
  rdoc_options: []
@@ -115,8 +109,4 @@ rubygems_version: 3.3.26
115
109
  signing_key:
116
110
  specification_version: 4
117
111
  summary: collection of common web spidering routines
118
- test_files:
119
- - spec/agent_spec.rb
120
- - spec/archive_spec.rb
121
- - spec/git_archive_spec.rb
122
- - spec/spider_spec.rb
112
+ test_files: []