ronin-web-spider 0.1.0.beta1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # ronin-web-spider - A collection of common web spidering routines.
3
3
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
7
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +22,286 @@ require 'ronin/web/spider/version'
22
22
 
23
23
  module Ronin
24
24
  module Web
25
+ #
26
+ # A collection of common web spidering routines using the [spidr] gem.
27
+ #
28
+ # [spidr]: https://github.com/postmodern/spidr#readme
29
+ #
30
+ # ## Examples
31
+ #
32
+ # Spider a website, starting at a given URL:
33
+ #
34
+ # ```ruby
35
+ # require 'ronin/web/spider'
36
+ #
37
+ # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
38
+ # # ...
39
+ # end
40
+ # ```
41
+ #
42
+ # Spider a host:
43
+ #
44
+ # ```ruby
45
+ # Ronin::Web::Spider.host('solnic.eu') do |agent|
46
+ # # ...
47
+ # end
48
+ # ```
49
+ #
50
+ # Spider a domain (and any sub-domains):
51
+ #
52
+ # ```ruby
53
+ # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
54
+ # # ...
55
+ # end
56
+ # ```
57
+ #
58
+ # Spider a site:
59
+ #
60
+ # ```ruby
61
+ # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
62
+ # # ...
63
+ # end
64
+ # ```
65
+ #
66
+ # Spider multiple hosts:
67
+ #
68
+ # ```ruby
69
+ # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
70
+ # # ...
71
+ # end
72
+ # ```
73
+ #
74
+ # Do not spider certain links:
75
+ #
76
+ # ```ruby
77
+ # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
78
+ # # ...
79
+ # end
80
+ # ```
81
+ #
82
+ # Do not spider links on certain ports:
83
+ #
84
+ # ```ruby
85
+ # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
86
+ # # ...
87
+ # end
88
+ # ```
89
+ #
90
+ # Do not spider links blacklisted in robots.txt:
91
+ #
92
+ # ```ruby
93
+ # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
94
+ # # ...
95
+ # end
96
+ # ```
97
+ #
98
+ # Print out visited URLs:
99
+ #
100
+ # ```ruby
101
+ # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
102
+ # spider.every_url { |url| puts url }
103
+ # end
104
+ # ```
105
+ #
106
+ # Build a URL map of a site:
107
+ #
108
+ # ```ruby
109
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
110
+ #
111
+ # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
112
+ # spider.every_link do |origin,dest|
113
+ # url_map[dest] << origin
114
+ # end
115
+ # end
116
+ # ```
117
+ #
118
+ # Print out the URLs that could not be requested:
119
+ #
120
+ # ```ruby
121
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
122
+ # spider.every_failed_url { |url| puts url }
123
+ # end
124
+ # ```
125
+ #
126
+ # Finds all pages which have broken links:
127
+ #
128
+ # ```ruby
129
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
130
+ #
131
+ # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
132
+ # spider.every_link do |origin,dest|
133
+ # url_map[dest] << origin
134
+ # end
135
+ # end
136
+ #
137
+ # spider.failures.each do |url|
138
+ # puts "Broken link #{url} found in:"
139
+ #
140
+ # url_map[url].each { |page| puts " #{page}" }
141
+ # end
142
+ # ```
143
+ #
144
+ # Search HTML and XML pages:
145
+ #
146
+ # ```ruby
147
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
148
+ # spider.every_page do |page|
149
+ # puts ">>> #{page.url}"
150
+ #
151
+ # page.search('//meta').each do |meta|
152
+ # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
153
+ # value = meta.attributes['content']
154
+ #
155
+ # puts " #{name} = #{value}"
156
+ # end
157
+ # end
158
+ # end
159
+ # ```
160
+ #
161
+ # Print out the titles from every page:
162
+ #
163
+ # ```ruby
164
+ # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
165
+ # spider.every_html_page do |page|
166
+ # puts page.title
167
+ # end
168
+ # end
169
+ # ```
170
+ #
171
+ # Print out every HTTP redirect:
172
+ #
173
+ # ```ruby
174
+ # Ronin::Web::Spider.host('company.com') do |spider|
175
+ # spider.every_redirect_page do |page|
176
+ # puts "#{page.url} -> #{page.headers['Location']}"
177
+ # end
178
+ # end
179
+ # ```
180
+ #
181
+ # Find what kinds of web servers a host is using, by accessing the headers:
182
+ #
183
+ # ```ruby
184
+ # servers = Set[]
185
+ #
186
+ # Ronin::Web::Spider.host('company.com') do |spider|
187
+ # spider.all_headers do |headers|
188
+ # servers << headers['server']
189
+ # end
190
+ # end
191
+ # ```
192
+ #
193
+ # Pause the spider on a forbidden page:
194
+ #
195
+ # ```ruby
196
+ # Ronin::Web::Spider.host('company.com') do |spider|
197
+ # spider.every_forbidden_page do |page|
198
+ # spider.pause!
199
+ # end
200
+ # end
201
+ # ```
202
+ #
203
+ # Skip the processing of a page:
204
+ #
205
+ # ```ruby
206
+ # Ronin::Web::Spider.host('company.com') do |spider|
207
+ # spider.every_missing_page do |page|
208
+ # spider.skip_page!
209
+ # end
210
+ # end
211
+ # ```
212
+ #
213
+ # Skip the processing of links:
214
+ #
215
+ # ```ruby
216
+ # Ronin::Web::Spider.host('company.com') do |spider|
217
+ # spider.every_url do |url|
218
+ # if url.path.split('/').find { |dir| dir.to_i > 1000 }
219
+ # spider.skip_link!
220
+ # end
221
+ # end
222
+ # end
223
+ # ```
224
+ #
225
+ # Detect when a new host name is spidered:
226
+ #
227
+ # ```ruby
228
+ # Ronin::Web::Spider.domain('example.com') do |spider|
229
+ # spider.every_host do |host|
230
+ # puts "Spidering #{host} ..."
231
+ # end
232
+ # end
233
+ # ```
234
+ #
235
+ # Detect when a new SSL/TLS certificate is encountered:
236
+ #
237
+ # ```ruby
238
+ # Ronin::Web::Spider.domain('example.com') do |spider|
239
+ # spider.every_cert do |cert|
240
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
241
+ # end
242
+ # end
243
+ # ```
244
+ #
245
+ # Print the MD5 checksum of every `favicon.ico` file:
246
+ #
247
+ # ```ruby
248
+ # Ronin::Web::Spider.domain('example.com') do |spider|
249
+ # spider.every_favicon do |page|
250
+ # puts "#{page.url}: #{page.body.md5}"
251
+ # end
252
+ # end
253
+ # ```
254
+ #
255
+ # Print every HTML comment:
256
+ #
257
+ # ```ruby
258
+ # Ronin::Web::Spider.domain('example.com') do |spider|
259
+ # spider.every_html_comment do |comment|
260
+ # puts comment
261
+ # end
262
+ # end
263
+ # ```
264
+ #
265
+ # Print all JavaScript source code:
266
+ #
267
+ # ```ruby
268
+ # Ronin::Web::Spider.domain('example.com') do |spider|
269
+ # spider.every_javascript do |js|
270
+ # puts js
271
+ # end
272
+ # end
273
+ # ```
274
+ #
275
+ # Print every JavaScript string literal:
276
+ #
277
+ # ```ruby
278
+ # Ronin::Web::Spider.domain('example.com') do |spider|
279
+ # spider.every_javascript_string do |str|
280
+ # puts str
281
+ # end
282
+ # end
283
+ # ```
284
+ #
285
+ # Print every JavaScript comment:
286
+ #
287
+ # ```ruby
288
+ # Ronin::Web::Spider.domain('example.com') do |spider|
289
+ # spider.every_javascript_comment do |comment|
290
+ # puts comment
291
+ # end
292
+ # end
293
+ # ```
294
+ #
295
+ # Print every HTML and JavaScript comment:
296
+ #
297
+ # ```ruby
298
+ # Ronin::Web::Spider.domain('example.com') do |spider|
299
+ # spider.every_comment do |comment|
300
+ # puts comment
301
+ # end
302
+ # end
303
+ # ```
304
+ #
25
305
  module Spider
26
306
  #
27
307
  # Creates a new agent and begin spidering at the given URL.
@@ -41,6 +321,8 @@ module Ronin
41
321
  #
42
322
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
43
323
  #
324
+ # @api public
325
+ #
44
326
  def self.start_at(url,**kwargs,&block)
45
327
  Agent.start_at(url,**kwargs,&block)
46
328
  end
@@ -63,6 +345,8 @@ module Ronin
63
345
  #
64
346
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
65
347
  #
348
+ # @api public
349
+ #
66
350
  def self.host(name,**kwargs,&block)
67
351
  Agent.host(name,**kwargs,&block)
68
352
  end
@@ -85,6 +369,8 @@ module Ronin
85
369
  #
86
370
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
87
371
  #
372
+ # @api public
373
+ #
88
374
  def self.site(url,**kwargs,&block)
89
375
  Agent.site(url,**kwargs,&block)
90
376
  end
@@ -107,6 +393,8 @@ module Ronin
107
393
  #
108
394
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
109
395
  #
396
+ # @api public
397
+ #
110
398
  def self.domain(name,**kwargs,&block)
111
399
  Agent.domain(name,**kwargs,&block)
112
400
  end
@@ -27,13 +27,14 @@ Gem::Specification.new do |gem|
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
29
29
  gem.files += Array(gemspec['generated_files'])
30
+ # exclude test files from the packaged gem
31
+ gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
30
32
 
31
33
  gem.executables = gemspec.fetch('executables') do
32
34
  glob['bin/*'].map { |path| File.basename(path) }
33
35
  end
34
36
 
35
37
  gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
36
- gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
37
38
  gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
38
39
 
39
40
  gem.require_paths = Array(gemspec.fetch('require_paths') {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-01 00:00:00.000000000 Z
11
+ date: 2023-02-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -81,20 +81,14 @@ files:
81
81
  - lib/ronin/web/spider/git_archive.rb
82
82
  - lib/ronin/web/spider/version.rb
83
83
  - ronin-web-spider.gemspec
84
- - spec/agent_spec.rb
85
- - spec/archive_spec.rb
86
- - spec/example_app.rb
87
- - spec/git_archive_spec.rb
88
- - spec/spec_helper.rb
89
- - spec/spider_spec.rb
90
84
  homepage: https://ronin-rb.dev/
91
85
  licenses:
92
86
  - LGPL-3.0
93
87
  metadata:
94
- documentation_uri: https://rubydoc.info/gems/ronin-web-spider
88
+ documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
95
89
  source_code_uri: https://github.com/ronin-rb/ronin-web-spider
96
90
  bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
97
- changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
91
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
98
92
  rubygems_mfa_required: 'true'
99
93
  post_install_message:
100
94
  rdoc_options: []
@@ -115,8 +109,4 @@ rubygems_version: 3.3.26
115
109
  signing_key:
116
110
  specification_version: 4
117
111
  summary: collection of common web spidering routines
118
- test_files:
119
- - spec/agent_spec.rb
120
- - spec/archive_spec.rb
121
- - spec/git_archive_spec.rb
122
- - spec/spider_spec.rb
112
+ test_files: []