ronin-web-spider 0.1.0.beta2 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -122,6 +123,8 @@ module Ronin
122
123
  # The visited host names.
123
124
  #
124
125
  # @return [Set<String>, nil]
126
+ #
127
+ # @api public
125
128
  attr_reader :visited_hosts
126
129
 
127
130
  #
@@ -132,6 +135,13 @@ module Ronin
132
135
  #
133
136
  # @yieldparam [String] host
134
137
  #
138
+ # @example
139
+ # spider.every_host do |host|
140
+ # puts "Spidering #{host} ..."
141
+ # end
142
+ #
143
+ # @api public
144
+ #
135
145
  def every_host
136
146
  @visited_hosts ||= Set.new
137
147
 
@@ -147,6 +157,8 @@ module Ronin
147
157
  # All certificates encountered while spidering.
148
158
  #
149
159
  # @return [Array<Ronin::Support::Crypto::Cert>]
160
+ #
161
+ # @api public
150
162
  attr_reader :collected_certs
151
163
 
152
164
  #
@@ -157,6 +169,13 @@ module Ronin
157
169
  #
158
170
  # @yieldparam [Ronin::Support::Crypto::Cert]
159
171
  #
172
+ # @example
173
+ # spider.every_cert do |cert|
174
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
175
+ # end
176
+ #
177
+ # @api public
178
+ #
160
179
  def every_cert
161
180
  @collected_certs ||= []
162
181
 
@@ -185,8 +204,15 @@ module Ronin
185
204
  # @yieldparam [Spidr::Page] favicon
186
205
  # An encountered `.ico` file.
187
206
  #
207
+ # @example
208
+ # spider.every_favicon do |page|
209
+ # # ...
210
+ # end
211
+ #
188
212
  # @see https://rubydoc.info/gems/spidr/Spidr/Page
189
213
  #
214
+ # @api public
215
+ #
190
216
  def every_favicon
191
217
  every_page do |page|
192
218
  yield page if page.icon?
@@ -197,14 +223,23 @@ module Ronin
197
223
  # Passes every non-empty HTML comment to the given block.
198
224
  #
199
225
  # @yield [comment]
200
- # The given block will be passevery HTML comment.
226
+ # The given block will be passed every HTML comment.
201
227
  #
202
228
  # @yieldparam [String] comment
203
229
  # The HTML comment inner text, with leading and trailing whitespace
204
230
  # stripped.
205
231
  #
232
+ # @example
233
+ # spider.every_html_comment do |comment|
234
+ # puts comment
235
+ # end
236
+ #
237
+ # @api public
238
+ #
206
239
  def every_html_comment
207
240
  every_html_page do |page|
241
+ next unless page.doc
242
+
208
243
  page.doc.xpath('//comment()').each do |comment|
209
244
  comment_text = comment.inner_text.strip
210
245
 
@@ -224,24 +259,71 @@ module Ronin
224
259
  # @yieldparam [String] js
225
260
  # The JavaScript source code.
226
261
  #
262
+ # @example
263
+ # spider.every_javascript do |js|
264
+ # puts js
265
+ # end
266
+ #
267
+ # @api public
268
+ #
227
269
  def every_javascript
228
270
  # yield inner text of every `<script type="text/javascript">` tag
229
271
  # and every `.js` URL.
230
272
  every_html_page do |page|
273
+ next unless page.doc
274
+
231
275
  page.doc.xpath('//script[@type="text/javascript"]').each do |script|
232
- unless script.inner_text.empty?
233
- yield script.inner_text
276
+ source = script.inner_text
277
+ source.force_encoding(Encoding::UTF_8)
278
+
279
+ unless source.empty?
280
+ yield source
234
281
  end
235
282
  end
236
283
  end
237
284
 
238
285
  every_javascript_page do |page|
239
- yield page.body
286
+ source = page.body
287
+ source.force_encoding(Encoding::UTF_8)
288
+
289
+ yield source
240
290
  end
241
291
  end
242
292
 
243
293
  alias every_js every_javascript
244
294
 
295
+ # Regex to match and skip JavaScript inline regexes.
296
+ #
297
+ # @api private
298
+ #
299
+ # @since 0.1.1
300
+ JAVASCRIPT_INLINE_REGEX = %r{
301
+ (?# match before the regex to avoid matching division operators )
302
+ (?:[\{\[\(;:,]\s*|=\s*)
303
+ /
304
+ (?# inline regex contents )
305
+ (?:
306
+ \[ (?:\\. | [^\]]) \] (?# [...] ) |
307
+ \\. (?# backslash escaped characters ) |
308
+ [^/] (?# everything else )
309
+ )+
310
+ /[dgimsuvy]* (?# also match any regex flags )
311
+ }mx
312
+
313
+ # Regex to match and skip JavaScript template literals.
314
+ #
315
+ # @note
316
+ # This regex will not properly match nested template literals:
317
+ #
318
+ # ```javascript
319
+ # `foo ${`bar ${1+1}`}`
320
+ # ```
321
+ #
322
+ # @api private
323
+ #
324
+ # @since 0.1.1
325
+ JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
326
+
245
327
  #
246
328
  # Passes every JavaScript string value to the given block.
247
329
  #
@@ -252,10 +334,30 @@ module Ronin
252
334
  # @yieldparam [String] string
253
335
  # The parsed contents of a JavaScript string.
254
336
  #
337
+ # @example
338
+ # spider.every_javascript_string do |str|
339
+ # puts str
340
+ # end
341
+ #
342
+ # @api public
343
+ #
255
344
  def every_javascript_string
256
345
  every_javascript do |js|
257
- js.scan(Support::Text::Patterns::STRING) do |js_string|
258
- yield Support::Encoding::JS.unquote(js_string)
346
+ scanner = StringScanner.new(js)
347
+
348
+ until scanner.eos?
349
+ # NOTE: this is a naive JavaScript string scanner and should
350
+ # eventually be replaced with a real JavaScript lexer or parser.
351
+ case scanner.peek(1)
352
+ when '"', "'" # beginning of a quoted string
353
+ js_string = scanner.scan(Support::Text::Patterns::STRING)
354
+
355
+ yield Support::Encoding::JS.unquote(js_string)
356
+ else
357
+ scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
359
+ scanner.getch
360
+ end
259
361
  end
260
362
  end
261
363
  end
@@ -271,6 +373,13 @@ module Ronin
271
373
  # @yieldparam [String] comment
272
374
  # The contents of a JavaScript comment.
273
375
  #
376
+ # @example
377
+ # spider.every_javascript_comment do |comment|
378
+ # puts comment
379
+ # end
380
+ #
381
+ # @api public
382
+ #
274
383
  def every_javascript_comment(&block)
275
384
  every_javascript do |js|
276
385
  js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
@@ -288,9 +397,16 @@ module Ronin
288
397
  # @yieldparam [String] comment
289
398
  # The contents of a HTML or JavaScript comment.
290
399
  #
400
+ # @example
401
+ # spider.every_comment do |comment|
402
+ # puts comment
403
+ # end
404
+ #
291
405
  # @see #every_html_comment
292
406
  # @see #every_javascript_comment
293
407
  #
408
+ # @api public
409
+ #
294
410
  def every_comment(&block)
295
411
  every_html_comment(&block)
296
412
  every_javascript_comment(&block)
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -29,6 +30,9 @@ module Ronin
29
30
  #
30
31
  # Spider a host and archive every web page:
31
32
  #
33
+ # require 'ronin/web/spider'
34
+ # require 'ronin/web/spider/archive'
35
+ #
32
36
  # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
33
37
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
34
38
  # archive.write(page.url,page.body)
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -30,10 +31,10 @@ module Ronin
30
31
  #
31
32
  # Spider a host and archive every web page to a Git repository:
32
33
  #
33
- # require 'ronin/web/spider/git_archive'
34
34
  # require 'ronin/web/spider'
35
+ # require 'ronin/web/spider/git_archive'
35
36
  # require 'date'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
39
  # archive.commit("Updated #{Date.today}") do
39
40
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,7 @@ module Ronin
21
22
  module Web
22
23
  module Spider
23
24
  # ronin-web-spider version
24
- VERSION = '0.1.0.beta2'
25
+ VERSION = '0.1.1'
25
26
  end
26
27
  end
27
28
  end
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +23,286 @@ require 'ronin/web/spider/version'
22
23
 
23
24
  module Ronin
24
25
  module Web
26
+ #
27
+ # A collection of common web spidering routines using the [spidr] gem.
28
+ #
29
+ # [spidr]: https://github.com/postmodern/spidr#readme
30
+ #
31
+ # ## Examples
32
+ #
33
+ # Start spidering from a URL:
34
+ #
35
+ # ```ruby
36
+ # require 'ronin/web/spider'
37
+ #
38
+ # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
39
+ # # ...
40
+ # end
41
+ # ```
42
+ #
43
+ # Spider a host:
44
+ #
45
+ # ```ruby
46
+ # Ronin::Web::Spider.host('solnic.eu') do |agent|
47
+ # # ...
48
+ # end
49
+ # ```
50
+ #
51
+ # Spider a domain (and any sub-domains):
52
+ #
53
+ # ```ruby
54
+ # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
55
+ # # ...
56
+ # end
57
+ # ```
58
+ #
59
+ # Spider a site:
60
+ #
61
+ # ```ruby
62
+ # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
63
+ # # ...
64
+ # end
65
+ # ```
66
+ #
67
+ # Spider multiple hosts:
68
+ #
69
+ # ```ruby
70
+ # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
71
+ # # ...
72
+ # end
73
+ # ```
74
+ #
75
+ # Do not spider certain links:
76
+ #
77
+ # ```ruby
78
+ # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
79
+ # # ...
80
+ # end
81
+ # ```
82
+ #
83
+ # Do not spider links on certain ports:
84
+ #
85
+ # ```ruby
86
+ # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
87
+ # # ...
88
+ # end
89
+ # ```
90
+ #
91
+ # Do not spider links blacklisted in robots.txt:
92
+ #
93
+ # ```ruby
94
+ # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
95
+ # # ...
96
+ # end
97
+ # ```
98
+ #
99
+ # Print out visited URLs:
100
+ #
101
+ # ```ruby
102
+ # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
103
+ # spider.every_url { |url| puts url }
104
+ # end
105
+ # ```
106
+ #
107
+ # Build a URL map of a site:
108
+ #
109
+ # ```ruby
110
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
111
+ #
112
+ # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
113
+ # spider.every_link do |origin,dest|
114
+ # url_map[dest] << origin
115
+ # end
116
+ # end
117
+ # ```
118
+ #
119
+ # Print out the URLs that could not be requested:
120
+ #
121
+ # ```ruby
122
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
123
+ # spider.every_failed_url { |url| puts url }
124
+ # end
125
+ # ```
126
+ #
127
+ # Find all pages which have broken links:
128
+ #
129
+ # ```ruby
130
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
131
+ #
132
+ # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
133
+ # spider.every_link do |origin,dest|
134
+ # url_map[dest] << origin
135
+ # end
136
+ # end
137
+ #
138
+ # spider.failures.each do |url|
139
+ # puts "Broken link #{url} found in:"
140
+ #
141
+ # url_map[url].each { |page| puts " #{page}" }
142
+ # end
143
+ # ```
144
+ #
145
+ # Search HTML and XML pages:
146
+ #
147
+ # ```ruby
148
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
149
+ # spider.every_page do |page|
150
+ # puts ">>> #{page.url}"
151
+ #
152
+ # page.search('//meta').each do |meta|
153
+ # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
154
+ # value = meta.attributes['content']
155
+ #
156
+ # puts " #{name} = #{value}"
157
+ # end
158
+ # end
159
+ # end
160
+ # ```
161
+ #
162
+ # Print out the titles from every page:
163
+ #
164
+ # ```ruby
165
+ # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
166
+ # spider.every_html_page do |page|
167
+ # puts page.title
168
+ # end
169
+ # end
170
+ # ```
171
+ #
172
+ # Print out every HTTP redirect:
173
+ #
174
+ # ```ruby
175
+ # Ronin::Web::Spider.host('company.com') do |spider|
176
+ # spider.every_redirect_page do |page|
177
+ # puts "#{page.url} -> #{page.headers['Location']}"
178
+ # end
179
+ # end
180
+ # ```
181
+ #
182
+ # Find what kinds of web servers a host is using, by accessing the headers:
183
+ #
184
+ # ```ruby
185
+ # servers = Set[]
186
+ #
187
+ # Ronin::Web::Spider.host('company.com') do |spider|
188
+ # spider.all_headers do |headers|
189
+ # servers << headers['server']
190
+ # end
191
+ # end
192
+ # ```
193
+ #
194
+ # Pause the spider on a forbidden page:
195
+ #
196
+ # ```ruby
197
+ # Ronin::Web::Spider.host('company.com') do |spider|
198
+ # spider.every_forbidden_page do |page|
199
+ # spider.pause!
200
+ # end
201
+ # end
202
+ # ```
203
+ #
204
+ # Skip the processing of a page:
205
+ #
206
+ # ```ruby
207
+ # Ronin::Web::Spider.host('company.com') do |spider|
208
+ # spider.every_missing_page do |page|
209
+ # spider.skip_page!
210
+ # end
211
+ # end
212
+ # ```
213
+ #
214
+ # Skip the processing of links:
215
+ #
216
+ # ```ruby
217
+ # Ronin::Web::Spider.host('company.com') do |spider|
218
+ # spider.every_url do |url|
219
+ # if url.path.split('/').find { |dir| dir.to_i > 1000 }
220
+ # spider.skip_link!
221
+ # end
222
+ # end
223
+ # end
224
+ # ```
225
+ #
226
+ # Detect when a new host name is spidered:
227
+ #
228
+ # ```ruby
229
+ # Ronin::Web::Spider.domain('example.com') do |spider|
230
+ # spider.every_host do |host|
231
+ # puts "Spidering #{host} ..."
232
+ # end
233
+ # end
234
+ # ```
235
+ #
236
+ # Detect when a new SSL/TLS certificate is encountered:
237
+ #
238
+ # ```ruby
239
+ # Ronin::Web::Spider.domain('example.com') do |spider|
240
+ # spider.every_cert do |cert|
241
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
242
+ # end
243
+ # end
244
+ # ```
245
+ #
246
+ # Print the MD5 checksum of every `favicon.ico` file:
247
+ #
248
+ # ```ruby
249
+ # Ronin::Web::Spider.domain('example.com') do |spider|
250
+ # spider.every_favicon do |page|
251
+ # puts "#{page.url}: #{page.body.md5}"
252
+ # end
253
+ # end
254
+ # ```
255
+ #
256
+ # Print every HTML comment:
257
+ #
258
+ # ```ruby
259
+ # Ronin::Web::Spider.domain('example.com') do |spider|
260
+ # spider.every_html_comment do |comment|
261
+ # puts comment
262
+ # end
263
+ # end
264
+ # ```
265
+ #
266
+ # Print all JavaScript source code:
267
+ #
268
+ # ```ruby
269
+ # Ronin::Web::Spider.domain('example.com') do |spider|
270
+ # spider.every_javascript do |js|
271
+ # puts js
272
+ # end
273
+ # end
274
+ # ```
275
+ #
276
+ # Print every JavaScript string literal:
277
+ #
278
+ # ```ruby
279
+ # Ronin::Web::Spider.domain('example.com') do |spider|
280
+ # spider.every_javascript_string do |str|
281
+ # puts str
282
+ # end
283
+ # end
284
+ # ```
285
+ #
286
+ # Print every JavaScript comment:
287
+ #
288
+ # ```ruby
289
+ # Ronin::Web::Spider.domain('example.com') do |spider|
290
+ # spider.every_javascript_comment do |comment|
291
+ # puts comment
292
+ # end
293
+ # end
294
+ # ```
295
+ #
296
+ # Print every HTML and JavaScript comment:
297
+ #
298
+ # ```ruby
299
+ # Ronin::Web::Spider.domain('example.com') do |spider|
300
+ # spider.every_comment do |comment|
301
+ # puts comment
302
+ # end
303
+ # end
304
+ # ```
305
+ #
25
306
  module Spider
26
307
  #
27
308
  # Creates a new agent and begin spidering at the given URL.
@@ -41,6 +322,8 @@ module Ronin
41
322
  #
42
323
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
43
324
  #
325
+ # @api public
326
+ #
44
327
  def self.start_at(url,**kwargs,&block)
45
328
  Agent.start_at(url,**kwargs,&block)
46
329
  end
@@ -63,6 +346,8 @@ module Ronin
63
346
  #
64
347
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
65
348
  #
349
+ # @api public
350
+ #
66
351
  def self.host(name,**kwargs,&block)
67
352
  Agent.host(name,**kwargs,&block)
68
353
  end
@@ -85,6 +370,8 @@ module Ronin
85
370
  #
86
371
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
87
372
  #
373
+ # @api public
374
+ #
88
375
  def self.site(url,**kwargs,&block)
89
376
  Agent.site(url,**kwargs,&block)
90
377
  end
@@ -107,6 +394,8 @@ module Ronin
107
394
  #
108
395
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
109
396
  #
397
+ # @api public
398
+ #
110
399
  def self.domain(name,**kwargs,&block)
111
400
  Agent.domain(name,**kwargs,&block)
112
401
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'yaml'
4
4
 
@@ -22,18 +22,19 @@ Gem::Specification.new do |gem|
22
22
  gem.homepage = gemspec['homepage']
23
23
  gem.metadata = gemspec['metadata'] if gemspec['metadata']
24
24
 
25
- glob = lambda { |patterns| gem.files & Dir[*patterns] }
25
+ glob = ->(patterns) { gem.files & Dir[*patterns] }
26
26
 
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
29
29
  gem.files += Array(gemspec['generated_files'])
30
+ # exclude test files from the packaged gem
31
+ gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
30
32
 
31
33
  gem.executables = gemspec.fetch('executables') do
32
34
  glob['bin/*'].map { |path| File.basename(path) }
33
35
  end
34
36
 
35
37
  gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
36
- gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
37
38
  gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
38
39
 
39
40
  gem.require_paths = Array(gemspec.fetch('require_paths') {
@@ -45,7 +46,7 @@ Gem::Specification.new do |gem|
45
46
  gem.required_rubygems_version = gemspec['required_rubygems_version']
46
47
  gem.post_install_message = gemspec['post_install_message']
47
48
 
48
- split = lambda { |string| string.split(/,\s*/) }
49
+ split = ->(string) { string.split(/,\s*/) }
49
50
 
50
51
  if gemspec['dependencies']
51
52
  gemspec['dependencies'].each do |name,versions|