ronin-web-spider 0.1.0.beta2 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -122,6 +123,8 @@ module Ronin
122
123
  # The visited host names.
123
124
  #
124
125
  # @return [Set<String>, nil]
126
+ #
127
+ # @api public
125
128
  attr_reader :visited_hosts
126
129
 
127
130
  #
@@ -132,6 +135,13 @@ module Ronin
132
135
  #
133
136
  # @yieldparam [String] host
134
137
  #
138
+ # @example
139
+ # spider.every_host do |host|
140
+ # puts "Spidering #{host} ..."
141
+ # end
142
+ #
143
+ # @api public
144
+ #
135
145
  def every_host
136
146
  @visited_hosts ||= Set.new
137
147
 
@@ -147,6 +157,8 @@ module Ronin
147
157
  # All certificates encountered while spidering.
148
158
  #
149
159
  # @return [Array<Ronin::Support::Crypto::Cert>]
160
+ #
161
+ # @api public
150
162
  attr_reader :collected_certs
151
163
 
152
164
  #
@@ -157,6 +169,13 @@ module Ronin
157
169
  #
158
170
  # @yieldparam [Ronin::Support::Crypto::Cert]
159
171
  #
172
+ # @example
173
+ # spider.every_cert do |cert|
174
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
175
+ # end
176
+ #
177
+ # @api public
178
+ #
160
179
  def every_cert
161
180
  @collected_certs ||= []
162
181
 
@@ -185,8 +204,15 @@ module Ronin
185
204
  # @yieldparam [Spidr::Page] favicon
186
205
  # An encountered `.ico` file.
187
206
  #
207
+ # @example
208
+ # spider.every_favicon do |page|
209
+ # # ...
210
+ # end
211
+ #
188
212
  # @see https://rubydoc.info/gems/spidr/Spidr/Page
189
213
  #
214
+ # @api public
215
+ #
190
216
  def every_favicon
191
217
  every_page do |page|
192
218
  yield page if page.icon?
@@ -197,14 +223,23 @@ module Ronin
197
223
  # Passes every non-empty HTML comment to the given block.
198
224
  #
199
225
  # @yield [comment]
200
- # The given block will be passevery HTML comment.
226
+ # The given block will be passed every HTML comment.
201
227
  #
202
228
  # @yieldparam [String] comment
203
229
  # The HTML comment inner text, with leading and trailing whitespace
204
230
  # stripped.
205
231
  #
232
+ # @example
233
+ # spider.every_html_comment do |comment|
234
+ # puts comment
235
+ # end
236
+ #
237
+ # @api public
238
+ #
206
239
  def every_html_comment
207
240
  every_html_page do |page|
241
+ next unless page.doc
242
+
208
243
  page.doc.xpath('//comment()').each do |comment|
209
244
  comment_text = comment.inner_text.strip
210
245
 
@@ -224,24 +259,71 @@ module Ronin
224
259
  # @yieldparam [String] js
225
260
  # The JavaScript source code.
226
261
  #
262
+ # @example
263
+ # spider.every_javascript do |js|
264
+ # puts js
265
+ # end
266
+ #
267
+ # @api public
268
+ #
227
269
  def every_javascript
228
270
  # yield inner text of every `<script type="text/javascript">` tag
229
271
  # and every `.js` URL.
230
272
  every_html_page do |page|
273
+ next unless page.doc
274
+
231
275
  page.doc.xpath('//script[@type="text/javascript"]').each do |script|
232
- unless script.inner_text.empty?
233
- yield script.inner_text
276
+ source = script.inner_text
277
+ source.force_encoding(Encoding::UTF_8)
278
+
279
+ unless source.empty?
280
+ yield source
234
281
  end
235
282
  end
236
283
  end
237
284
 
238
285
  every_javascript_page do |page|
239
- yield page.body
286
+ source = page.body
287
+ source.force_encoding(Encoding::UTF_8)
288
+
289
+ yield source
240
290
  end
241
291
  end
242
292
 
243
293
  alias every_js every_javascript
244
294
 
295
+ # Regex to match and skip JavaScript inline regexes.
296
+ #
297
+ # @api private
298
+ #
299
+ # @since 0.1.1
300
+ JAVASCRIPT_INLINE_REGEX = %r{
301
+ (?# match before the regex to avoid matching division operators )
302
+ (?:[\{\[\(;:,]\s*|=\s*)
303
+ /
304
+ (?# inline regex contents )
305
+ (?:
306
+ \[ (?:\\. | [^\]]) \] (?# [...] ) |
307
+ \\. (?# backslash escaped characters ) |
308
+ [^/] (?# everything else )
309
+ )+
310
+ /[dgimsuvy]* (?# also match any regex flags )
311
+ }mx
312
+
313
+ # Regex to match and skip JavaScript template literals.
314
+ #
315
+ # @note
316
+ # This regex will not properly match nested template literals:
317
+ #
318
+ # ```javascript
319
+ # `foo ${`bar ${1+1}`}`
320
+ # ```
321
+ #
322
+ # @api private
323
+ #
324
+ # @since 0.1.1
325
+ JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
326
+
245
327
  #
246
328
  # Passes every JavaScript string value to the given block.
247
329
  #
@@ -252,10 +334,30 @@ module Ronin
252
334
  # @yieldparam [String] string
253
335
  # The parsed contents of a JavaScript string.
254
336
  #
337
+ # @example
338
+ # spider.every_javascript_string do |str|
339
+ # puts str
340
+ # end
341
+ #
342
+ # @api public
343
+ #
255
344
  def every_javascript_string
256
345
  every_javascript do |js|
257
- js.scan(Support::Text::Patterns::STRING) do |js_string|
258
- yield Support::Encoding::JS.unquote(js_string)
346
+ scanner = StringScanner.new(js)
347
+
348
+ until scanner.eos?
349
+ # NOTE: this is a naive JavaScript string scanner and should
350
+ # eventually be replaced with a real JavaScript lexer or parser.
351
+ case scanner.peek(1)
352
+ when '"', "'" # beginning of a quoted string
353
+ js_string = scanner.scan(Support::Text::Patterns::STRING)
354
+
355
+ yield Support::Encoding::JS.unquote(js_string)
356
+ else
357
+ scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
359
+ scanner.getch
360
+ end
259
361
  end
260
362
  end
261
363
  end
@@ -271,6 +373,13 @@ module Ronin
271
373
  # @yieldparam [String] comment
272
374
  # The contents of a JavaScript comment.
273
375
  #
376
+ # @example
377
+ # spider.every_javascript_comment do |comment|
378
+ # puts comment
379
+ # end
380
+ #
381
+ # @api public
382
+ #
274
383
  def every_javascript_comment(&block)
275
384
  every_javascript do |js|
276
385
  js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
@@ -288,9 +397,16 @@ module Ronin
288
397
  # @yieldparam [String] comment
289
398
  # The contents of a HTML or JavaScript comment.
290
399
  #
400
+ # @example
401
+ # spider.every_comment do |comment|
402
+ # puts comment
403
+ # end
404
+ #
291
405
  # @see #every_html_comment
292
406
  # @see #every_javascript_comment
293
407
  #
408
+ # @api public
409
+ #
294
410
  def every_comment(&block)
295
411
  every_html_comment(&block)
296
412
  every_javascript_comment(&block)
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -29,6 +30,9 @@ module Ronin
29
30
  #
30
31
  # Spider a host and archive every web page:
31
32
  #
33
+ # require 'ronin/web/spider'
34
+ # require 'ronin/web/spider/archive'
35
+ #
32
36
  # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
33
37
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
34
38
  # archive.write(page.url,page.body)
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -30,10 +31,10 @@ module Ronin
30
31
  #
31
32
  # Spider a host and archive every web page to a Git repository:
32
33
  #
33
- # require 'ronin/web/spider/git_archive'
34
34
  # require 'ronin/web/spider'
35
+ # require 'ronin/web/spider/git_archive'
35
36
  # require 'date'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
39
  # archive.commit("Updated #{Date.today}") do
39
40
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,7 @@ module Ronin
21
22
  module Web
22
23
  module Spider
23
24
  # ronin-web-spider version
24
- VERSION = '0.1.0.beta2'
25
+ VERSION = '0.1.1'
25
26
  end
26
27
  end
27
28
  end
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +23,286 @@ require 'ronin/web/spider/version'
22
23
 
23
24
  module Ronin
24
25
  module Web
26
+ #
27
+ # A collection of common web spidering routines using the [spidr] gem.
28
+ #
29
+ # [spidr]: https://github.com/postmodern/spidr#readme
30
+ #
31
+ # ## Examples
32
+ #
33
+ # Start spidering from a URL:
34
+ #
35
+ # ```ruby
36
+ # require 'ronin/web/spider'
37
+ #
38
+ # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
39
+ # # ...
40
+ # end
41
+ # ```
42
+ #
43
+ # Spider a host:
44
+ #
45
+ # ```ruby
46
+ # Ronin::Web::Spider.host('solnic.eu') do |agent|
47
+ # # ...
48
+ # end
49
+ # ```
50
+ #
51
+ # Spider a domain (and any sub-domains):
52
+ #
53
+ # ```ruby
54
+ # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
55
+ # # ...
56
+ # end
57
+ # ```
58
+ #
59
+ # Spider a site:
60
+ #
61
+ # ```ruby
62
+ # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
63
+ # # ...
64
+ # end
65
+ # ```
66
+ #
67
+ # Spider multiple hosts:
68
+ #
69
+ # ```ruby
70
+ # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
71
+ # # ...
72
+ # end
73
+ # ```
74
+ #
75
+ # Do not spider certain links:
76
+ #
77
+ # ```ruby
78
+ # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
79
+ # # ...
80
+ # end
81
+ # ```
82
+ #
83
+ # Do not spider links on certain ports:
84
+ #
85
+ # ```ruby
86
+ # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
87
+ # # ...
88
+ # end
89
+ # ```
90
+ #
91
+ # Do not spider links blacklisted in robots.txt:
92
+ #
93
+ # ```ruby
94
+ # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
95
+ # # ...
96
+ # end
97
+ # ```
98
+ #
99
+ # Print out visited URLs:
100
+ #
101
+ # ```ruby
102
+ # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
103
+ # spider.every_url { |url| puts url }
104
+ # end
105
+ # ```
106
+ #
107
+ # Build a URL map of a site:
108
+ #
109
+ # ```ruby
110
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
111
+ #
112
+ # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
113
+ # spider.every_link do |origin,dest|
114
+ # url_map[dest] << origin
115
+ # end
116
+ # end
117
+ # ```
118
+ #
119
+ # Print out the URLs that could not be requested:
120
+ #
121
+ # ```ruby
122
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
123
+ # spider.every_failed_url { |url| puts url }
124
+ # end
125
+ # ```
126
+ #
127
+ # Find all pages which have broken links:
128
+ #
129
+ # ```ruby
130
+ # url_map = Hash.new { |hash,key| hash[key] = [] }
131
+ #
132
+ # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
133
+ # spider.every_link do |origin,dest|
134
+ # url_map[dest] << origin
135
+ # end
136
+ # end
137
+ #
138
+ # spider.failures.each do |url|
139
+ # puts "Broken link #{url} found in:"
140
+ #
141
+ # url_map[url].each { |page| puts " #{page}" }
142
+ # end
143
+ # ```
144
+ #
145
+ # Search HTML and XML pages:
146
+ #
147
+ # ```ruby
148
+ # Ronin::Web::Spider.site('http://company.com/') do |spider|
149
+ # spider.every_page do |page|
150
+ # puts ">>> #{page.url}"
151
+ #
152
+ # page.search('//meta').each do |meta|
153
+ # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
154
+ # value = meta.attributes['content']
155
+ #
156
+ # puts " #{name} = #{value}"
157
+ # end
158
+ # end
159
+ # end
160
+ # ```
161
+ #
162
+ # Print out the titles from every page:
163
+ #
164
+ # ```ruby
165
+ # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
166
+ # spider.every_html_page do |page|
167
+ # puts page.title
168
+ # end
169
+ # end
170
+ # ```
171
+ #
172
+ # Print out every HTTP redirect:
173
+ #
174
+ # ```ruby
175
+ # Ronin::Web::Spider.host('company.com') do |spider|
176
+ # spider.every_redirect_page do |page|
177
+ # puts "#{page.url} -> #{page.headers['Location']}"
178
+ # end
179
+ # end
180
+ # ```
181
+ #
182
+ # Find what kinds of web servers a host is using, by accessing the headers:
183
+ #
184
+ # ```ruby
185
+ # servers = Set[]
186
+ #
187
+ # Ronin::Web::Spider.host('company.com') do |spider|
188
+ # spider.all_headers do |headers|
189
+ # servers << headers['server']
190
+ # end
191
+ # end
192
+ # ```
193
+ #
194
+ # Pause the spider on a forbidden page:
195
+ #
196
+ # ```ruby
197
+ # Ronin::Web::Spider.host('company.com') do |spider|
198
+ # spider.every_forbidden_page do |page|
199
+ # spider.pause!
200
+ # end
201
+ # end
202
+ # ```
203
+ #
204
+ # Skip the processing of a page:
205
+ #
206
+ # ```ruby
207
+ # Ronin::Web::Spider.host('company.com') do |spider|
208
+ # spider.every_missing_page do |page|
209
+ # spider.skip_page!
210
+ # end
211
+ # end
212
+ # ```
213
+ #
214
+ # Skip the processing of links:
215
+ #
216
+ # ```ruby
217
+ # Ronin::Web::Spider.host('company.com') do |spider|
218
+ # spider.every_url do |url|
219
+ # if url.path.split('/').find { |dir| dir.to_i > 1000 }
220
+ # spider.skip_link!
221
+ # end
222
+ # end
223
+ # end
224
+ # ```
225
+ #
226
+ # Detect when a new host name is spidered:
227
+ #
228
+ # ```ruby
229
+ # Ronin::Web::Spider.domain('example.com') do |spider|
230
+ # spider.every_host do |host|
231
+ # puts "Spidering #{host} ..."
232
+ # end
233
+ # end
234
+ # ```
235
+ #
236
+ # Detect when a new SSL/TLS certificate is encountered:
237
+ #
238
+ # ```ruby
239
+ # Ronin::Web::Spider.domain('example.com') do |spider|
240
+ # spider.every_cert do |cert|
241
+ # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
242
+ # end
243
+ # end
244
+ # ```
245
+ #
246
+ # Print the MD5 checksum of every `favicon.ico` file:
247
+ #
248
+ # ```ruby
249
+ # Ronin::Web::Spider.domain('example.com') do |spider|
250
+ # spider.every_favicon do |page|
251
+ # puts "#{page.url}: #{page.body.md5}"
252
+ # end
253
+ # end
254
+ # ```
255
+ #
256
+ # Print every HTML comment:
257
+ #
258
+ # ```ruby
259
+ # Ronin::Web::Spider.domain('example.com') do |spider|
260
+ # spider.every_html_comment do |comment|
261
+ # puts comment
262
+ # end
263
+ # end
264
+ # ```
265
+ #
266
+ # Print all JavaScript source code:
267
+ #
268
+ # ```ruby
269
+ # Ronin::Web::Spider.domain('example.com') do |spider|
270
+ # spider.every_javascript do |js|
271
+ # puts js
272
+ # end
273
+ # end
274
+ # ```
275
+ #
276
+ # Print every JavaScript string literal:
277
+ #
278
+ # ```ruby
279
+ # Ronin::Web::Spider.domain('example.com') do |spider|
280
+ # spider.every_javascript_string do |str|
281
+ # puts str
282
+ # end
283
+ # end
284
+ # ```
285
+ #
286
+ # Print every JavaScript comment:
287
+ #
288
+ # ```ruby
289
+ # Ronin::Web::Spider.domain('example.com') do |spider|
290
+ # spider.every_javascript_comment do |comment|
291
+ # puts comment
292
+ # end
293
+ # end
294
+ # ```
295
+ #
296
+ # Print every HTML and JavaScript comment:
297
+ #
298
+ # ```ruby
299
+ # Ronin::Web::Spider.domain('example.com') do |spider|
300
+ # spider.every_comment do |comment|
301
+ # puts comment
302
+ # end
303
+ # end
304
+ # ```
305
+ #
25
306
  module Spider
26
307
  #
27
308
  # Creates a new agent and begin spidering at the given URL.
@@ -41,6 +322,8 @@ module Ronin
41
322
  #
42
323
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
43
324
  #
325
+ # @api public
326
+ #
44
327
  def self.start_at(url,**kwargs,&block)
45
328
  Agent.start_at(url,**kwargs,&block)
46
329
  end
@@ -63,6 +346,8 @@ module Ronin
63
346
  #
64
347
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
65
348
  #
349
+ # @api public
350
+ #
66
351
  def self.host(name,**kwargs,&block)
67
352
  Agent.host(name,**kwargs,&block)
68
353
  end
@@ -85,6 +370,8 @@ module Ronin
85
370
  #
86
371
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
87
372
  #
373
+ # @api public
374
+ #
88
375
  def self.site(url,**kwargs,&block)
89
376
  Agent.site(url,**kwargs,&block)
90
377
  end
@@ -107,6 +394,8 @@ module Ronin
107
394
  #
108
395
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
109
396
  #
397
+ # @api public
398
+ #
110
399
  def self.domain(name,**kwargs,&block)
111
400
  Agent.domain(name,**kwargs,&block)
112
401
  end
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'yaml'
4
4
 
@@ -22,18 +22,19 @@ Gem::Specification.new do |gem|
22
22
  gem.homepage = gemspec['homepage']
23
23
  gem.metadata = gemspec['metadata'] if gemspec['metadata']
24
24
 
25
- glob = lambda { |patterns| gem.files & Dir[*patterns] }
25
+ glob = ->(patterns) { gem.files & Dir[*patterns] }
26
26
 
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
29
29
  gem.files += Array(gemspec['generated_files'])
30
+ # exclude test files from the packaged gem
31
+ gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
30
32
 
31
33
  gem.executables = gemspec.fetch('executables') do
32
34
  glob['bin/*'].map { |path| File.basename(path) }
33
35
  end
34
36
 
35
37
  gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
36
- gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
37
38
  gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
38
39
 
39
40
  gem.require_paths = Array(gemspec.fetch('require_paths') {
@@ -45,7 +46,7 @@ Gem::Specification.new do |gem|
45
46
  gem.required_rubygems_version = gemspec['required_rubygems_version']
46
47
  gem.post_install_message = gemspec['post_install_message']
47
48
 
48
- split = lambda { |string| string.split(/,\s*/) }
49
+ split = ->(string) { string.split(/,\s*/) }
49
50
 
50
51
  if gemspec['dependencies']
51
52
  gemspec['dependencies'].each do |name,versions|