ronin-web-spider 0.1.0 → 0.2.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +16 -1
- data/.rubocop.yml +11 -0
- data/ChangeLog.md +35 -1
- data/Gemfile +3 -0
- data/README.md +12 -3
- data/Rakefile +2 -2
- data/gemspec.yml +1 -1
- data/lib/ronin/web/spider/agent.rb +311 -15
- data/lib/ronin/web/spider/archive.rb +2 -1
- data/lib/ronin/web/spider/exceptions.rb +2 -1
- data/lib/ronin/web/spider/git_archive.rb +2 -1
- data/lib/ronin/web/spider/version.rb +3 -2
- data/lib/ronin/web/spider.rb +64 -63
- data/ronin-web-spider.gemspec +3 -3
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
|
4
|
+
data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
|
7
|
+
data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
|
data/.github/workflows/ruby.yml
CHANGED
@@ -12,11 +12,12 @@ jobs:
|
|
12
12
|
- '3.0'
|
13
13
|
- '3.1'
|
14
14
|
- '3.2'
|
15
|
+
- '3.3'
|
15
16
|
- jruby
|
16
17
|
- truffleruby
|
17
18
|
name: Ruby ${{ matrix.ruby }}
|
18
19
|
steps:
|
19
|
-
- uses: actions/checkout@
|
20
|
+
- uses: actions/checkout@v4
|
20
21
|
- name: Set up Ruby
|
21
22
|
uses: ruby/setup-ruby@v1
|
22
23
|
with:
|
@@ -26,3 +27,17 @@ jobs:
|
|
26
27
|
run: bundle install --jobs 4 --retry 3
|
27
28
|
- name: Run tests
|
28
29
|
run: bundle exec rake test
|
30
|
+
|
31
|
+
# rubocop linting
|
32
|
+
rubocop:
|
33
|
+
runs-on: ubuntu-latest
|
34
|
+
steps:
|
35
|
+
- uses: actions/checkout@v4
|
36
|
+
- name: Set up Ruby
|
37
|
+
uses: ruby/setup-ruby@v1
|
38
|
+
with:
|
39
|
+
ruby-version: 3.0
|
40
|
+
- name: Install dependencies
|
41
|
+
run: bundle install --jobs 4 --retry 3
|
42
|
+
- name: Run rubocop
|
43
|
+
run: bundle exec rubocop --parallel
|
data/.rubocop.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,4 +1,37 @@
|
|
1
|
-
### 0.
|
1
|
+
### 0.2.0 / 2024-XX-XX
|
2
|
+
|
3
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
|
4
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
|
5
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
|
6
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
|
7
|
+
* Allow {Ronin::Web::Spider::Agent#every_html_comment},
|
8
|
+
{Ronin::Web::Spider::Agent#every_javascript every_javascript},
|
9
|
+
{Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
|
10
|
+
{Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
|
11
|
+
{Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
|
12
|
+
{Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
|
13
|
+
{Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
|
14
|
+
to also yield a `Spidr::Page` block argument for additional context.
|
15
|
+
|
16
|
+
### 0.1.1 / 2024-06-19
|
17
|
+
|
18
|
+
* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
|
19
|
+
{Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
|
20
|
+
header included `text/html` but lacked a response body, causing `page.doc` to
|
21
|
+
be `nil`.
|
22
|
+
* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
|
23
|
+
JavaScript source code strings containing UTF-8 characters were being
|
24
|
+
incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
|
25
|
+
did not include a `charset=` attribute.
|
26
|
+
* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
|
27
|
+
inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
|
28
|
+
would incorrectly be treated as the beginning or end of JavaScript string
|
29
|
+
literals. Note that while this greatly improves the accuracy of
|
30
|
+
{Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
|
31
|
+
support parsing JavaScript template literals that may also contain string
|
32
|
+
literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
|
33
|
+
|
34
|
+
### 0.1.0 / 2023-02-01
|
2
35
|
|
3
36
|
* Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
|
4
37
|
* Relicensed as LGPL-3.0.
|
@@ -20,3 +53,4 @@
|
|
20
53
|
* `every_comment` - yields every HTML or JavaScript comment.
|
21
54
|
* Supports archiving spidered pages to a directory or git repository.
|
22
55
|
|
56
|
+
[spidr]: https://github.com/postmodern/spidr#readme
|
data/Gemfile
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
source 'https://rubygems.org'
|
2
3
|
|
3
4
|
gemspec
|
@@ -28,4 +29,6 @@ group :development do
|
|
28
29
|
gem 'dead_end', require: false
|
29
30
|
gem 'sord', require: false, platform: :mri
|
30
31
|
gem 'stackprof', require: false, platform: :mri
|
32
|
+
gem 'rubocop', require: false, platform: :mri
|
33
|
+
gem 'rubocop-ronin', require: false, platform: :mri
|
31
34
|
end
|
data/README.md
CHANGED
@@ -9,7 +9,6 @@
|
|
9
9
|
* [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
|
10
10
|
* [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
|
11
11
|
* [Discord](https://discord.gg/6WAb3PsVX9) |
|
12
|
-
[Twitter](https://twitter.com/ronin_rb) |
|
13
12
|
[Mastodon](https://infosec.exchange/@ronin_rb)
|
14
13
|
|
15
14
|
## Description
|
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
|
|
38
37
|
* [every_comment][docs-every_comment] - yields every HTML or JavaScript
|
39
38
|
comment.
|
40
39
|
* Supports archiving spidered pages to a directory or git repository.
|
41
|
-
* Has
|
40
|
+
* Has 97% documentation coverage.
|
42
41
|
* Has 94% test coverage.
|
43
42
|
|
44
43
|
[docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
|
@@ -305,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
|
|
305
304
|
end
|
306
305
|
```
|
307
306
|
|
307
|
+
Print every JavaScript URL string literal:
|
308
|
+
|
309
|
+
```ruby
|
310
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
311
|
+
spider.every_javascript_url_string do |url|
|
312
|
+
puts url
|
313
|
+
end
|
314
|
+
end
|
315
|
+
```
|
316
|
+
|
308
317
|
Print every JavaScript comment:
|
309
318
|
|
310
319
|
```ruby
|
@@ -391,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
|
|
391
400
|
|
392
401
|
## License
|
393
402
|
|
394
|
-
Copyright (c) 2006-
|
403
|
+
Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
395
404
|
|
396
405
|
ronin-web-spider is free software: you can redistribute it and/or modify
|
397
406
|
it under the terms of the GNU Lesser General Public License as published
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +23,7 @@ require 'spidr/agent'
|
|
22
23
|
require 'ronin/support/network/http'
|
23
24
|
require 'ronin/support/crypto/cert'
|
24
25
|
require 'ronin/support/text/patterns/source_code'
|
26
|
+
require 'ronin/support/text/patterns/network'
|
25
27
|
require 'ronin/support/encoding/js'
|
26
28
|
|
27
29
|
module Ronin
|
@@ -224,10 +226,17 @@ module Ronin
|
|
224
226
|
# @yield [comment]
|
225
227
|
# The given block will be passed every HTML comment.
|
226
228
|
#
|
229
|
+
# @yield [comment, page]
|
230
|
+
# If the block accepts two arguments, the HTML comment and the page
|
231
|
+
# that the comment was found on will be passed to the given block.
|
232
|
+
#
|
227
233
|
# @yieldparam [String] comment
|
228
234
|
# The HTML comment inner text, with leading and trailing whitespace
|
229
235
|
# stripped.
|
230
236
|
#
|
237
|
+
# @yieldparam [Spidr::Page] page
|
238
|
+
# The page that the HTML comment exists on.
|
239
|
+
#
|
231
240
|
# @example
|
232
241
|
# spider.every_html_comment do |comment|
|
233
242
|
# puts comment
|
@@ -235,13 +244,19 @@ module Ronin
|
|
235
244
|
#
|
236
245
|
# @api public
|
237
246
|
#
|
238
|
-
def every_html_comment
|
247
|
+
def every_html_comment(&block)
|
239
248
|
every_html_page do |page|
|
249
|
+
next unless page.doc
|
250
|
+
|
240
251
|
page.doc.xpath('//comment()').each do |comment|
|
241
252
|
comment_text = comment.inner_text.strip
|
242
253
|
|
243
254
|
unless comment_text.empty?
|
244
|
-
|
255
|
+
if block.arity == 2
|
256
|
+
yield comment_text, page
|
257
|
+
else
|
258
|
+
yield comment_text
|
259
|
+
end
|
245
260
|
end
|
246
261
|
end
|
247
262
|
end
|
@@ -253,9 +268,17 @@ module Ronin
|
|
253
268
|
# @yield [js]
|
254
269
|
# The given block will be passed every piece of JavaScript source.
|
255
270
|
#
|
271
|
+
# @yield [js, page]
|
272
|
+
# If the block accepts two arguments, the JavaScript source and the
|
273
|
+
# page that the JavaScript source was found on will be passed to the
|
274
|
+
# given block.
|
275
|
+
#
|
256
276
|
# @yieldparam [String] js
|
257
277
|
# The JavaScript source code.
|
258
278
|
#
|
279
|
+
# @yieldparam [Spidr::Page] page
|
280
|
+
# The page that the JavaScript source was found in or on.
|
281
|
+
#
|
259
282
|
# @example
|
260
283
|
# spider.every_javascript do |js|
|
261
284
|
# puts js
|
@@ -263,24 +286,72 @@ module Ronin
|
|
263
286
|
#
|
264
287
|
# @api public
|
265
288
|
#
|
266
|
-
def every_javascript
|
289
|
+
def every_javascript(&block)
|
267
290
|
# yield inner text of every `<script type="text/javascript">` tag
|
268
291
|
# and every `.js` URL.
|
269
292
|
every_html_page do |page|
|
293
|
+
next unless page.doc
|
294
|
+
|
270
295
|
page.doc.xpath('//script[@type="text/javascript"]').each do |script|
|
271
|
-
|
272
|
-
|
296
|
+
source = script.inner_text
|
297
|
+
source.force_encoding(Encoding::UTF_8)
|
298
|
+
|
299
|
+
unless source.empty?
|
300
|
+
if block.arity == 2
|
301
|
+
yield source, page
|
302
|
+
else
|
303
|
+
yield source
|
304
|
+
end
|
273
305
|
end
|
274
306
|
end
|
275
307
|
end
|
276
308
|
|
277
309
|
every_javascript_page do |page|
|
278
|
-
|
310
|
+
source = page.body
|
311
|
+
source.force_encoding(Encoding::UTF_8)
|
312
|
+
|
313
|
+
if block.arity == 2
|
314
|
+
yield source, page
|
315
|
+
else
|
316
|
+
yield source
|
317
|
+
end
|
279
318
|
end
|
280
319
|
end
|
281
320
|
|
282
321
|
alias every_js every_javascript
|
283
322
|
|
323
|
+
# Regex to match and skip JavaScript inline regexes.
|
324
|
+
#
|
325
|
+
# @api private
|
326
|
+
#
|
327
|
+
# @since 0.1.1
|
328
|
+
JAVASCRIPT_INLINE_REGEX = %r{
|
329
|
+
(?# match before the regex to avoid matching division operators )
|
330
|
+
(?:[\{\[\(;:,]\s*|=\s*)
|
331
|
+
/
|
332
|
+
(?# inline regex contents )
|
333
|
+
(?:
|
334
|
+
\[ (?:\\. | [^\]]) \] (?# [...] ) |
|
335
|
+
\\. (?# backslash escaped characters ) |
|
336
|
+
[^/] (?# everything else )
|
337
|
+
)+
|
338
|
+
/[dgimsuvy]* (?# also match any regex flags )
|
339
|
+
}mx
|
340
|
+
|
341
|
+
# Regex to match and skip JavaScript template literals.
|
342
|
+
#
|
343
|
+
# @note
|
344
|
+
# This regex will not properly match nested template literals:
|
345
|
+
#
|
346
|
+
# ```javascript
|
347
|
+
# `foo ${`bar ${1+1}`}`
|
348
|
+
# ```
|
349
|
+
#
|
350
|
+
# @api private
|
351
|
+
#
|
352
|
+
# @since 0.1.1
|
353
|
+
JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
|
354
|
+
|
284
355
|
#
|
285
356
|
# Passes every JavaScript string value to the given block.
|
286
357
|
#
|
@@ -288,35 +359,246 @@ module Ronin
|
|
288
359
|
# The given block will be passed each JavaScript string with the quote
|
289
360
|
# marks removed.
|
290
361
|
#
|
362
|
+
# @yield [string, page]
|
363
|
+
# If the block accepts two arguments, the JavaScript string and the
|
364
|
+
# page that the JavaScript string was found on will be passed to the
|
365
|
+
# given block.
|
366
|
+
#
|
291
367
|
# @yieldparam [String] string
|
292
368
|
# The parsed contents of a JavaScript string.
|
293
369
|
#
|
370
|
+
# @yieldparam [Spidr::Page] page
|
371
|
+
# The page that the JavaScript string was found in or on.
|
372
|
+
#
|
294
373
|
# @example
|
295
374
|
# spider.every_javascript_string do |str|
|
296
|
-
#
|
297
|
-
#
|
375
|
+
# puts str
|
376
|
+
# end
|
298
377
|
#
|
299
378
|
# @api public
|
300
379
|
#
|
301
|
-
def every_javascript_string
|
302
|
-
every_javascript do |js|
|
303
|
-
|
304
|
-
|
380
|
+
def every_javascript_string(&block)
|
381
|
+
every_javascript do |js,page|
|
382
|
+
scanner = StringScanner.new(js)
|
383
|
+
|
384
|
+
until scanner.eos?
|
385
|
+
# NOTE: this is a naive JavaScript string scanner and should
|
386
|
+
# eventually be replaced with a real JavaScript lexer or parser.
|
387
|
+
case scanner.peek(1)
|
388
|
+
when '"', "'" # beginning of a quoted string
|
389
|
+
js_string = scanner.scan(Support::Text::Patterns::STRING)
|
390
|
+
string = Support::Encoding::JS.unquote(js_string)
|
391
|
+
|
392
|
+
if block.arity == 2
|
393
|
+
yield string, page
|
394
|
+
else
|
395
|
+
yield string
|
396
|
+
end
|
397
|
+
else
|
398
|
+
scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
|
399
|
+
scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
|
400
|
+
scanner.getch
|
401
|
+
end
|
305
402
|
end
|
306
403
|
end
|
307
404
|
end
|
308
405
|
|
309
406
|
alias every_js_string every_javascript_string
|
310
407
|
|
408
|
+
# Regular expression that matches relative paths within JavaScript.
|
409
|
+
#
|
410
|
+
# @note
|
411
|
+
# This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
|
412
|
+
# but *not* `/foo`, `foo`, or `foo.`.
|
413
|
+
JAVASCRIPT_RELATIVE_PATH = %r{
|
414
|
+
\A
|
415
|
+
(?:
|
416
|
+
[^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
|
417
|
+
|
|
418
|
+
[^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
|
419
|
+
)
|
420
|
+
\z
|
421
|
+
}x
|
422
|
+
|
423
|
+
#
|
424
|
+
# Passes every JavaScript relative path string to the given block.
|
425
|
+
#
|
426
|
+
# @yield [string]
|
427
|
+
# The given block will be passed each JavaScript relative path string
|
428
|
+
# with the quote marks removed.
|
429
|
+
#
|
430
|
+
# @yield [string, page]
|
431
|
+
# If the block accepts two arguments, the JavaScript relative path
|
432
|
+
# string and the page that the JavaScript relative path string was
|
433
|
+
# found on will be passed to the given block.
|
434
|
+
#
|
435
|
+
# @yieldparam [String] string
|
436
|
+
# The parsed contents of a literal JavaScript relative path string.
|
437
|
+
#
|
438
|
+
# @yieldparam [Spidr::Page] page
|
439
|
+
# The page that the JavaScript relative path string was found in or
|
440
|
+
# on.
|
441
|
+
#
|
442
|
+
# @example
|
443
|
+
# spider.every_javascript_relative_path_string do |relative_path|
|
444
|
+
# puts relative_path
|
445
|
+
# end
|
446
|
+
#
|
447
|
+
# @api public
|
448
|
+
#
|
449
|
+
# @since 0.2.0
|
450
|
+
#
|
451
|
+
def every_javascript_relative_path_string(&block)
|
452
|
+
every_javascript_string do |string,page|
|
453
|
+
if string =~ JAVASCRIPT_RELATIVE_PATH
|
454
|
+
if block.arity == 2
|
455
|
+
yield string, page
|
456
|
+
else
|
457
|
+
yield string
|
458
|
+
end
|
459
|
+
end
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
alias every_js_relative_path_string every_javascript_relative_path_string
|
464
|
+
|
465
|
+
# Regular expression that matches absolute paths within JavaScript.
|
466
|
+
JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
|
467
|
+
|
468
|
+
#
|
469
|
+
# Passes every JavaScript absolute path string to the given block.
|
470
|
+
#
|
471
|
+
# @yield [string]
|
472
|
+
# The given block will be passed each JavaScript absolute path string
|
473
|
+
# with the quote marks removed.
|
474
|
+
#
|
475
|
+
# @yield [string, page]
|
476
|
+
# If the block accepts two arguments, the JavaScript absolute path
|
477
|
+
# string and the page that the JavaScript absolute path string was
|
478
|
+
# found on will be passed to the given block.
|
479
|
+
#
|
480
|
+
# @yieldparam [String] string
|
481
|
+
# The parsed contents of a literal JavaScript absolute path string.
|
482
|
+
#
|
483
|
+
# @yieldparam [Spidr::Page] page
|
484
|
+
# The page that the JavaScript absolute path string was found in or
|
485
|
+
# on.
|
486
|
+
#
|
487
|
+
# @example
|
488
|
+
# spider.every_javascript_absolute_path_string do |absolute_path|
|
489
|
+
# puts absolute_path
|
490
|
+
# end
|
491
|
+
#
|
492
|
+
# @api public
|
493
|
+
#
|
494
|
+
# @since 0.2.0
|
495
|
+
#
|
496
|
+
def every_javascript_absolute_path_string(&block)
|
497
|
+
every_javascript_string do |string,page|
|
498
|
+
if string =~ JAVASCRIPT_ABSOLUTE_PATH
|
499
|
+
if block.arity == 2
|
500
|
+
yield string, page
|
501
|
+
else
|
502
|
+
yield string
|
503
|
+
end
|
504
|
+
end
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
alias every_js_absolute_path_string every_javascript_absolute_path_string
|
509
|
+
|
510
|
+
#
|
511
|
+
# Passes every JavaScript path string to the given block.
|
512
|
+
#
|
513
|
+
# @yield [string]
|
514
|
+
# The given block will be passed each JavaScript path string with the
|
515
|
+
# quote marks removed.
|
516
|
+
#
|
517
|
+
# @yield [string, page]
|
518
|
+
# If the block accepts two arguments, the JavaScript path string and
|
519
|
+
# the page that the JavaScript path string was found on will be
|
520
|
+
# passed to the given block.
|
521
|
+
#
|
522
|
+
# @yieldparam [String] string
|
523
|
+
# The parsed contents of a literal JavaScript path string.
|
524
|
+
#
|
525
|
+
# @yieldparam [Spidr::Page] page
|
526
|
+
# The page that the JavaScript path string was found in or on.
|
527
|
+
#
|
528
|
+
# @example
|
529
|
+
# spider.every_javascript_path_string do |path|
|
530
|
+
# puts path
|
531
|
+
# end
|
532
|
+
#
|
533
|
+
# @api public
|
534
|
+
#
|
535
|
+
# @since 0.2.0
|
536
|
+
#
|
537
|
+
def every_javascript_path_string(&block)
|
538
|
+
every_javascript_relative_path_string(&block)
|
539
|
+
every_javascript_absolute_path_string(&block)
|
540
|
+
end
|
541
|
+
|
542
|
+
alias every_js_path_string every_javascript_path_string
|
543
|
+
|
544
|
+
#
|
545
|
+
# Passes every JavaScript URL string to the given block.
|
546
|
+
#
|
547
|
+
# @yield [string]
|
548
|
+
# The given block will be passed each JavaScript URL string with the
|
549
|
+
# quote marks removed.
|
550
|
+
#
|
551
|
+
# @yield [string, page]
|
552
|
+
# If the block accepts two arguments, the JavaScript URL string and
|
553
|
+
# the page that the JavaScript URL string was found on will be passed
|
554
|
+
# to the given block.
|
555
|
+
#
|
556
|
+
# @yieldparam [String] string
|
557
|
+
# The parsed contents of a literal JavaScript URL string.
|
558
|
+
#
|
559
|
+
# @yieldparam [Spidr::Page] page
|
560
|
+
# The page that the JavaScript URL string was found in or on.
|
561
|
+
#
|
562
|
+
# @example
|
563
|
+
# spider.every_javascript_url_string do |url|
|
564
|
+
# puts url
|
565
|
+
# end
|
566
|
+
#
|
567
|
+
# @api public
|
568
|
+
#
|
569
|
+
# @since 0.2.0
|
570
|
+
#
|
571
|
+
def every_javascript_url_string(&block)
|
572
|
+
every_javascript_string do |string,page|
|
573
|
+
if string =~ Support::Text::Patterns::URL
|
574
|
+
if block.arity == 2
|
575
|
+
yield string, page
|
576
|
+
else
|
577
|
+
yield string
|
578
|
+
end
|
579
|
+
end
|
580
|
+
end
|
581
|
+
end
|
582
|
+
|
583
|
+
alias every_js_url_string every_javascript_url_string
|
584
|
+
|
311
585
|
#
|
312
586
|
# Passes every JavaScript comment to the given block.
|
313
587
|
#
|
314
588
|
# @yield [comment]
|
315
589
|
# The given block will be passed each JavaScript comment.
|
316
590
|
#
|
591
|
+
# @yield [comment, page]
|
592
|
+
# If the block accepts two arguments, the JavaScript comment and the
|
593
|
+
# page that the JavaScript comment was found on will be passed to the
|
594
|
+
# given block.
|
595
|
+
#
|
317
596
|
# @yieldparam [String] comment
|
318
597
|
# The contents of a JavaScript comment.
|
319
598
|
#
|
599
|
+
# @yieldparam [Spidr::Page] page
|
600
|
+
# The page that the JavaScript comment was found in or on.
|
601
|
+
#
|
320
602
|
# @example
|
321
603
|
# spider.every_javascript_comment do |comment|
|
322
604
|
# puts comment
|
@@ -325,8 +607,14 @@ module Ronin
|
|
325
607
|
# @api public
|
326
608
|
#
|
327
609
|
def every_javascript_comment(&block)
|
328
|
-
every_javascript do |js|
|
329
|
-
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT
|
610
|
+
every_javascript do |js,page|
|
611
|
+
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
|
612
|
+
if block.arity == 2
|
613
|
+
yield comment, page
|
614
|
+
else
|
615
|
+
yield comment
|
616
|
+
end
|
617
|
+
end
|
330
618
|
end
|
331
619
|
end
|
332
620
|
|
@@ -338,9 +626,17 @@ module Ronin
|
|
338
626
|
# @yield [comment]
|
339
627
|
# The given block will be passed each HTML or JavaScript comment.
|
340
628
|
#
|
629
|
+
# @yield [comment, page]
|
630
|
+
# If the block accepts two arguments, the HTML or JavaScript comment
|
631
|
+
# and the page that the HTML/JavaScript comment was found on will be
|
632
|
+
# passed to the given block.
|
633
|
+
#
|
341
634
|
# @yieldparam [String] comment
|
342
635
|
# The contents of a HTML or JavaScript comment.
|
343
636
|
#
|
637
|
+
# @yieldparam [Spidr::Page] page
|
638
|
+
# The page that the HTML or JavaScript comment was found in or on.
|
639
|
+
#
|
344
640
|
# @example
|
345
641
|
# spider.every_comment do |comment|
|
346
642
|
# puts comment
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -31,7 +32,7 @@ module Ronin
|
|
31
32
|
#
|
32
33
|
# require 'ronin/web/spider'
|
33
34
|
# require 'ronin/web/spider/archive'
|
34
|
-
#
|
35
|
+
#
|
35
36
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
36
37
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
37
38
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -33,7 +34,7 @@ module Ronin
|
|
33
34
|
# require 'ronin/web/spider'
|
34
35
|
# require 'ronin/web/spider/git_archive'
|
35
36
|
# require 'date'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
38
39
|
# archive.commit("Updated #{Date.today}") do
|
39
40
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +22,7 @@ module Ronin
|
|
21
22
|
module Web
|
22
23
|
module Spider
|
23
24
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.
|
25
|
+
VERSION = '0.2.0.rc1'
|
25
26
|
end
|
26
27
|
end
|
27
28
|
end
|
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -30,136 +31,136 @@ module Ronin
|
|
30
31
|
# ## Examples
|
31
32
|
#
|
32
33
|
# Start spidering from a URL:
|
33
|
-
#
|
34
|
+
#
|
34
35
|
# ```ruby
|
35
36
|
# require 'ronin/web/spider'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
38
39
|
# # ...
|
39
40
|
# end
|
40
41
|
# ```
|
41
|
-
#
|
42
|
+
#
|
42
43
|
# Spider a host:
|
43
|
-
#
|
44
|
+
#
|
44
45
|
# ```ruby
|
45
46
|
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
46
47
|
# # ...
|
47
48
|
# end
|
48
49
|
# ```
|
49
|
-
#
|
50
|
+
#
|
50
51
|
# Spider a domain (and any sub-domains):
|
51
|
-
#
|
52
|
+
#
|
52
53
|
# ```ruby
|
53
54
|
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
54
55
|
# # ...
|
55
56
|
# end
|
56
57
|
# ```
|
57
|
-
#
|
58
|
+
#
|
58
59
|
# Spider a site:
|
59
|
-
#
|
60
|
+
#
|
60
61
|
# ```ruby
|
61
62
|
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
62
63
|
# # ...
|
63
64
|
# end
|
64
65
|
# ```
|
65
|
-
#
|
66
|
+
#
|
66
67
|
# Spider multiple hosts:
|
67
|
-
#
|
68
|
+
#
|
68
69
|
# ```ruby
|
69
70
|
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
70
71
|
# # ...
|
71
72
|
# end
|
72
73
|
# ```
|
73
|
-
#
|
74
|
+
#
|
74
75
|
# Do not spider certain links:
|
75
|
-
#
|
76
|
+
#
|
76
77
|
# ```ruby
|
77
78
|
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
78
79
|
# # ...
|
79
80
|
# end
|
80
81
|
# ```
|
81
|
-
#
|
82
|
+
#
|
82
83
|
# Do not spider links on certain ports:
|
83
|
-
#
|
84
|
+
#
|
84
85
|
# ```ruby
|
85
86
|
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
86
87
|
# # ...
|
87
88
|
# end
|
88
89
|
# ```
|
89
|
-
#
|
90
|
+
#
|
90
91
|
# Do not spider links blacklisted in robots.txt:
|
91
|
-
#
|
92
|
+
#
|
92
93
|
# ```ruby
|
93
94
|
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
94
95
|
# # ...
|
95
96
|
# end
|
96
97
|
# ```
|
97
|
-
#
|
98
|
+
#
|
98
99
|
# Print out visited URLs:
|
99
|
-
#
|
100
|
+
#
|
100
101
|
# ```ruby
|
101
102
|
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
102
103
|
# spider.every_url { |url| puts url }
|
103
104
|
# end
|
104
105
|
# ```
|
105
|
-
#
|
106
|
+
#
|
106
107
|
# Build a URL map of a site:
|
107
|
-
#
|
108
|
+
#
|
108
109
|
# ```ruby
|
109
110
|
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
110
|
-
#
|
111
|
+
#
|
111
112
|
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
112
113
|
# spider.every_link do |origin,dest|
|
113
114
|
# url_map[dest] << origin
|
114
115
|
# end
|
115
116
|
# end
|
116
117
|
# ```
|
117
|
-
#
|
118
|
+
#
|
118
119
|
# Print out the URLs that could not be requested:
|
119
|
-
#
|
120
|
+
#
|
120
121
|
# ```ruby
|
121
122
|
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
122
123
|
# spider.every_failed_url { |url| puts url }
|
123
124
|
# end
|
124
125
|
# ```
|
125
|
-
#
|
126
|
+
#
|
126
127
|
# Find all pages which have broken links:
|
127
|
-
#
|
128
|
+
#
|
128
129
|
# ```ruby
|
129
130
|
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
130
|
-
#
|
131
|
+
#
|
131
132
|
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
132
133
|
# spider.every_link do |origin,dest|
|
133
134
|
# url_map[dest] << origin
|
134
135
|
# end
|
135
136
|
# end
|
136
|
-
#
|
137
|
+
#
|
137
138
|
# spider.failures.each do |url|
|
138
139
|
# puts "Broken link #{url} found in:"
|
139
|
-
#
|
140
|
+
#
|
140
141
|
# url_map[url].each { |page| puts " #{page}" }
|
141
142
|
# end
|
142
143
|
# ```
|
143
|
-
#
|
144
|
+
#
|
144
145
|
# Search HTML and XML pages:
|
145
|
-
#
|
146
|
+
#
|
146
147
|
# ```ruby
|
147
148
|
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
148
149
|
# spider.every_page do |page|
|
149
150
|
# puts ">>> #{page.url}"
|
150
|
-
#
|
151
|
+
#
|
151
152
|
# page.search('//meta').each do |meta|
|
152
153
|
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
153
154
|
# value = meta.attributes['content']
|
154
|
-
#
|
155
|
+
#
|
155
156
|
# puts " #{name} = #{value}"
|
156
157
|
# end
|
157
158
|
# end
|
158
159
|
# end
|
159
160
|
# ```
|
160
|
-
#
|
161
|
+
#
|
161
162
|
# Print out the titles from every page:
|
162
|
-
#
|
163
|
+
#
|
163
164
|
# ```ruby
|
164
165
|
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
165
166
|
# spider.every_html_page do |page|
|
@@ -167,9 +168,9 @@ module Ronin
|
|
167
168
|
# end
|
168
169
|
# end
|
169
170
|
# ```
|
170
|
-
#
|
171
|
+
#
|
171
172
|
# Print out every HTTP redirect:
|
172
|
-
#
|
173
|
+
#
|
173
174
|
# ```ruby
|
174
175
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
175
176
|
# spider.every_redirect_page do |page|
|
@@ -177,21 +178,21 @@ module Ronin
|
|
177
178
|
# end
|
178
179
|
# end
|
179
180
|
# ```
|
180
|
-
#
|
181
|
+
#
|
181
182
|
# Find what kinds of web servers a host is using, by accessing the headers:
|
182
|
-
#
|
183
|
+
#
|
183
184
|
# ```ruby
|
184
185
|
# servers = Set[]
|
185
|
-
#
|
186
|
+
#
|
186
187
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
187
188
|
# spider.all_headers do |headers|
|
188
189
|
# servers << headers['server']
|
189
190
|
# end
|
190
191
|
# end
|
191
192
|
# ```
|
192
|
-
#
|
193
|
+
#
|
193
194
|
# Pause the spider on a forbidden page:
|
194
|
-
#
|
195
|
+
#
|
195
196
|
# ```ruby
|
196
197
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
197
198
|
# spider.every_forbidden_page do |page|
|
@@ -199,9 +200,9 @@ module Ronin
|
|
199
200
|
# end
|
200
201
|
# end
|
201
202
|
# ```
|
202
|
-
#
|
203
|
+
#
|
203
204
|
# Skip the processing of a page:
|
204
|
-
#
|
205
|
+
#
|
205
206
|
# ```ruby
|
206
207
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
207
208
|
# spider.every_missing_page do |page|
|
@@ -209,9 +210,9 @@ module Ronin
|
|
209
210
|
# end
|
210
211
|
# end
|
211
212
|
# ```
|
212
|
-
#
|
213
|
+
#
|
213
214
|
# Skip the processing of links:
|
214
|
-
#
|
215
|
+
#
|
215
216
|
# ```ruby
|
216
217
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
217
218
|
# spider.every_url do |url|
|
@@ -221,9 +222,9 @@ module Ronin
|
|
221
222
|
# end
|
222
223
|
# end
|
223
224
|
# ```
|
224
|
-
#
|
225
|
+
#
|
225
226
|
# Detect when a new host name is spidered:
|
226
|
-
#
|
227
|
+
#
|
227
228
|
# ```ruby
|
228
229
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
229
230
|
# spider.every_host do |host|
|
@@ -231,9 +232,9 @@ module Ronin
|
|
231
232
|
# end
|
232
233
|
# end
|
233
234
|
# ```
|
234
|
-
#
|
235
|
+
#
|
235
236
|
# Detect when a new SSL/TLS certificate is encountered:
|
236
|
-
#
|
237
|
+
#
|
237
238
|
# ```ruby
|
238
239
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
239
240
|
# spider.every_cert do |cert|
|
@@ -241,9 +242,9 @@ module Ronin
|
|
241
242
|
# end
|
242
243
|
# end
|
243
244
|
# ```
|
244
|
-
#
|
245
|
+
#
|
245
246
|
# Print the MD5 checksum of every `favicon.ico` file:
|
246
|
-
#
|
247
|
+
#
|
247
248
|
# ```ruby
|
248
249
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
249
250
|
# spider.every_favicon do |page|
|
@@ -251,9 +252,9 @@ module Ronin
|
|
251
252
|
# end
|
252
253
|
# end
|
253
254
|
# ```
|
254
|
-
#
|
255
|
+
#
|
255
256
|
# Print every HTML comment:
|
256
|
-
#
|
257
|
+
#
|
257
258
|
# ```ruby
|
258
259
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
259
260
|
# spider.every_html_comment do |comment|
|
@@ -261,9 +262,9 @@ module Ronin
|
|
261
262
|
# end
|
262
263
|
# end
|
263
264
|
# ```
|
264
|
-
#
|
265
|
+
#
|
265
266
|
# Print all JavaScript source code:
|
266
|
-
#
|
267
|
+
#
|
267
268
|
# ```ruby
|
268
269
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
269
270
|
# spider.every_javascript do |js|
|
@@ -271,9 +272,9 @@ module Ronin
|
|
271
272
|
# end
|
272
273
|
# end
|
273
274
|
# ```
|
274
|
-
#
|
275
|
+
#
|
275
276
|
# Print every JavaScript string literal:
|
276
|
-
#
|
277
|
+
#
|
277
278
|
# ```ruby
|
278
279
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
279
280
|
# spider.every_javascript_string do |str|
|
@@ -281,9 +282,9 @@ module Ronin
|
|
281
282
|
# end
|
282
283
|
# end
|
283
284
|
# ```
|
284
|
-
#
|
285
|
+
#
|
285
286
|
# Print every JavaScript comment:
|
286
|
-
#
|
287
|
+
#
|
287
288
|
# ```ruby
|
288
289
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
289
290
|
# spider.every_javascript_comment do |comment|
|
@@ -291,9 +292,9 @@ module Ronin
|
|
291
292
|
# end
|
292
293
|
# end
|
293
294
|
# ```
|
294
|
-
#
|
295
|
+
#
|
295
296
|
# Print every HTML and JavaScript comment:
|
296
|
-
#
|
297
|
+
#
|
297
298
|
# ```ruby
|
298
299
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
299
300
|
# spider.every_comment do |comment|
|
@@ -301,7 +302,7 @@ module Ronin
|
|
301
302
|
# end
|
302
303
|
# end
|
303
304
|
# ```
|
304
|
-
#
|
305
|
+
#
|
305
306
|
module Spider
|
306
307
|
#
|
307
308
|
# Creates a new agent and begins spidering at the given URL.
|
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'yaml'
|
4
4
|
|
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
|
|
22
22
|
gem.homepage = gemspec['homepage']
|
23
23
|
gem.metadata = gemspec['metadata'] if gemspec['metadata']
|
24
24
|
|
25
|
-
glob =
|
25
|
+
glob = ->(patterns) { gem.files & Dir[*patterns] }
|
26
26
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
|
|
46
46
|
gem.required_rubygems_version = gemspec['required_rubygems_version']
|
47
47
|
gem.post_install_message = gemspec['post_install_message']
|
48
48
|
|
49
|
-
split =
|
49
|
+
split = ->(string) { string.split(/,\s*/) }
|
50
50
|
|
51
51
|
if gemspec['dependencies']
|
52
52
|
gemspec['dependencies'].each do |name,versions|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ronin-web-spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -66,6 +66,7 @@ files:
|
|
66
66
|
- ".github/workflows/ruby.yml"
|
67
67
|
- ".gitignore"
|
68
68
|
- ".rspec"
|
69
|
+
- ".rubocop.yml"
|
69
70
|
- ".ruby-version"
|
70
71
|
- ".yardopts"
|
71
72
|
- COPYING.txt
|
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
106
|
- !ruby/object:Gem::Version
|
106
107
|
version: '0'
|
107
108
|
requirements: []
|
108
|
-
rubygems_version: 3.3.
|
109
|
+
rubygems_version: 3.3.27
|
109
110
|
signing_key:
|
110
111
|
specification_version: 4
|
111
|
-
summary: collection of common web spidering routines
|
112
|
+
summary: A collection of common web spidering routines.
|
112
113
|
test_files: []
|