ronin-web-spider 0.1.0 → 0.2.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +16 -1
- data/.rubocop.yml +11 -0
- data/ChangeLog.md +35 -1
- data/Gemfile +3 -0
- data/README.md +12 -3
- data/Rakefile +2 -2
- data/gemspec.yml +1 -1
- data/lib/ronin/web/spider/agent.rb +311 -15
- data/lib/ronin/web/spider/archive.rb +2 -1
- data/lib/ronin/web/spider/exceptions.rb +2 -1
- data/lib/ronin/web/spider/git_archive.rb +2 -1
- data/lib/ronin/web/spider/version.rb +3 -2
- data/lib/ronin/web/spider.rb +64 -63
- data/ronin-web-spider.gemspec +3 -3
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
|
4
|
+
data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
|
7
|
+
data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
|
data/.github/workflows/ruby.yml
CHANGED
@@ -12,11 +12,12 @@ jobs:
|
|
12
12
|
- '3.0'
|
13
13
|
- '3.1'
|
14
14
|
- '3.2'
|
15
|
+
- '3.3'
|
15
16
|
- jruby
|
16
17
|
- truffleruby
|
17
18
|
name: Ruby ${{ matrix.ruby }}
|
18
19
|
steps:
|
19
|
-
- uses: actions/checkout@
|
20
|
+
- uses: actions/checkout@v4
|
20
21
|
- name: Set up Ruby
|
21
22
|
uses: ruby/setup-ruby@v1
|
22
23
|
with:
|
@@ -26,3 +27,17 @@ jobs:
|
|
26
27
|
run: bundle install --jobs 4 --retry 3
|
27
28
|
- name: Run tests
|
28
29
|
run: bundle exec rake test
|
30
|
+
|
31
|
+
# rubocop linting
|
32
|
+
rubocop:
|
33
|
+
runs-on: ubuntu-latest
|
34
|
+
steps:
|
35
|
+
- uses: actions/checkout@v4
|
36
|
+
- name: Set up Ruby
|
37
|
+
uses: ruby/setup-ruby@v1
|
38
|
+
with:
|
39
|
+
ruby-version: 3.0
|
40
|
+
- name: Install dependencies
|
41
|
+
run: bundle install --jobs 4 --retry 3
|
42
|
+
- name: Run rubocop
|
43
|
+
run: bundle exec rubocop --parallel
|
data/.rubocop.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,4 +1,37 @@
|
|
1
|
-
### 0.
|
1
|
+
### 0.2.0 / 2024-XX-XX
|
2
|
+
|
3
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
|
4
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
|
5
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
|
6
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
|
7
|
+
* Allow {Ronin::Web::Spider::Agent#every_html_comment},
|
8
|
+
{Ronin::Web::Spider::Agent#every_javascript every_javascript},
|
9
|
+
{Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
|
10
|
+
{Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
|
11
|
+
{Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
|
12
|
+
{Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
|
13
|
+
{Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
|
14
|
+
to also yield a `Spidr::Page` block argument for additional context.
|
15
|
+
|
16
|
+
### 0.1.1 / 2024-06-19
|
17
|
+
|
18
|
+
* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
|
19
|
+
{Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
|
20
|
+
header included `text/html` but lacked a response body, causing `page.doc` to
|
21
|
+
be `nil`.
|
22
|
+
* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
|
23
|
+
JavaScript source code strings containing UTF-8 characters were being
|
24
|
+
incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
|
25
|
+
did not include a `charset=` attribute.
|
26
|
+
* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
|
27
|
+
inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
|
28
|
+
would incorrectly be treated as the beginning or end of JavaScript string
|
29
|
+
literals. Note that while this greatly improves the accuracy of
|
30
|
+
{Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
|
31
|
+
support parsing JavaScript template literals that may also contain string
|
32
|
+
literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
|
33
|
+
|
34
|
+
### 0.1.0 / 2023-02-01
|
2
35
|
|
3
36
|
* Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
|
4
37
|
* Relicensed as LGPL-3.0.
|
@@ -20,3 +53,4 @@
|
|
20
53
|
* `every_comment` - yields every HTML or JavaScript comment.
|
21
54
|
* Supports archiving spidered pages to a directory or git repository.
|
22
55
|
|
56
|
+
[spidr]: https://github.com/postmodern/spidr#readme
|
data/Gemfile
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
source 'https://rubygems.org'
|
2
3
|
|
3
4
|
gemspec
|
@@ -28,4 +29,6 @@ group :development do
|
|
28
29
|
gem 'dead_end', require: false
|
29
30
|
gem 'sord', require: false, platform: :mri
|
30
31
|
gem 'stackprof', require: false, platform: :mri
|
32
|
+
gem 'rubocop', require: false, platform: :mri
|
33
|
+
gem 'rubocop-ronin', require: false, platform: :mri
|
31
34
|
end
|
data/README.md
CHANGED
@@ -9,7 +9,6 @@
|
|
9
9
|
* [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
|
10
10
|
* [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
|
11
11
|
* [Discord](https://discord.gg/6WAb3PsVX9) |
|
12
|
-
[Twitter](https://twitter.com/ronin_rb) |
|
13
12
|
[Mastodon](https://infosec.exchange/@ronin_rb)
|
14
13
|
|
15
14
|
## Description
|
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
|
|
38
37
|
* [every_comment][docs-every_comment] - yields every HTML or JavaScript
|
39
38
|
comment.
|
40
39
|
* Supports archiving spidered pages to a directory or git repository.
|
41
|
-
* Has
|
40
|
+
* Has 97% documentation coverage.
|
42
41
|
* Has 94% test coverage.
|
43
42
|
|
44
43
|
[docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
|
@@ -305,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
|
|
305
304
|
end
|
306
305
|
```
|
307
306
|
|
307
|
+
Print every JavaScript URL string literal:
|
308
|
+
|
309
|
+
```ruby
|
310
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
311
|
+
spider.every_javascript_url_string do |url|
|
312
|
+
puts url
|
313
|
+
end
|
314
|
+
end
|
315
|
+
```
|
316
|
+
|
308
317
|
Print every JavaScript comment:
|
309
318
|
|
310
319
|
```ruby
|
@@ -391,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
|
|
391
400
|
|
392
401
|
## License
|
393
402
|
|
394
|
-
Copyright (c) 2006-
|
403
|
+
Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
395
404
|
|
396
405
|
ronin-web-spider is free software: you can redistribute it and/or modify
|
397
406
|
it under the terms of the GNU Lesser General Public License as published
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +23,7 @@ require 'spidr/agent'
|
|
22
23
|
require 'ronin/support/network/http'
|
23
24
|
require 'ronin/support/crypto/cert'
|
24
25
|
require 'ronin/support/text/patterns/source_code'
|
26
|
+
require 'ronin/support/text/patterns/network'
|
25
27
|
require 'ronin/support/encoding/js'
|
26
28
|
|
27
29
|
module Ronin
|
@@ -224,10 +226,17 @@ module Ronin
|
|
224
226
|
# @yield [comment]
|
225
227
|
# The given block will be passed every HTML comment.
|
226
228
|
#
|
229
|
+
# @yield [comment, page]
|
230
|
+
# If the block accepts two arguments, the HTML comment and the page
|
231
|
+
# that the comment was found on will be passed to the given block.
|
232
|
+
#
|
227
233
|
# @yieldparam [String] comment
|
228
234
|
# The HTML comment inner text, with leading and trailing whitespace
|
229
235
|
# stripped.
|
230
236
|
#
|
237
|
+
# @yieldparam [Spidr::Page] page
|
238
|
+
# The page that the HTML comment exists on.
|
239
|
+
#
|
231
240
|
# @example
|
232
241
|
# spider.every_html_comment do |comment|
|
233
242
|
# puts comment
|
@@ -235,13 +244,19 @@ module Ronin
|
|
235
244
|
#
|
236
245
|
# @api public
|
237
246
|
#
|
238
|
-
def every_html_comment
|
247
|
+
def every_html_comment(&block)
|
239
248
|
every_html_page do |page|
|
249
|
+
next unless page.doc
|
250
|
+
|
240
251
|
page.doc.xpath('//comment()').each do |comment|
|
241
252
|
comment_text = comment.inner_text.strip
|
242
253
|
|
243
254
|
unless comment_text.empty?
|
244
|
-
|
255
|
+
if block.arity == 2
|
256
|
+
yield comment_text, page
|
257
|
+
else
|
258
|
+
yield comment_text
|
259
|
+
end
|
245
260
|
end
|
246
261
|
end
|
247
262
|
end
|
@@ -253,9 +268,17 @@ module Ronin
|
|
253
268
|
# @yield [js]
|
254
269
|
# The given block will be passed every piece of JavaScript source.
|
255
270
|
#
|
271
|
+
# @yield [js, page]
|
272
|
+
# If the block accepts two arguments, the JavaScript source and the
|
273
|
+
# page that the JavaScript source was found on will be passed to the
|
274
|
+
# given block.
|
275
|
+
#
|
256
276
|
# @yieldparam [String] js
|
257
277
|
# The JavaScript source code.
|
258
278
|
#
|
279
|
+
# @yieldparam [Spidr::Page] page
|
280
|
+
# The page that the JavaScript source was found in or on.
|
281
|
+
#
|
259
282
|
# @example
|
260
283
|
# spider.every_javascript do |js|
|
261
284
|
# puts js
|
@@ -263,24 +286,72 @@ module Ronin
|
|
263
286
|
#
|
264
287
|
# @api public
|
265
288
|
#
|
266
|
-
def every_javascript
|
289
|
+
def every_javascript(&block)
|
267
290
|
# yield inner text of every `<script type="text/javascript">` tag
|
268
291
|
# and every `.js` URL.
|
269
292
|
every_html_page do |page|
|
293
|
+
next unless page.doc
|
294
|
+
|
270
295
|
page.doc.xpath('//script[@type="text/javascript"]').each do |script|
|
271
|
-
|
272
|
-
|
296
|
+
source = script.inner_text
|
297
|
+
source.force_encoding(Encoding::UTF_8)
|
298
|
+
|
299
|
+
unless source.empty?
|
300
|
+
if block.arity == 2
|
301
|
+
yield source, page
|
302
|
+
else
|
303
|
+
yield source
|
304
|
+
end
|
273
305
|
end
|
274
306
|
end
|
275
307
|
end
|
276
308
|
|
277
309
|
every_javascript_page do |page|
|
278
|
-
|
310
|
+
source = page.body
|
311
|
+
source.force_encoding(Encoding::UTF_8)
|
312
|
+
|
313
|
+
if block.arity == 2
|
314
|
+
yield source, page
|
315
|
+
else
|
316
|
+
yield source
|
317
|
+
end
|
279
318
|
end
|
280
319
|
end
|
281
320
|
|
282
321
|
alias every_js every_javascript
|
283
322
|
|
323
|
+
# Regex to match and skip JavaScript inline regexes.
|
324
|
+
#
|
325
|
+
# @api private
|
326
|
+
#
|
327
|
+
# @since 0.1.1
|
328
|
+
JAVASCRIPT_INLINE_REGEX = %r{
|
329
|
+
(?# match before the regex to avoid matching division operators )
|
330
|
+
(?:[\{\[\(;:,]\s*|=\s*)
|
331
|
+
/
|
332
|
+
(?# inline regex contents )
|
333
|
+
(?:
|
334
|
+
\[ (?:\\. | [^\]]) \] (?# [...] ) |
|
335
|
+
\\. (?# backslash escaped characters ) |
|
336
|
+
[^/] (?# everything else )
|
337
|
+
)+
|
338
|
+
/[dgimsuvy]* (?# also match any regex flags )
|
339
|
+
}mx
|
340
|
+
|
341
|
+
# Regex to match and skip JavaScript template literals.
|
342
|
+
#
|
343
|
+
# @note
|
344
|
+
# This regex will not properly match nested template literals:
|
345
|
+
#
|
346
|
+
# ```javascript
|
347
|
+
# `foo ${`bar ${1+1}`}`
|
348
|
+
# ```
|
349
|
+
#
|
350
|
+
# @api private
|
351
|
+
#
|
352
|
+
# @since 0.1.1
|
353
|
+
JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
|
354
|
+
|
284
355
|
#
|
285
356
|
# Passes every JavaScript string value to the given block.
|
286
357
|
#
|
@@ -288,35 +359,246 @@ module Ronin
|
|
288
359
|
# The given block will be passed each JavaScript string with the quote
|
289
360
|
# marks removed.
|
290
361
|
#
|
362
|
+
# @yield [string, page]
|
363
|
+
# If the block accepts two arguments, the JavaScript string and the
|
364
|
+
# page that the JavaScript string was found on will be passed to the
|
365
|
+
# given block.
|
366
|
+
#
|
291
367
|
# @yieldparam [String] string
|
292
368
|
# The parsed contents of a JavaScript string.
|
293
369
|
#
|
370
|
+
# @yieldparam [Spidr::Page] page
|
371
|
+
# The page that the JavaScript string was found in or on.
|
372
|
+
#
|
294
373
|
# @example
|
295
374
|
# spider.every_javascript_string do |str|
|
296
|
-
#
|
297
|
-
#
|
375
|
+
# puts str
|
376
|
+
# end
|
298
377
|
#
|
299
378
|
# @api public
|
300
379
|
#
|
301
|
-
def every_javascript_string
|
302
|
-
every_javascript do |js|
|
303
|
-
|
304
|
-
|
380
|
+
def every_javascript_string(&block)
|
381
|
+
every_javascript do |js,page|
|
382
|
+
scanner = StringScanner.new(js)
|
383
|
+
|
384
|
+
until scanner.eos?
|
385
|
+
# NOTE: this is a naive JavaScript string scanner and should
|
386
|
+
# eventually be replaced with a real JavaScript lexer or parser.
|
387
|
+
case scanner.peek(1)
|
388
|
+
when '"', "'" # beginning of a quoted string
|
389
|
+
js_string = scanner.scan(Support::Text::Patterns::STRING)
|
390
|
+
string = Support::Encoding::JS.unquote(js_string)
|
391
|
+
|
392
|
+
if block.arity == 2
|
393
|
+
yield string, page
|
394
|
+
else
|
395
|
+
yield string
|
396
|
+
end
|
397
|
+
else
|
398
|
+
scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
|
399
|
+
scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
|
400
|
+
scanner.getch
|
401
|
+
end
|
305
402
|
end
|
306
403
|
end
|
307
404
|
end
|
308
405
|
|
309
406
|
alias every_js_string every_javascript_string
|
310
407
|
|
408
|
+
# Regular expression that matches relative paths within JavaScript.
|
409
|
+
#
|
410
|
+
# @note
|
411
|
+
# This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
|
412
|
+
# but *not* `/foo`, `foo`, or `foo.`.
|
413
|
+
JAVASCRIPT_RELATIVE_PATH = %r{
|
414
|
+
\A
|
415
|
+
(?:
|
416
|
+
[^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
|
417
|
+
|
|
418
|
+
[^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
|
419
|
+
)
|
420
|
+
\z
|
421
|
+
}x
|
422
|
+
|
423
|
+
#
|
424
|
+
# Passes every JavaScript relative path string to the given block.
|
425
|
+
#
|
426
|
+
# @yield [string]
|
427
|
+
# The given block will be passed each JavaScript relative path string
|
428
|
+
# with the quote marks removed.
|
429
|
+
#
|
430
|
+
# @yield [string, page]
|
431
|
+
# If the block accepts two arguments, the JavaScript relative path
|
432
|
+
# string and the page that the JavaScript relative path string was
|
433
|
+
# found on will be passed to the given block.
|
434
|
+
#
|
435
|
+
# @yieldparam [String] string
|
436
|
+
# The parsed contents of a literal JavaScript relative path string.
|
437
|
+
#
|
438
|
+
# @yieldparam [Spidr::Page] page
|
439
|
+
# The page that the JavaScript relative path string was found in or
|
440
|
+
# on.
|
441
|
+
#
|
442
|
+
# @example
|
443
|
+
# spider.every_javascript_relative_path_string do |relative_path|
|
444
|
+
# puts relative_path
|
445
|
+
# end
|
446
|
+
#
|
447
|
+
# @api public
|
448
|
+
#
|
449
|
+
# @since 0.2.0
|
450
|
+
#
|
451
|
+
def every_javascript_relative_path_string(&block)
|
452
|
+
every_javascript_string do |string,page|
|
453
|
+
if string =~ JAVASCRIPT_RELATIVE_PATH
|
454
|
+
if block.arity == 2
|
455
|
+
yield string, page
|
456
|
+
else
|
457
|
+
yield string
|
458
|
+
end
|
459
|
+
end
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
alias every_js_relative_path_string every_javascript_relative_path_string
|
464
|
+
|
465
|
+
# Regular expression that matches absolute paths within JavaScript.
|
466
|
+
JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
|
467
|
+
|
468
|
+
#
|
469
|
+
# Passes every JavaScript absolute path string to the given block.
|
470
|
+
#
|
471
|
+
# @yield [string]
|
472
|
+
# The given block will be passed each JavaScript absolute path string
|
473
|
+
# with the quote marks removed.
|
474
|
+
#
|
475
|
+
# @yield [string, page]
|
476
|
+
# If the block accepts two arguments, the JavaScript absolute path
|
477
|
+
# string and the page that the JavaScript absolute path string was
|
478
|
+
# found on will be passed to the given block.
|
479
|
+
#
|
480
|
+
# @yieldparam [String] string
|
481
|
+
# The parsed contents of a literal JavaScript absolute path string.
|
482
|
+
#
|
483
|
+
# @yieldparam [Spidr::Page] page
|
484
|
+
# The page that the JavaScript absolute path string was found in or
|
485
|
+
# on.
|
486
|
+
#
|
487
|
+
# @example
|
488
|
+
# spider.every_javascript_absolute_path_string do |absolute_path|
|
489
|
+
# puts absolute_path
|
490
|
+
# end
|
491
|
+
#
|
492
|
+
# @api public
|
493
|
+
#
|
494
|
+
# @since 0.2.0
|
495
|
+
#
|
496
|
+
def every_javascript_absolute_path_string(&block)
|
497
|
+
every_javascript_string do |string,page|
|
498
|
+
if string =~ JAVASCRIPT_ABSOLUTE_PATH
|
499
|
+
if block.arity == 2
|
500
|
+
yield string, page
|
501
|
+
else
|
502
|
+
yield string
|
503
|
+
end
|
504
|
+
end
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
alias every_js_absolute_path_string every_javascript_absolute_path_string
|
509
|
+
|
510
|
+
#
|
511
|
+
# Passes every JavaScript path string to the given block.
|
512
|
+
#
|
513
|
+
# @yield [string]
|
514
|
+
# The given block will be passed each JavaScript path string with the
|
515
|
+
# quote marks removed.
|
516
|
+
#
|
517
|
+
# @yield [string, page]
|
518
|
+
# If the block accepts two arguments, the JavaScript path string and
|
519
|
+
# the page that the JavaScript path string was found on will be
|
520
|
+
# passed to the given block.
|
521
|
+
#
|
522
|
+
# @yieldparam [String] string
|
523
|
+
# The parsed contents of a literal JavaScript path string.
|
524
|
+
#
|
525
|
+
# @yieldparam [Spidr::Page] page
|
526
|
+
# The page that the JavaScript path string was found in or on.
|
527
|
+
#
|
528
|
+
# @example
|
529
|
+
# spider.every_javascript_path_string do |path|
|
530
|
+
# puts path
|
531
|
+
# end
|
532
|
+
#
|
533
|
+
# @api public
|
534
|
+
#
|
535
|
+
# @since 0.2.0
|
536
|
+
#
|
537
|
+
def every_javascript_path_string(&block)
|
538
|
+
every_javascript_relative_path_string(&block)
|
539
|
+
every_javascript_absolute_path_string(&block)
|
540
|
+
end
|
541
|
+
|
542
|
+
alias every_js_path_string every_javascript_path_string
|
543
|
+
|
544
|
+
#
|
545
|
+
# Passes every JavaScript URL string to the given block.
|
546
|
+
#
|
547
|
+
# @yield [string]
|
548
|
+
# The given block will be passed each JavaScript URL string with the
|
549
|
+
# quote marks removed.
|
550
|
+
#
|
551
|
+
# @yield [string, page]
|
552
|
+
# If the block accepts two arguments, the JavaScript URL string and
|
553
|
+
# the page that the JavaScript URL string was found on will be passed
|
554
|
+
# to the given block.
|
555
|
+
#
|
556
|
+
# @yieldparam [String] string
|
557
|
+
# The parsed contents of a literal JavaScript URL string.
|
558
|
+
#
|
559
|
+
# @yieldparam [Spidr::Page] page
|
560
|
+
# The page that the JavaScript URL string was found in or on.
|
561
|
+
#
|
562
|
+
# @example
|
563
|
+
# spider.every_javascript_url_string do |url|
|
564
|
+
# puts url
|
565
|
+
# end
|
566
|
+
#
|
567
|
+
# @api public
|
568
|
+
#
|
569
|
+
# @since 0.2.0
|
570
|
+
#
|
571
|
+
def every_javascript_url_string(&block)
|
572
|
+
every_javascript_string do |string,page|
|
573
|
+
if string =~ Support::Text::Patterns::URL
|
574
|
+
if block.arity == 2
|
575
|
+
yield string, page
|
576
|
+
else
|
577
|
+
yield string
|
578
|
+
end
|
579
|
+
end
|
580
|
+
end
|
581
|
+
end
|
582
|
+
|
583
|
+
alias every_js_url_string every_javascript_url_string
|
584
|
+
|
311
585
|
#
|
312
586
|
# Passes every JavaScript comment to the given block.
|
313
587
|
#
|
314
588
|
# @yield [comment]
|
315
589
|
# The given block will be passed each JavaScript comment.
|
316
590
|
#
|
591
|
+
# @yield [comment, page]
|
592
|
+
# If the block accepts two arguments, the JavaScript comment and the
|
593
|
+
# page that the JavaScript comment was found on will be passed to the
|
594
|
+
# given block.
|
595
|
+
#
|
317
596
|
# @yieldparam [String] comment
|
318
597
|
# The contents of a JavaScript comment.
|
319
598
|
#
|
599
|
+
# @yieldparam [Spidr::Page] page
|
600
|
+
# The page that the JavaScript comment was found in or on.
|
601
|
+
#
|
320
602
|
# @example
|
321
603
|
# spider.every_javascript_comment do |comment|
|
322
604
|
# puts comment
|
@@ -325,8 +607,14 @@ module Ronin
|
|
325
607
|
# @api public
|
326
608
|
#
|
327
609
|
def every_javascript_comment(&block)
|
328
|
-
every_javascript do |js|
|
329
|
-
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT
|
610
|
+
every_javascript do |js,page|
|
611
|
+
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
|
612
|
+
if block.arity == 2
|
613
|
+
yield comment, page
|
614
|
+
else
|
615
|
+
yield comment
|
616
|
+
end
|
617
|
+
end
|
330
618
|
end
|
331
619
|
end
|
332
620
|
|
@@ -338,9 +626,17 @@ module Ronin
|
|
338
626
|
# @yield [comment]
|
339
627
|
# The given block will be passed each HTML or JavaScript comment.
|
340
628
|
#
|
629
|
+
# @yield [comment, page]
|
630
|
+
# If the block accepts two arguments, the HTML or JavaScript comment
|
631
|
+
# and the page that the HTML/JavaScript comment was found on will be
|
632
|
+
# passed to the given block.
|
633
|
+
#
|
341
634
|
# @yieldparam [String] comment
|
342
635
|
# The contents of a HTML or JavaScript comment.
|
343
636
|
#
|
637
|
+
# @yieldparam [Spidr::Page] page
|
638
|
+
# The page that the HTML or JavaScript comment was found in or on.
|
639
|
+
#
|
344
640
|
# @example
|
345
641
|
# spider.every_comment do |comment|
|
346
642
|
# puts comment
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -31,7 +32,7 @@ module Ronin
|
|
31
32
|
#
|
32
33
|
# require 'ronin/web/spider'
|
33
34
|
# require 'ronin/web/spider/archive'
|
34
|
-
#
|
35
|
+
#
|
35
36
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
36
37
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
37
38
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -33,7 +34,7 @@ module Ronin
|
|
33
34
|
# require 'ronin/web/spider'
|
34
35
|
# require 'ronin/web/spider/git_archive'
|
35
36
|
# require 'date'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
38
39
|
# archive.commit("Updated #{Date.today}") do
|
39
40
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +22,7 @@ module Ronin
|
|
21
22
|
module Web
|
22
23
|
module Spider
|
23
24
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.
|
25
|
+
VERSION = '0.2.0.rc1'
|
25
26
|
end
|
26
27
|
end
|
27
28
|
end
|
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -30,136 +31,136 @@ module Ronin
|
|
30
31
|
# ## Examples
|
31
32
|
#
|
32
33
|
# Start spidering from a URL:
|
33
|
-
#
|
34
|
+
#
|
34
35
|
# ```ruby
|
35
36
|
# require 'ronin/web/spider'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
38
39
|
# # ...
|
39
40
|
# end
|
40
41
|
# ```
|
41
|
-
#
|
42
|
+
#
|
42
43
|
# Spider a host:
|
43
|
-
#
|
44
|
+
#
|
44
45
|
# ```ruby
|
45
46
|
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
46
47
|
# # ...
|
47
48
|
# end
|
48
49
|
# ```
|
49
|
-
#
|
50
|
+
#
|
50
51
|
# Spider a domain (and any sub-domains):
|
51
|
-
#
|
52
|
+
#
|
52
53
|
# ```ruby
|
53
54
|
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
54
55
|
# # ...
|
55
56
|
# end
|
56
57
|
# ```
|
57
|
-
#
|
58
|
+
#
|
58
59
|
# Spider a site:
|
59
|
-
#
|
60
|
+
#
|
60
61
|
# ```ruby
|
61
62
|
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
62
63
|
# # ...
|
63
64
|
# end
|
64
65
|
# ```
|
65
|
-
#
|
66
|
+
#
|
66
67
|
# Spider multiple hosts:
|
67
|
-
#
|
68
|
+
#
|
68
69
|
# ```ruby
|
69
70
|
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
70
71
|
# # ...
|
71
72
|
# end
|
72
73
|
# ```
|
73
|
-
#
|
74
|
+
#
|
74
75
|
# Do not spider certain links:
|
75
|
-
#
|
76
|
+
#
|
76
77
|
# ```ruby
|
77
78
|
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
78
79
|
# # ...
|
79
80
|
# end
|
80
81
|
# ```
|
81
|
-
#
|
82
|
+
#
|
82
83
|
# Do not spider links on certain ports:
|
83
|
-
#
|
84
|
+
#
|
84
85
|
# ```ruby
|
85
86
|
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
86
87
|
# # ...
|
87
88
|
# end
|
88
89
|
# ```
|
89
|
-
#
|
90
|
+
#
|
90
91
|
# Do not spider links blacklisted in robots.txt:
|
91
|
-
#
|
92
|
+
#
|
92
93
|
# ```ruby
|
93
94
|
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
94
95
|
# # ...
|
95
96
|
# end
|
96
97
|
# ```
|
97
|
-
#
|
98
|
+
#
|
98
99
|
# Print out visited URLs:
|
99
|
-
#
|
100
|
+
#
|
100
101
|
# ```ruby
|
101
102
|
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
102
103
|
# spider.every_url { |url| puts url }
|
103
104
|
# end
|
104
105
|
# ```
|
105
|
-
#
|
106
|
+
#
|
106
107
|
# Build a URL map of a site:
|
107
|
-
#
|
108
|
+
#
|
108
109
|
# ```ruby
|
109
110
|
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
110
|
-
#
|
111
|
+
#
|
111
112
|
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
112
113
|
# spider.every_link do |origin,dest|
|
113
114
|
# url_map[dest] << origin
|
114
115
|
# end
|
115
116
|
# end
|
116
117
|
# ```
|
117
|
-
#
|
118
|
+
#
|
118
119
|
# Print out the URLs that could not be requested:
|
119
|
-
#
|
120
|
+
#
|
120
121
|
# ```ruby
|
121
122
|
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
122
123
|
# spider.every_failed_url { |url| puts url }
|
123
124
|
# end
|
124
125
|
# ```
|
125
|
-
#
|
126
|
+
#
|
126
127
|
# Finds all pages which have broken links:
|
127
|
-
#
|
128
|
+
#
|
128
129
|
# ```ruby
|
129
130
|
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
130
|
-
#
|
131
|
+
#
|
131
132
|
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
132
133
|
# spider.every_link do |origin,dest|
|
133
134
|
# url_map[dest] << origin
|
134
135
|
# end
|
135
136
|
# end
|
136
|
-
#
|
137
|
+
#
|
137
138
|
# spider.failures.each do |url|
|
138
139
|
# puts "Broken link #{url} found in:"
|
139
|
-
#
|
140
|
+
#
|
140
141
|
# url_map[url].each { |page| puts " #{page}" }
|
141
142
|
# end
|
142
143
|
# ```
|
143
|
-
#
|
144
|
+
#
|
144
145
|
# Search HTML and XML pages:
|
145
|
-
#
|
146
|
+
#
|
146
147
|
# ```ruby
|
147
148
|
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
148
149
|
# spider.every_page do |page|
|
149
150
|
# puts ">>> #{page.url}"
|
150
|
-
#
|
151
|
+
#
|
151
152
|
# page.search('//meta').each do |meta|
|
152
153
|
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
153
154
|
# value = meta.attributes['content']
|
154
|
-
#
|
155
|
+
#
|
155
156
|
# puts " #{name} = #{value}"
|
156
157
|
# end
|
157
158
|
# end
|
158
159
|
# end
|
159
160
|
# ```
|
160
|
-
#
|
161
|
+
#
|
161
162
|
# Print out the titles from every page:
|
162
|
-
#
|
163
|
+
#
|
163
164
|
# ```ruby
|
164
165
|
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
165
166
|
# spider.every_html_page do |page|
|
@@ -167,9 +168,9 @@ module Ronin
|
|
167
168
|
# end
|
168
169
|
# end
|
169
170
|
# ```
|
170
|
-
#
|
171
|
+
#
|
171
172
|
# Print out every HTTP redirect:
|
172
|
-
#
|
173
|
+
#
|
173
174
|
# ```ruby
|
174
175
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
175
176
|
# spider.every_redirect_page do |page|
|
@@ -177,21 +178,21 @@ module Ronin
|
|
177
178
|
# end
|
178
179
|
# end
|
179
180
|
# ```
|
180
|
-
#
|
181
|
+
#
|
181
182
|
# Find what kinds of web servers a host is using, by accessing the headers:
|
182
|
-
#
|
183
|
+
#
|
183
184
|
# ```ruby
|
184
185
|
# servers = Set[]
|
185
|
-
#
|
186
|
+
#
|
186
187
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
187
188
|
# spider.all_headers do |headers|
|
188
189
|
# servers << headers['server']
|
189
190
|
# end
|
190
191
|
# end
|
191
192
|
# ```
|
192
|
-
#
|
193
|
+
#
|
193
194
|
# Pause the spider on a forbidden page:
|
194
|
-
#
|
195
|
+
#
|
195
196
|
# ```ruby
|
196
197
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
197
198
|
# spider.every_forbidden_page do |page|
|
@@ -199,9 +200,9 @@ module Ronin
|
|
199
200
|
# end
|
200
201
|
# end
|
201
202
|
# ```
|
202
|
-
#
|
203
|
+
#
|
203
204
|
# Skip the processing of a page:
|
204
|
-
#
|
205
|
+
#
|
205
206
|
# ```ruby
|
206
207
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
207
208
|
# spider.every_missing_page do |page|
|
@@ -209,9 +210,9 @@ module Ronin
|
|
209
210
|
# end
|
210
211
|
# end
|
211
212
|
# ```
|
212
|
-
#
|
213
|
+
#
|
213
214
|
# Skip the processing of links:
|
214
|
-
#
|
215
|
+
#
|
215
216
|
# ```ruby
|
216
217
|
# Ronin::Web::Spider.host('company.com') do |spider|
|
217
218
|
# spider.every_url do |url|
|
@@ -221,9 +222,9 @@ module Ronin
|
|
221
222
|
# end
|
222
223
|
# end
|
223
224
|
# ```
|
224
|
-
#
|
225
|
+
#
|
225
226
|
# Detect when a new host name is spidered:
|
226
|
-
#
|
227
|
+
#
|
227
228
|
# ```ruby
|
228
229
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
229
230
|
# spider.every_host do |host|
|
@@ -231,9 +232,9 @@ module Ronin
|
|
231
232
|
# end
|
232
233
|
# end
|
233
234
|
# ```
|
234
|
-
#
|
235
|
+
#
|
235
236
|
# Detect when a new SSL/TLS certificate is encountered:
|
236
|
-
#
|
237
|
+
#
|
237
238
|
# ```ruby
|
238
239
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
239
240
|
# spider.every_cert do |cert|
|
@@ -241,9 +242,9 @@ module Ronin
|
|
241
242
|
# end
|
242
243
|
# end
|
243
244
|
# ```
|
244
|
-
#
|
245
|
+
#
|
245
246
|
# Print the MD5 checksum of every `favicon.ico` file:
|
246
|
-
#
|
247
|
+
#
|
247
248
|
# ```ruby
|
248
249
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
249
250
|
# spider.every_favicon do |page|
|
@@ -251,9 +252,9 @@ module Ronin
|
|
251
252
|
# end
|
252
253
|
# end
|
253
254
|
# ```
|
254
|
-
#
|
255
|
+
#
|
255
256
|
# Print every HTML comment:
|
256
|
-
#
|
257
|
+
#
|
257
258
|
# ```ruby
|
258
259
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
259
260
|
# spider.every_html_comment do |comment|
|
@@ -261,9 +262,9 @@ module Ronin
|
|
261
262
|
# end
|
262
263
|
# end
|
263
264
|
# ```
|
264
|
-
#
|
265
|
+
#
|
265
266
|
# Print all JavaScript source code:
|
266
|
-
#
|
267
|
+
#
|
267
268
|
# ```ruby
|
268
269
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
269
270
|
# spider.every_javascript do |js|
|
@@ -271,9 +272,9 @@ module Ronin
|
|
271
272
|
# end
|
272
273
|
# end
|
273
274
|
# ```
|
274
|
-
#
|
275
|
+
#
|
275
276
|
# Print every JavaScript string literal:
|
276
|
-
#
|
277
|
+
#
|
277
278
|
# ```ruby
|
278
279
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
279
280
|
# spider.every_javascript_string do |str|
|
@@ -281,9 +282,9 @@ module Ronin
|
|
281
282
|
# end
|
282
283
|
# end
|
283
284
|
# ```
|
284
|
-
#
|
285
|
+
#
|
285
286
|
# Print every JavaScript comment:
|
286
|
-
#
|
287
|
+
#
|
287
288
|
# ```ruby
|
288
289
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
289
290
|
# spider.every_javascript_comment do |comment|
|
@@ -291,9 +292,9 @@ module Ronin
|
|
291
292
|
# end
|
292
293
|
# end
|
293
294
|
# ```
|
294
|
-
#
|
295
|
+
#
|
295
296
|
# Print every HTML and JavaScript comment:
|
296
|
-
#
|
297
|
+
#
|
297
298
|
# ```ruby
|
298
299
|
# Ronin::Web::Spider.domain('example.com') do |spider|
|
299
300
|
# spider.every_comment do |comment|
|
@@ -301,7 +302,7 @@ module Ronin
|
|
301
302
|
# end
|
302
303
|
# end
|
303
304
|
# ```
|
304
|
-
#
|
305
|
+
#
|
305
306
|
module Spider
|
306
307
|
#
|
307
308
|
# Creates a new agent and begin spidering at the given URL.
|
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'yaml'
|
4
4
|
|
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
|
|
22
22
|
gem.homepage = gemspec['homepage']
|
23
23
|
gem.metadata = gemspec['metadata'] if gemspec['metadata']
|
24
24
|
|
25
|
-
glob =
|
25
|
+
glob = ->(patterns) { gem.files & Dir[*patterns] }
|
26
26
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
|
|
46
46
|
gem.required_rubygems_version = gemspec['required_rubygems_version']
|
47
47
|
gem.post_install_message = gemspec['post_install_message']
|
48
48
|
|
49
|
-
split =
|
49
|
+
split = ->(string) { string.split(/,\s*/) }
|
50
50
|
|
51
51
|
if gemspec['dependencies']
|
52
52
|
gemspec['dependencies'].each do |name,versions|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ronin-web-spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|
@@ -66,6 +66,7 @@ files:
|
|
66
66
|
- ".github/workflows/ruby.yml"
|
67
67
|
- ".gitignore"
|
68
68
|
- ".rspec"
|
69
|
+
- ".rubocop.yml"
|
69
70
|
- ".ruby-version"
|
70
71
|
- ".yardopts"
|
71
72
|
- COPYING.txt
|
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
106
|
- !ruby/object:Gem::Version
|
106
107
|
version: '0'
|
107
108
|
requirements: []
|
108
|
-
rubygems_version: 3.3.
|
109
|
+
rubygems_version: 3.3.27
|
109
110
|
signing_key:
|
110
111
|
specification_version: 4
|
111
|
-
summary: collection of common web spidering routines
|
112
|
+
summary: A collection of common web spidering routines.
|
112
113
|
test_files: []
|