ronin-web-spider 0.1.1 → 0.2.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog.md +15 -0
- data/README.md +11 -1
- data/lib/ronin/web/spider/agent.rb +251 -11
- data/lib/ronin/web/spider/exceptions.rb +1 -1
- data/lib/ronin/web/spider/version.rb +2 -2
- data/lib/ronin/web/spider.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
|
4
|
+
data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
|
7
|
+
data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
|
data/ChangeLog.md
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
### 0.2.0 / 2024-XX-XX
|
2
|
+
|
3
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
|
4
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
|
5
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
|
6
|
+
* Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
|
7
|
+
* Allow {Ronin::Web::Spider::Agent#every_html_comment},
|
8
|
+
{Ronin::Web::Spider::Agent#every_javascript every_javascript},
|
9
|
+
{Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
|
10
|
+
{Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
|
11
|
+
{Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
|
12
|
+
{Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
|
13
|
+
{Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
|
14
|
+
to also yield a `Spidr::Page` block argument for additional context.
|
15
|
+
|
1
16
|
### 0.1.1 / 2024-06-19
|
2
17
|
|
3
18
|
* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
|
data/README.md
CHANGED
@@ -304,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
|
|
304
304
|
end
|
305
305
|
```
|
306
306
|
|
307
|
+
Print every JavaScript URL string literal:
|
308
|
+
|
309
|
+
```ruby
|
310
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
311
|
+
spider.every_javascript_url_string do |url|
|
312
|
+
puts url
|
313
|
+
end
|
314
|
+
end
|
315
|
+
```
|
316
|
+
|
307
317
|
Print every JavaScript comment:
|
308
318
|
|
309
319
|
```ruby
|
@@ -390,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
|
|
390
400
|
|
391
401
|
## License
|
392
402
|
|
393
|
-
Copyright (c) 2006-
|
403
|
+
Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
394
404
|
|
395
405
|
ronin-web-spider is free software: you can redistribute it and/or modify
|
396
406
|
it under the terms of the GNU Lesser General Public License as published
|
data/lib/ronin/web/spider/agent.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
4
4
|
#
|
5
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
6
|
#
|
7
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
8
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -23,6 +23,7 @@ require 'spidr/agent'
|
|
23
23
|
require 'ronin/support/network/http'
|
24
24
|
require 'ronin/support/crypto/cert'
|
25
25
|
require 'ronin/support/text/patterns/source_code'
|
26
|
+
require 'ronin/support/text/patterns/network'
|
26
27
|
require 'ronin/support/encoding/js'
|
27
28
|
|
28
29
|
module Ronin
|
@@ -225,10 +226,17 @@ module Ronin
|
|
225
226
|
# @yield [comment]
|
226
227
|
# The given block will be passed every HTML comment.
|
227
228
|
#
|
229
|
+
# @yield [comment, page]
|
230
|
+
# If the block accepts two arguments, the HTML comment and the page
|
231
|
+
# that the comment was found on will be passed to the given block.
|
232
|
+
#
|
228
233
|
# @yieldparam [String] comment
|
229
234
|
# The HTML comment inner text, with leading and trailing whitespace
|
230
235
|
# stripped.
|
231
236
|
#
|
237
|
+
# @yieldparam [Spidr::Page] page
|
238
|
+
# The page that the HTML comment exists on.
|
239
|
+
#
|
232
240
|
# @example
|
233
241
|
# spider.every_html_comment do |comment|
|
234
242
|
# puts comment
|
@@ -236,7 +244,7 @@ module Ronin
|
|
236
244
|
#
|
237
245
|
# @api public
|
238
246
|
#
|
239
|
-
def every_html_comment
|
247
|
+
def every_html_comment(&block)
|
240
248
|
every_html_page do |page|
|
241
249
|
next unless page.doc
|
242
250
|
|
@@ -244,7 +252,11 @@ module Ronin
|
|
244
252
|
comment_text = comment.inner_text.strip
|
245
253
|
|
246
254
|
unless comment_text.empty?
|
247
|
-
|
255
|
+
if block.arity == 2
|
256
|
+
yield comment_text, page
|
257
|
+
else
|
258
|
+
yield comment_text
|
259
|
+
end
|
248
260
|
end
|
249
261
|
end
|
250
262
|
end
|
@@ -256,9 +268,17 @@ module Ronin
|
|
256
268
|
# @yield [js]
|
257
269
|
# The given block will be passed every piece of JavaScript source.
|
258
270
|
#
|
271
|
+
# @yield [js, page]
|
272
|
+
# If the block accepts two arguments, the JavaScript source and the
|
273
|
+
# page that the JavaScript source was found on will be passed to the
|
274
|
+
# given block.
|
275
|
+
#
|
259
276
|
# @yieldparam [String] js
|
260
277
|
# The JavaScript source code.
|
261
278
|
#
|
279
|
+
# @yieldparam [Spidr::Page] page
|
280
|
+
# The page that the JavaScript source was found in or on.
|
281
|
+
#
|
262
282
|
# @example
|
263
283
|
# spider.every_javascript do |js|
|
264
284
|
# puts js
|
@@ -266,7 +286,7 @@ module Ronin
|
|
266
286
|
#
|
267
287
|
# @api public
|
268
288
|
#
|
269
|
-
def every_javascript
|
289
|
+
def every_javascript(&block)
|
270
290
|
# yield inner text of every `<script type="text/javascript">` tag
|
271
291
|
# and every `.js` URL.
|
272
292
|
every_html_page do |page|
|
@@ -277,7 +297,11 @@ module Ronin
|
|
277
297
|
source.force_encoding(Encoding::UTF_8)
|
278
298
|
|
279
299
|
unless source.empty?
|
280
|
-
|
300
|
+
if block.arity == 2
|
301
|
+
yield source, page
|
302
|
+
else
|
303
|
+
yield source
|
304
|
+
end
|
281
305
|
end
|
282
306
|
end
|
283
307
|
end
|
@@ -286,7 +310,11 @@ module Ronin
|
|
286
310
|
source = page.body
|
287
311
|
source.force_encoding(Encoding::UTF_8)
|
288
312
|
|
289
|
-
|
313
|
+
if block.arity == 2
|
314
|
+
yield source, page
|
315
|
+
else
|
316
|
+
yield source
|
317
|
+
end
|
290
318
|
end
|
291
319
|
end
|
292
320
|
|
@@ -331,9 +359,17 @@ module Ronin
|
|
331
359
|
# The given block will be passed each JavaScript string with the quote
|
332
360
|
# marks removed.
|
333
361
|
#
|
362
|
+
# @yield [string, page]
|
363
|
+
# If the block accepts two arguments, the JavaScript string and the
|
364
|
+
# page that the JavaScript string was found on will be passed to the
|
365
|
+
# given block.
|
366
|
+
#
|
334
367
|
# @yieldparam [String] string
|
335
368
|
# The parsed contents of a JavaScript string.
|
336
369
|
#
|
370
|
+
# @yieldparam [Spidr::Page] page
|
371
|
+
# The page that the JavaScript string was found in or on.
|
372
|
+
#
|
337
373
|
# @example
|
338
374
|
# spider.every_javascript_string do |str|
|
339
375
|
# puts str
|
@@ -341,8 +377,8 @@ module Ronin
|
|
341
377
|
#
|
342
378
|
# @api public
|
343
379
|
#
|
344
|
-
def every_javascript_string
|
345
|
-
every_javascript do |js|
|
380
|
+
def every_javascript_string(&block)
|
381
|
+
every_javascript do |js,page|
|
346
382
|
scanner = StringScanner.new(js)
|
347
383
|
|
348
384
|
until scanner.eos?
|
@@ -351,8 +387,13 @@ module Ronin
|
|
351
387
|
case scanner.peek(1)
|
352
388
|
when '"', "'" # beginning of a quoted string
|
353
389
|
js_string = scanner.scan(Support::Text::Patterns::STRING)
|
390
|
+
string = Support::Encoding::JS.unquote(js_string)
|
354
391
|
|
355
|
-
|
392
|
+
if block.arity == 2
|
393
|
+
yield string, page
|
394
|
+
else
|
395
|
+
yield string
|
396
|
+
end
|
356
397
|
else
|
357
398
|
scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
|
358
399
|
scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
|
@@ -364,15 +405,200 @@ module Ronin
|
|
364
405
|
|
365
406
|
alias every_js_string every_javascript_string
|
366
407
|
|
408
|
+
# Regular expression that matches relative paths within JavaScript.
|
409
|
+
#
|
410
|
+
# @note
|
411
|
+
# This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
|
412
|
+
# but *not* `/foo`, `foo`, or `foo.`.
|
413
|
+
JAVASCRIPT_RELATIVE_PATH = %r{
|
414
|
+
\A
|
415
|
+
(?:
|
416
|
+
[^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
|
417
|
+
|
|
418
|
+
[^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
|
419
|
+
)
|
420
|
+
\z
|
421
|
+
}x
|
422
|
+
|
423
|
+
#
|
424
|
+
# Passes every JavaScript relative path string to the given block.
|
425
|
+
#
|
426
|
+
# @yield [string]
|
427
|
+
# The given block will be passed each JavaScript relative path string
|
428
|
+
# with the quote marks removed.
|
429
|
+
#
|
430
|
+
# @yield [string, page]
|
431
|
+
# If the block accepts two arguments, the JavaScript relative path
|
432
|
+
# string and the page that the JavaScript relative path string was
|
433
|
+
# found on will be passed to the given block.
|
434
|
+
#
|
435
|
+
# @yieldparam [String] string
|
436
|
+
# The parsed contents of a literal JavaScript relative path string.
|
437
|
+
#
|
438
|
+
# @yieldparam [Spidr::Page] page
|
439
|
+
# The page that the JavaScript relative path string was found in or
|
440
|
+
# on.
|
441
|
+
#
|
442
|
+
# @example
|
443
|
+
# spider.every_javascript_relative_path_string do |relative_path|
|
444
|
+
# puts relative_path
|
445
|
+
# end
|
446
|
+
#
|
447
|
+
# @api public
|
448
|
+
#
|
449
|
+
# @since 0.2.0
|
450
|
+
#
|
451
|
+
def every_javascript_relative_path_string(&block)
|
452
|
+
every_javascript_string do |string,page|
|
453
|
+
if string =~ JAVASCRIPT_RELATIVE_PATH
|
454
|
+
if block.arity == 2
|
455
|
+
yield string, page
|
456
|
+
else
|
457
|
+
yield string
|
458
|
+
end
|
459
|
+
end
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
alias every_js_relative_path_string every_javascript_relative_path_string
|
464
|
+
|
465
|
+
# Regular expression that matches absolute paths within JavaScript.
|
466
|
+
JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
|
467
|
+
|
468
|
+
#
|
469
|
+
# Passes every JavaScript absolute path string to the given block.
|
470
|
+
#
|
471
|
+
# @yield [string]
|
472
|
+
# The given block will be passed each JavaScript absolute path string
|
473
|
+
# with the quote marks removed.
|
474
|
+
#
|
475
|
+
# @yield [string, page]
|
476
|
+
# If the block accepts two arguments, the JavaScript absolute path
|
477
|
+
# string and the page that the JavaScript absolute path string was
|
478
|
+
# found on will be passed to the given block.
|
479
|
+
#
|
480
|
+
# @yieldparam [String] string
|
481
|
+
# The parsed contents of a literal JavaScript absolute path string.
|
482
|
+
#
|
483
|
+
# @yieldparam [Spidr::Page] page
|
484
|
+
# The page that the JavaScript absolute path string was found in or
|
485
|
+
# on.
|
486
|
+
#
|
487
|
+
# @example
|
488
|
+
# spider.every_javascript_absolute_path_string do |absolute_path|
|
489
|
+
# puts absolute_path
|
490
|
+
# end
|
491
|
+
#
|
492
|
+
# @api public
|
493
|
+
#
|
494
|
+
# @since 0.2.0
|
495
|
+
#
|
496
|
+
def every_javascript_absolute_path_string(&block)
|
497
|
+
every_javascript_string do |string,page|
|
498
|
+
if string =~ JAVASCRIPT_ABSOLUTE_PATH
|
499
|
+
if block.arity == 2
|
500
|
+
yield string, page
|
501
|
+
else
|
502
|
+
yield string
|
503
|
+
end
|
504
|
+
end
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
alias every_js_absolute_path_string every_javascript_absolute_path_string
|
509
|
+
|
510
|
+
#
|
511
|
+
# Passes every JavaScript path string to the given block.
|
512
|
+
#
|
513
|
+
# @yield [string]
|
514
|
+
# The given block will be passed each JavaScript path string with the
|
515
|
+
# quote marks removed.
|
516
|
+
#
|
517
|
+
# @yield [string, page]
|
518
|
+
# If the block accepts two arguments, the JavaScript path string and
|
519
|
+
# the page that the JavaScript path string was found on will be
|
520
|
+
# passed to the given block.
|
521
|
+
#
|
522
|
+
# @yieldparam [String] string
|
523
|
+
# The parsed contents of a literal JavaScript path string.
|
524
|
+
#
|
525
|
+
# @yieldparam [Spidr::Page] page
|
526
|
+
# The page that the JavaScript path string was found in or on.
|
527
|
+
#
|
528
|
+
# @example
|
529
|
+
# spider.every_javascript_path_string do |path|
|
530
|
+
# puts path
|
531
|
+
# end
|
532
|
+
#
|
533
|
+
# @api public
|
534
|
+
#
|
535
|
+
# @since 0.2.0
|
536
|
+
#
|
537
|
+
def every_javascript_path_string(&block)
|
538
|
+
every_javascript_relative_path_string(&block)
|
539
|
+
every_javascript_absolute_path_string(&block)
|
540
|
+
end
|
541
|
+
|
542
|
+
alias every_js_path_string every_javascript_path_string
|
543
|
+
|
544
|
+
#
|
545
|
+
# Passes every JavaScript URL string to the given block.
|
546
|
+
#
|
547
|
+
# @yield [string]
|
548
|
+
# The given block will be passed each JavaScript URL string with the
|
549
|
+
# quote marks removed.
|
550
|
+
#
|
551
|
+
# @yield [string, page]
|
552
|
+
# If the block accepts two arguments, the JavaScript URL string and
|
553
|
+
# the page that the JavaScript URL string was found on will be passed
|
554
|
+
# to the given block.
|
555
|
+
#
|
556
|
+
# @yieldparam [String] string
|
557
|
+
# The parsed contents of a literal JavaScript URL string.
|
558
|
+
#
|
559
|
+
# @yieldparam [Spidr::Page] page
|
560
|
+
# The page that the JavaScript URL string was found in or on.
|
561
|
+
#
|
562
|
+
# @example
|
563
|
+
# spider.every_javascript_url_string do |url|
|
564
|
+
# puts url
|
565
|
+
# end
|
566
|
+
#
|
567
|
+
# @api public
|
568
|
+
#
|
569
|
+
# @since 0.2.0
|
570
|
+
#
|
571
|
+
def every_javascript_url_string(&block)
|
572
|
+
every_javascript_string do |string,page|
|
573
|
+
if string =~ Support::Text::Patterns::URL
|
574
|
+
if block.arity == 2
|
575
|
+
yield string, page
|
576
|
+
else
|
577
|
+
yield string
|
578
|
+
end
|
579
|
+
end
|
580
|
+
end
|
581
|
+
end
|
582
|
+
|
583
|
+
alias every_js_url_string every_javascript_url_string
|
584
|
+
|
367
585
|
#
|
368
586
|
# Passes every JavaScript comment to the given block.
|
369
587
|
#
|
370
588
|
# @yield [comment]
|
371
589
|
# The given block will be passed each JavaScript comment.
|
372
590
|
#
|
591
|
+
# @yield [comment, page]
|
592
|
+
# If the block accepts two arguments, the JavaScript comment and the
|
593
|
+
# page that the JavaScript comment was found on will be passed to the
|
594
|
+
# given block.
|
595
|
+
#
|
373
596
|
# @yieldparam [String] comment
|
374
597
|
# The contents of a JavaScript comment.
|
375
598
|
#
|
599
|
+
# @yieldparam [Spidr::Page] page
|
600
|
+
# The page that the JavaScript comment was found in or on.
|
601
|
+
#
|
376
602
|
# @example
|
377
603
|
# spider.every_javascript_comment do |comment|
|
378
604
|
# puts comment
|
@@ -381,8 +607,14 @@ module Ronin
|
|
381
607
|
# @api public
|
382
608
|
#
|
383
609
|
def every_javascript_comment(&block)
|
384
|
-
every_javascript do |js|
|
385
|
-
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT
|
610
|
+
every_javascript do |js,page|
|
611
|
+
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
|
612
|
+
if block.arity == 2
|
613
|
+
yield comment, page
|
614
|
+
else
|
615
|
+
yield comment
|
616
|
+
end
|
617
|
+
end
|
386
618
|
end
|
387
619
|
end
|
388
620
|
|
@@ -394,9 +626,17 @@ module Ronin
|
|
394
626
|
# @yield [comment]
|
395
627
|
# The given block will be passed each HTML or JavaScript comment.
|
396
628
|
#
|
629
|
+
# @yield [comment, page]
|
630
|
+
# If the block accepts two arguments, the HTML or JavaScript comment
|
631
|
+
# and the page that the HTML/JavaScript comment was found on will be
|
632
|
+
# passed to the given block.
|
633
|
+
#
|
397
634
|
# @yieldparam [String] comment
|
398
635
|
# The contents of a HTML or JavaScript comment.
|
399
636
|
#
|
637
|
+
# @yieldparam [Spidr::Page] page
|
638
|
+
# The page that the HTML or JavaScript comment was found in or on.
|
639
|
+
#
|
400
640
|
# @example
|
401
641
|
# spider.every_comment do |comment|
|
402
642
|
# puts comment
|
data/lib/ronin/web/spider/exceptions.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
4
4
|
#
|
5
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
6
|
#
|
7
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
8
8
|
# it under the terms of the GNU Lesser General Public License as published
|
data/lib/ronin/web/spider/version.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
4
4
|
#
|
5
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
6
|
#
|
7
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
8
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,7 +22,7 @@ module Ronin
|
|
22
22
|
module Web
|
23
23
|
module Spider
|
24
24
|
# ronin-web-spider version
|
25
|
-
VERSION = '0.1.1'
|
25
|
+
VERSION = '0.2.0.rc1'
|
26
26
|
end
|
27
27
|
end
|
28
28
|
end
|
data/lib/ronin/web/spider.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
4
4
|
#
|
5
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
6
|
#
|
7
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
8
8
|
# it under the terms of the GNU Lesser General Public License as published
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ronin-web-spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-06-
|
11
|
+
date: 2024-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: spidr
|