ronin-web-spider 0.1.1 → 0.2.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
4
- data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
3
+ metadata.gz: 7aedd94cd4b3f96a07824419722763a91374590e6944ab3ae58c6ff1432837ca
4
+ data.tar.gz: 67d2d63b5468838f60f8b3aa3975dcb8f0842f66ca1d776384fb29c6a7f6a8ca
5
5
  SHA512:
6
- metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
7
- data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
6
+ metadata.gz: b9e7588f16084226b812db561261122af81fea2bd2d4c1b529c7cc1763e48060c90f716475e930029159f9cd1f886866f47ea03be8c4131c850af0b6d73da8c0
7
+ data.tar.gz: 253283e0e5f8046d4d41fb9bf445933017e81b92e72c817aa0d4c6850c4952e0ec687e727196707f71e0343a97ba0f492a409e070a3c00e8d0fd8640feda75d5
data/ChangeLog.md CHANGED
@@ -1,3 +1,18 @@
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
1
16
  ### 0.1.1 / 2024-06-19
2
17
 
3
18
  * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
data/README.md CHANGED
@@ -304,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
304
304
  end
305
305
  ```
306
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
307
317
  Print every JavaScript comment:
308
318
 
309
319
  ```ruby
@@ -390,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
390
400
 
391
401
  ## License
392
402
 
393
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
394
404
 
395
405
  ronin-web-spider is free software: you can redistribute it and/or modify
396
406
  it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -23,6 +23,7 @@ require 'spidr/agent'
23
23
  require 'ronin/support/network/http'
24
24
  require 'ronin/support/crypto/cert'
25
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
26
27
  require 'ronin/support/encoding/js'
27
28
 
28
29
  module Ronin
@@ -225,10 +226,17 @@ module Ronin
225
226
  # @yield [comment]
226
227
  # The given block will be passed every HTML comment.
227
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
228
233
  # @yieldparam [String] comment
229
234
  # The HTML comment inner text, with leading and trailing whitespace
230
235
  # stripped.
231
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
232
240
  # @example
233
241
  # spider.every_html_comment do |comment|
234
242
  # puts comment
@@ -236,7 +244,7 @@ module Ronin
236
244
  #
237
245
  # @api public
238
246
  #
239
- def every_html_comment
247
+ def every_html_comment(&block)
240
248
  every_html_page do |page|
241
249
  next unless page.doc
242
250
 
@@ -244,7 +252,11 @@ module Ronin
244
252
  comment_text = comment.inner_text.strip
245
253
 
246
254
  unless comment_text.empty?
247
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
248
260
  end
249
261
  end
250
262
  end
@@ -256,9 +268,17 @@ module Ronin
256
268
  # @yield [js]
257
269
  # The given block will be passed every piece of JavaScript source.
258
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
259
276
  # @yieldparam [String] js
260
277
  # The JavaScript source code.
261
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
262
282
  # @example
263
283
  # spider.every_javascript do |js|
264
284
  # puts js
@@ -266,7 +286,7 @@ module Ronin
266
286
  #
267
287
  # @api public
268
288
  #
269
- def every_javascript
289
+ def every_javascript(&block)
270
290
  # yield inner text of every `<script type="text/javascript">` tag
271
291
  # and every `.js` URL.
272
292
  every_html_page do |page|
@@ -277,7 +297,11 @@ module Ronin
277
297
  source.force_encoding(Encoding::UTF_8)
278
298
 
279
299
  unless source.empty?
280
- yield source
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
281
305
  end
282
306
  end
283
307
  end
@@ -286,7 +310,11 @@ module Ronin
286
310
  source = page.body
287
311
  source.force_encoding(Encoding::UTF_8)
288
312
 
289
- yield source
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
290
318
  end
291
319
  end
292
320
 
@@ -297,7 +325,7 @@ module Ronin
297
325
  # @api private
298
326
  #
299
327
  # @since 0.1.1
300
- JAVASCRIPT_INLINE_REGEX = %r{
328
+ JAVASCRIPT_INLINE_REGEX_REGEX = %r{
301
329
  (?# match before the regex to avoid matching division operators )
302
330
  (?:[\{\[\(;:,]\s*|=\s*)
303
331
  /
@@ -322,7 +350,7 @@ module Ronin
322
350
  # @api private
323
351
  #
324
352
  # @since 0.1.1
325
- JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
353
+ JAVASCRIPT_TEMPLATE_LITERAL_REGEX = /`(?:\\`|[^`])+`/m
326
354
 
327
355
  #
328
356
  # Passes every JavaScript string value to the given block.
@@ -331,9 +359,17 @@ module Ronin
331
359
  # The given block will be passed each JavaScript string with the quote
332
360
  # marks removed.
333
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
334
367
  # @yieldparam [String] string
335
368
  # The parsed contents of a JavaScript string.
336
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
337
373
  # @example
338
374
  # spider.every_javascript_string do |str|
339
375
  # puts str
@@ -341,8 +377,8 @@ module Ronin
341
377
  #
342
378
  # @api public
343
379
  #
344
- def every_javascript_string
345
- every_javascript do |js|
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
346
382
  scanner = StringScanner.new(js)
347
383
 
348
384
  until scanner.eos?
@@ -351,11 +387,16 @@ module Ronin
351
387
  case scanner.peek(1)
352
388
  when '"', "'" # beginning of a quoted string
353
389
  js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
354
391
 
355
- yield Support::Encoding::JS.unquote(js_string)
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
356
397
  else
357
- scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
- scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
398
+ scanner.skip(JAVASCRIPT_INLINE_REGEX_REGEX) ||
399
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL_REGEX) ||
359
400
  scanner.getch
360
401
  end
361
402
  end
@@ -364,15 +405,215 @@ module Ronin
364
405
 
365
406
  alias every_js_string every_javascript_string
366
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ #
414
+ # @api private
415
+ #
416
+ # @since 0.2.0
417
+ JAVASCRIPT_RELATIVE_PATH_REGEX = %r{
418
+ \A
419
+ (?:
420
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
421
+ |
422
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
423
+ )
424
+ \z
425
+ }x
426
+
427
+ #
428
+ # Passes every JavaScript relative path string to the given block.
429
+ #
430
+ # @yield [string]
431
+ # The given block will be passed each JavaScript relative path string
432
+ # with the quote marks removed.
433
+ #
434
+ # @yield [string, page]
435
+ # If the block accepts two arguments, the JavaScript relative path
436
+ # string and the page that the JavaScript relative path string was
437
+ # found on will be passed to the given block.
438
+ #
439
+ # @yieldparam [String] string
440
+ # The parsed contents of a literal JavaScript relative path string.
441
+ #
442
+ # @yieldparam [Spidr::Page] page
443
+ # The page that the JavaScript relative path string was found in or
444
+ # on.
445
+ #
446
+ # @example
447
+ # spider.every_javascript_relative_path_string do |relative_path|
448
+ # puts relative_path
449
+ # end
450
+ #
451
+ # @api public
452
+ #
453
+ # @since 0.2.0
454
+ #
455
+ def every_javascript_relative_path_string(&block)
456
+ every_javascript_string do |string,page|
457
+ if string =~ JAVASCRIPT_RELATIVE_PATH_REGEX
458
+ if block.arity == 2
459
+ yield string, page
460
+ else
461
+ yield string
462
+ end
463
+ end
464
+ end
465
+ end
466
+
467
+ alias every_js_relative_path_string every_javascript_relative_path_string
468
+
469
+ # Regular expression that matches absolute paths within JavaScript.
470
+ #
471
+ # @api private
472
+ #
473
+ # @since 0.2.0
474
+ JAVASCRIPT_ABSOLUTE_PATH_REGEX = %r{\A(?:/[^/\\ ]+)+\z}
475
+
476
+ #
477
+ # Passes every JavaScript absolute path string to the given block.
478
+ #
479
+ # @yield [string]
480
+ # The given block will be passed each JavaScript absolute path string
481
+ # with the quote marks removed.
482
+ #
483
+ # @yield [string, page]
484
+ # If the block accepts two arguments, the JavaScript absolute path
485
+ # string and the page that the JavaScript absolute path string was
486
+ # found on will be passed to the given block.
487
+ #
488
+ # @yieldparam [String] string
489
+ # The parsed contents of a literal JavaScript absolute path string.
490
+ #
491
+ # @yieldparam [Spidr::Page] page
492
+ # The page that the JavaScript absolute path string was found in or
493
+ # on.
494
+ #
495
+ # @example
496
+ # spider.every_javascript_absolute_path_string do |absolute_path|
497
+ # puts absolute_path
498
+ # end
499
+ #
500
+ # @api public
501
+ #
502
+ # @since 0.2.0
503
+ #
504
+ def every_javascript_absolute_path_string(&block)
505
+ every_javascript_string do |string,page|
506
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH_REGEX
507
+ if block.arity == 2
508
+ yield string, page
509
+ else
510
+ yield string
511
+ end
512
+ end
513
+ end
514
+ end
515
+
516
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
517
+
518
+ #
519
+ # Passes every JavaScript path string to the given block.
520
+ #
521
+ # @yield [string]
522
+ # The given block will be passed each JavaScript path string with the
523
+ # quote marks removed.
524
+ #
525
+ # @yield [string, page]
526
+ # If the block accepts two arguments, the JavaScript path string and
527
+ # the page that the JavaScript path string was found on will be
528
+ # passed to the given block.
529
+ #
530
+ # @yieldparam [String] string
531
+ # The parsed contents of a literal JavaScript path string.
532
+ #
533
+ # @yieldparam [Spidr::Page] page
534
+ # The page that the JavaScript path string was found in or on.
535
+ #
536
+ # @example
537
+ # spider.every_javascript_path_string do |path|
538
+ # puts path
539
+ # end
540
+ #
541
+ # @api public
542
+ #
543
+ # @since 0.2.0
544
+ #
545
+ def every_javascript_path_string(&block)
546
+ every_javascript_relative_path_string(&block)
547
+ every_javascript_absolute_path_string(&block)
548
+ end
549
+
550
+ alias every_js_path_string every_javascript_path_string
551
+
552
+ # Regular expression for identifying URLs.
553
+ #
554
+ # @api private
555
+ #
556
+ # @since 0.2.0
557
+ URL_REGEX = /\A#{Support::Text::Patterns::URL}\z/
558
+
559
+ #
560
+ # Passes every JavaScript URL string to the given block.
561
+ #
562
+ # @yield [string]
563
+ # The given block will be passed each JavaScript URL string with the
564
+ # quote marks removed.
565
+ #
566
+ # @yield [string, page]
567
+ # If the block accepts two arguments, the JavaScript URL string and
568
+ # the page that the JavaScript URL string was found on will be passed
569
+ # to the given block.
570
+ #
571
+ # @yieldparam [String] string
572
+ # The parsed contents of a literal JavaScript URL string.
573
+ #
574
+ # @yieldparam [Spidr::Page] page
575
+ # The page that the JavaScript URL string was found in or on.
576
+ #
577
+ # @example
578
+ # spider.every_javascript_url_string do |url|
579
+ # puts url
580
+ # end
581
+ #
582
+ # @api public
583
+ #
584
+ # @since 0.2.0
585
+ #
586
+ def every_javascript_url_string(&block)
587
+ every_javascript_string do |string,page|
588
+ if string =~ URL_REGEX
589
+ if block.arity == 2
590
+ yield string, page
591
+ else
592
+ yield string
593
+ end
594
+ end
595
+ end
596
+ end
597
+
598
+ alias every_js_url_string every_javascript_url_string
599
+
367
600
  #
368
601
  # Passes every JavaScript comment to the given block.
369
602
  #
370
603
  # @yield [comment]
371
604
  # The given block will be passed each JavaScript comment.
372
605
  #
606
+ # @yield [comment, page]
607
+ # If the block accepts two arguments, the JavaScript comment and the
608
+ # page that the JavaScript comment was found on will be passed to the
609
+ # given block.
610
+ #
373
611
  # @yieldparam [String] comment
374
612
  # The contents of a JavaScript comment.
375
613
  #
614
+ # @yieldparam [Spidr::Page] page
615
+ # The page that the JavaScript comment was found in or on.
616
+ #
376
617
  # @example
377
618
  # spider.every_javascript_comment do |comment|
378
619
  # puts comment
@@ -381,8 +622,14 @@ module Ronin
381
622
  # @api public
382
623
  #
383
624
  def every_javascript_comment(&block)
384
- every_javascript do |js|
385
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
625
+ every_javascript do |js,page|
626
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
627
+ if block.arity == 2
628
+ yield comment, page
629
+ else
630
+ yield comment
631
+ end
632
+ end
386
633
  end
387
634
  end
388
635
 
@@ -394,9 +641,17 @@ module Ronin
394
641
  # @yield [comment]
395
642
  # The given block will be passed each HTML or JavaScript comment.
396
643
  #
644
+ # @yield [comment, page]
645
+ # If the block accepts two arguments, the HTML or JavaScript comment
646
+ # and the page that the HTML/JavaScript comment was found on will be
647
+ # passed to the given block.
648
+ #
397
649
  # @yieldparam [String] comment
398
650
  # The contents of an HTML or JavaScript comment.
399
651
  #
652
+ # @yieldparam [Spidr::Page] page
653
+ # The page that the HTML or JavaScript comment was found in or on.
654
+ #
400
655
  # @example
401
656
  # spider.every_comment do |comment|
402
657
  # puts comment
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,7 +22,7 @@ module Ronin
22
22
  module Web
23
23
  module Spider
24
24
  # ronin-web-spider version
25
- VERSION = '0.1.1'
25
+ VERSION = '0.2.0.rc2'
26
26
  end
27
27
  end
28
28
  end
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-20 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr