ronin-web-spider 0.1.1 → 0.2.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
4
- data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
3
+ metadata.gz: 7aedd94cd4b3f96a07824419722763a91374590e6944ab3ae58c6ff1432837ca
4
+ data.tar.gz: 67d2d63b5468838f60f8b3aa3975dcb8f0842f66ca1d776384fb29c6a7f6a8ca
5
5
  SHA512:
6
- metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
7
- data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
6
+ metadata.gz: b9e7588f16084226b812db561261122af81fea2bd2d4c1b529c7cc1763e48060c90f716475e930029159f9cd1f886866f47ea03be8c4131c850af0b6d73da8c0
7
+ data.tar.gz: 253283e0e5f8046d4d41fb9bf445933017e81b92e72c817aa0d4c6850c4952e0ec687e727196707f71e0343a97ba0f492a409e070a3c00e8d0fd8640feda75d5
data/ChangeLog.md CHANGED
@@ -1,3 +1,18 @@
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
1
16
  ### 0.1.1 / 2024-06-19
2
17
 
3
18
  * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
data/README.md CHANGED
@@ -304,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
304
304
  end
305
305
  ```
306
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
307
317
  Print every JavaScript comment:
308
318
 
309
319
  ```ruby
@@ -390,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
390
400
 
391
401
  ## License
392
402
 
393
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
394
404
 
395
405
  ronin-web-spider is free software: you can redistribute it and/or modify
396
406
  it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -23,6 +23,7 @@ require 'spidr/agent'
23
23
  require 'ronin/support/network/http'
24
24
  require 'ronin/support/crypto/cert'
25
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
26
27
  require 'ronin/support/encoding/js'
27
28
 
28
29
  module Ronin
@@ -225,10 +226,17 @@ module Ronin
225
226
  # @yield [comment]
226
227
  # The given block will be passed every HTML comment.
227
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
228
233
  # @yieldparam [String] comment
229
234
  # The HTML comment inner text, with leading and trailing whitespace
230
235
  # stripped.
231
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
232
240
  # @example
233
241
  # spider.every_html_comment do |comment|
234
242
  # puts comment
@@ -236,7 +244,7 @@ module Ronin
236
244
  #
237
245
  # @api public
238
246
  #
239
- def every_html_comment
247
+ def every_html_comment(&block)
240
248
  every_html_page do |page|
241
249
  next unless page.doc
242
250
 
@@ -244,7 +252,11 @@ module Ronin
244
252
  comment_text = comment.inner_text.strip
245
253
 
246
254
  unless comment_text.empty?
247
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
248
260
  end
249
261
  end
250
262
  end
@@ -256,9 +268,17 @@ module Ronin
256
268
  # @yield [js]
257
269
  # The given block will be passed every piece of JavaScript source.
258
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
259
276
  # @yieldparam [String] js
260
277
  # The JavaScript source code.
261
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
262
282
  # @example
263
283
  # spider.every_javascript do |js|
264
284
  # puts js
@@ -266,7 +286,7 @@ module Ronin
266
286
  #
267
287
  # @api public
268
288
  #
269
- def every_javascript
289
+ def every_javascript(&block)
270
290
  # yield inner text of every `<script type="text/javascript">` tag
271
291
  # and every `.js` URL.
272
292
  every_html_page do |page|
@@ -277,7 +297,11 @@ module Ronin
277
297
  source.force_encoding(Encoding::UTF_8)
278
298
 
279
299
  unless source.empty?
280
- yield source
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
281
305
  end
282
306
  end
283
307
  end
@@ -286,7 +310,11 @@ module Ronin
286
310
  source = page.body
287
311
  source.force_encoding(Encoding::UTF_8)
288
312
 
289
- yield source
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
290
318
  end
291
319
  end
292
320
 
@@ -297,7 +325,7 @@ module Ronin
297
325
  # @api private
298
326
  #
299
327
  # @since 0.1.1
300
- JAVASCRIPT_INLINE_REGEX = %r{
328
+ JAVASCRIPT_INLINE_REGEX_REGEX = %r{
301
329
  (?# match before the regex to avoid matching division operators )
302
330
  (?:[\{\[\(;:,]\s*|=\s*)
303
331
  /
@@ -322,7 +350,7 @@ module Ronin
322
350
  # @api private
323
351
  #
324
352
  # @since 0.1.1
325
- JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
353
+ JAVASCRIPT_TEMPLATE_LITERAL_REGEX = /`(?:\\`|[^`])+`/m
326
354
 
327
355
  #
328
356
  # Passes every JavaScript string value to the given block.
@@ -331,9 +359,17 @@ module Ronin
331
359
  # The given block will be passed each JavaScript string with the quote
332
360
  # marks removed.
333
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
334
367
  # @yieldparam [String] string
335
368
  # The parsed contents of a JavaScript string.
336
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
337
373
  # @example
338
374
  # spider.every_javascript_string do |str|
339
375
  # puts str
@@ -341,8 +377,8 @@ module Ronin
341
377
  #
342
378
  # @api public
343
379
  #
344
- def every_javascript_string
345
- every_javascript do |js|
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
346
382
  scanner = StringScanner.new(js)
347
383
 
348
384
  until scanner.eos?
@@ -351,11 +387,16 @@ module Ronin
351
387
  case scanner.peek(1)
352
388
  when '"', "'" # beginning of a quoted string
353
389
  js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
354
391
 
355
- yield Support::Encoding::JS.unquote(js_string)
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
356
397
  else
357
- scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
- scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
398
+ scanner.skip(JAVASCRIPT_INLINE_REGEX_REGEX) ||
399
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL_REGEX) ||
359
400
  scanner.getch
360
401
  end
361
402
  end
@@ -364,15 +405,215 @@ module Ronin
364
405
 
365
406
  alias every_js_string every_javascript_string
366
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ #
414
+ # @api private
415
+ #
416
+ # @since 0.2.0
417
+ JAVASCRIPT_RELATIVE_PATH_REGEX = %r{
418
+ \A
419
+ (?:
420
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
421
+ |
422
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
423
+ )
424
+ \z
425
+ }x
426
+
427
+ #
428
+ # Passes every JavaScript relative path string to the given block.
429
+ #
430
+ # @yield [string]
431
+ # The given block will be passed each JavaScript relative path string
432
+ # with the quote marks removed.
433
+ #
434
+ # @yield [string, page]
435
+ # If the block accepts two arguments, the JavaScript relative path
436
+ # string and the page that the JavaScript relative path string was
437
+ # found on will be passed to the given block.
438
+ #
439
+ # @yieldparam [String] string
440
+ # The parsed contents of a literal JavaScript relative path string.
441
+ #
442
+ # @yieldparam [Spidr::Page] page
443
+ # The page that the JavaScript relative path string was found in or
444
+ # on.
445
+ #
446
+ # @example
447
+ # spider.every_javascript_relative_path_string do |relative_path|
448
+ # puts relative_path
449
+ # end
450
+ #
451
+ # @api public
452
+ #
453
+ # @since 0.2.0
454
+ #
455
+ def every_javascript_relative_path_string(&block)
456
+ every_javascript_string do |string,page|
457
+ if string =~ JAVASCRIPT_RELATIVE_PATH_REGEX
458
+ if block.arity == 2
459
+ yield string, page
460
+ else
461
+ yield string
462
+ end
463
+ end
464
+ end
465
+ end
466
+
467
+ alias every_js_relative_path_string every_javascript_relative_path_string
468
+
469
+ # Regular expression that matches absolute paths within JavaScript.
470
+ #
471
+ # @api private
472
+ #
473
+ # @since 0.2.0
474
+ JAVASCRIPT_ABSOLUTE_PATH_REGEX = %r{\A(?:/[^/\\ ]+)+\z}
475
+
476
+ #
477
+ # Passes every JavaScript absolute path string to the given block.
478
+ #
479
+ # @yield [string]
480
+ # The given block will be passed each JavaScript absolute path string
481
+ # with the quote marks removed.
482
+ #
483
+ # @yield [string, page]
484
+ # If the block accepts two arguments, the JavaScript absolute path
485
+ # string and the page that the JavaScript absolute path string was
486
+ # found on will be passed to the given block.
487
+ #
488
+ # @yieldparam [String] string
489
+ # The parsed contents of a literal JavaScript absolute path string.
490
+ #
491
+ # @yieldparam [Spidr::Page] page
492
+ # The page that the JavaScript absolute path string was found in or
493
+ # on.
494
+ #
495
+ # @example
496
+ # spider.every_javascript_absolute_path_string do |absolute_path|
497
+ # puts absolute_path
498
+ # end
499
+ #
500
+ # @api public
501
+ #
502
+ # @since 0.2.0
503
+ #
504
+ def every_javascript_absolute_path_string(&block)
505
+ every_javascript_string do |string,page|
506
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH_REGEX
507
+ if block.arity == 2
508
+ yield string, page
509
+ else
510
+ yield string
511
+ end
512
+ end
513
+ end
514
+ end
515
+
516
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
517
+
518
+ #
519
+ # Passes every JavaScript path string to the given block.
520
+ #
521
+ # @yield [string]
522
+ # The given block will be passed each JavaScript path string with the
523
+ # quote marks removed.
524
+ #
525
+ # @yield [string, page]
526
+ # If the block accepts two arguments, the JavaScript path string and
527
+ # the page that the JavaScript path string was found on will be
528
+ # passed to the given block.
529
+ #
530
+ # @yieldparam [String] string
531
+ # The parsed contents of a literal JavaScript path string.
532
+ #
533
+ # @yieldparam [Spidr::Page] page
534
+ # The page that the JavaScript path string was found in or on.
535
+ #
536
+ # @example
537
+ # spider.every_javascript_path_string do |path|
538
+ # puts path
539
+ # end
540
+ #
541
+ # @api public
542
+ #
543
+ # @since 0.2.0
544
+ #
545
+ def every_javascript_path_string(&block)
546
+ every_javascript_relative_path_string(&block)
547
+ every_javascript_absolute_path_string(&block)
548
+ end
549
+
550
+ alias every_js_path_string every_javascript_path_string
551
+
552
+ # Regular expression for identifying URLs.
553
+ #
554
+ # @api private
555
+ #
556
+ # @since 0.2.0
557
+ URL_REGEX = /\A#{Support::Text::Patterns::URL}\z/
558
+
559
+ #
560
+ # Passes every JavaScript URL string to the given block.
561
+ #
562
+ # @yield [string]
563
+ # The given block will be passed each JavaScript URL string with the
564
+ # quote marks removed.
565
+ #
566
+ # @yield [string, page]
567
+ # If the block accepts two arguments, the JavaScript URL string and
568
+ # the page that the JavaScript URL string was found on will be passed
569
+ # to the given block.
570
+ #
571
+ # @yieldparam [String] string
572
+ # The parsed contents of a literal JavaScript URL string.
573
+ #
574
+ # @yieldparam [Spidr::Page] page
575
+ # The page that the JavaScript URL string was found in or on.
576
+ #
577
+ # @example
578
+ # spider.every_javascript_url_string do |url|
579
+ # puts url
580
+ # end
581
+ #
582
+ # @api public
583
+ #
584
+ # @since 0.2.0
585
+ #
586
+ def every_javascript_url_string(&block)
587
+ every_javascript_string do |string,page|
588
+ if string =~ URL_REGEX
589
+ if block.arity == 2
590
+ yield string, page
591
+ else
592
+ yield string
593
+ end
594
+ end
595
+ end
596
+ end
597
+
598
+ alias every_js_url_string every_javascript_url_string
599
+
367
600
  #
368
601
  # Passes every JavaScript comment to the given block.
369
602
  #
370
603
  # @yield [comment]
371
604
  # The given block will be passed each JavaScript comment.
372
605
  #
606
+ # @yield [comment, page]
607
+ # If the block accepts two arguments, the JavaScript comment and the
608
+ # page that the JavaScript comment was found on will be passed to the
609
+ # given block.
610
+ #
373
611
  # @yieldparam [String] comment
374
612
  # The contents of a JavaScript comment.
375
613
  #
614
+ # @yieldparam [Spidr::Page] page
615
+ # The page that the JavaScript comment was found in or on.
616
+ #
376
617
  # @example
377
618
  # spider.every_javascript_comment do |comment|
378
619
  # puts comment
@@ -381,8 +622,14 @@ module Ronin
381
622
  # @api public
382
623
  #
383
624
  def every_javascript_comment(&block)
384
- every_javascript do |js|
385
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
625
+ every_javascript do |js,page|
626
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
627
+ if block.arity == 2
628
+ yield comment, page
629
+ else
630
+ yield comment
631
+ end
632
+ end
386
633
  end
387
634
  end
388
635
 
@@ -394,9 +641,17 @@ module Ronin
394
641
  # @yield [comment]
395
642
  # The given block will be passed each HTML or JavaScript comment.
396
643
  #
644
+ # @yield [comment, page]
645
+ # If the block accepts two arguments, the HTML or JavaScript comment
646
+ # and the page that the HTML/JavaScript comment was found on will be
647
+ # passed to the given block.
648
+ #
397
649
  # @yieldparam [String] comment
398
650
  # The contents of a HTML or JavaScript comment.
399
651
  #
652
+ # @yieldparam [Spidr::Page] page
653
+ # The page that the HTML or JavaScript comment was found in or on.
654
+ #
400
655
  # @example
401
656
  # spider.every_comment do |comment|
402
657
  # puts comment
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,7 +22,7 @@ module Ronin
22
22
  module Web
23
23
  module Spider
24
24
  # ronin-web-spider version
25
- VERSION = '0.1.1'
25
+ VERSION = '0.2.0.rc2'
26
26
  end
27
27
  end
28
28
  end
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-20 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr