ronin-web-spider 0.1.1 → 0.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
4
- data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
3
+ metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
4
+ data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
5
5
  SHA512:
6
- metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
7
- data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
6
+ metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
7
+ data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
data/ChangeLog.md CHANGED
@@ -1,3 +1,18 @@
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
1
16
  ### 0.1.1 / 2024-06-19
2
17
 
3
18
  * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
data/README.md CHANGED
@@ -304,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
304
304
  end
305
305
  ```
306
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
307
317
  Print every JavaScript comment:
308
318
 
309
319
  ```ruby
@@ -390,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
390
400
 
391
401
  ## License
392
402
 
393
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
394
404
 
395
405
  ronin-web-spider is free software: you can redistribute it and/or modify
396
406
  it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -23,6 +23,7 @@ require 'spidr/agent'
23
23
  require 'ronin/support/network/http'
24
24
  require 'ronin/support/crypto/cert'
25
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
26
27
  require 'ronin/support/encoding/js'
27
28
 
28
29
  module Ronin
@@ -225,10 +226,17 @@ module Ronin
225
226
  # @yield [comment]
226
227
  # The given block will be passed every HTML comment.
227
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
228
233
  # @yieldparam [String] comment
229
234
  # The HTML comment inner text, with leading and trailing whitespace
230
235
  # stripped.
231
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
232
240
  # @example
233
241
  # spider.every_html_comment do |comment|
234
242
  # puts comment
@@ -236,7 +244,7 @@ module Ronin
236
244
  #
237
245
  # @api public
238
246
  #
239
- def every_html_comment
247
+ def every_html_comment(&block)
240
248
  every_html_page do |page|
241
249
  next unless page.doc
242
250
 
@@ -244,7 +252,11 @@ module Ronin
244
252
  comment_text = comment.inner_text.strip
245
253
 
246
254
  unless comment_text.empty?
247
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
248
260
  end
249
261
  end
250
262
  end
@@ -256,9 +268,17 @@ module Ronin
256
268
  # @yield [js]
257
269
  # The given block will be passed every piece of JavaScript source.
258
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
259
276
  # @yieldparam [String] js
260
277
  # The JavaScript source code.
261
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
262
282
  # @example
263
283
  # spider.every_javascript do |js|
264
284
  # puts js
@@ -266,7 +286,7 @@ module Ronin
266
286
  #
267
287
  # @api public
268
288
  #
269
- def every_javascript
289
+ def every_javascript(&block)
270
290
  # yield inner text of every `<script type="text/javascript">` tag
271
291
  # and every `.js` URL.
272
292
  every_html_page do |page|
@@ -277,7 +297,11 @@ module Ronin
277
297
  source.force_encoding(Encoding::UTF_8)
278
298
 
279
299
  unless source.empty?
280
- yield source
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
281
305
  end
282
306
  end
283
307
  end
@@ -286,7 +310,11 @@ module Ronin
286
310
  source = page.body
287
311
  source.force_encoding(Encoding::UTF_8)
288
312
 
289
- yield source
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
290
318
  end
291
319
  end
292
320
 
@@ -331,9 +359,17 @@ module Ronin
331
359
  # The given block will be passed each JavaScript string with the quote
332
360
  # marks removed.
333
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
334
367
  # @yieldparam [String] string
335
368
  # The parsed contents of a JavaScript string.
336
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
337
373
  # @example
338
374
  # spider.every_javascript_string do |str|
339
375
  # puts str
@@ -341,8 +377,8 @@ module Ronin
341
377
  #
342
378
  # @api public
343
379
  #
344
- def every_javascript_string
345
- every_javascript do |js|
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
346
382
  scanner = StringScanner.new(js)
347
383
 
348
384
  until scanner.eos?
@@ -351,8 +387,13 @@ module Ronin
351
387
  case scanner.peek(1)
352
388
  when '"', "'" # beginning of a quoted string
353
389
  js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
354
391
 
355
- yield Support::Encoding::JS.unquote(js_string)
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
356
397
  else
357
398
  scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
399
  scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
@@ -364,15 +405,200 @@ module Ronin
364
405
 
365
406
  alias every_js_string every_javascript_string
366
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ JAVASCRIPT_RELATIVE_PATH = %r{
414
+ \A
415
+ (?:
416
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
417
+ |
418
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
419
+ )
420
+ \z
421
+ }x
422
+
423
+ #
424
+ # Passes every JavaScript relative path string to the given block.
425
+ #
426
+ # @yield [string]
427
+ # The given block will be passed each JavaScript relative path string
428
+ # with the quote marks removed.
429
+ #
430
+ # @yield [string, page]
431
+ # If the block accepts two arguments, the JavaScript relative path
432
+ # string and the page that the JavaScript relative path string was
433
+ # found on will be passed to the given block.
434
+ #
435
+ # @yieldparam [String] string
436
+ # The parsed contents of a literal JavaScript relative path string.
437
+ #
438
+ # @yieldparam [Spidr::Page] page
439
+ # The page that the JavaScript relative path string was found in or
440
+ # on.
441
+ #
442
+ # @example
443
+ # spider.every_javascript_relative_path_string do |relative_path|
444
+ # puts relative_path
445
+ # end
446
+ #
447
+ # @api public
448
+ #
449
+ # @since 0.2.0
450
+ #
451
+ def every_javascript_relative_path_string(&block)
452
+ every_javascript_string do |string,page|
453
+ if string =~ JAVASCRIPT_RELATIVE_PATH
454
+ if block.arity == 2
455
+ yield string, page
456
+ else
457
+ yield string
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ alias every_js_relative_path_string every_javascript_relative_path_string
464
+
465
+ # Regular expression that matches absolute paths within JavaScript.
466
+ JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
467
+
468
+ #
469
+ # Passes every JavaScript absolute path string to the given block.
470
+ #
471
+ # @yield [string]
472
+ # The given block will be passed each JavaScript absolute path string
473
+ # with the quote marks removed.
474
+ #
475
+ # @yield [string, page]
476
+ # If the block accepts two arguments, the JavaScript absolute path
477
+ # string and the page that the JavaScript absolute path string was
478
+ # found on will be passed to the given block.
479
+ #
480
+ # @yieldparam [String] string
481
+ # The parsed contents of a literal JavaScript absolute path string.
482
+ #
483
+ # @yieldparam [Spidr::Page] page
484
+ # The page that the JavaScript absolute path string was found in or
485
+ # on.
486
+ #
487
+ # @example
488
+ # spider.every_javascript_absolute_path_string do |absolute_path|
489
+ # puts absolute_path
490
+ # end
491
+ #
492
+ # @api public
493
+ #
494
+ # @since 0.2.0
495
+ #
496
+ def every_javascript_absolute_path_string(&block)
497
+ every_javascript_string do |string,page|
498
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH
499
+ if block.arity == 2
500
+ yield string, page
501
+ else
502
+ yield string
503
+ end
504
+ end
505
+ end
506
+ end
507
+
508
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
509
+
510
+ #
511
+ # Passes every JavaScript path string to the given block.
512
+ #
513
+ # @yield [string]
514
+ # The given block will be passed each JavaScript path string with the
515
+ # quote marks removed.
516
+ #
517
+ # @yield [string, page]
518
+ # If the block accepts two arguments, the JavaScript path string and
519
+ # the page that the JavaScript path string was found on will be
520
+ # passed to the given block.
521
+ #
522
+ # @yieldparam [String] string
523
+ # The parsed contents of a literal JavaScript path string.
524
+ #
525
+ # @yieldparam [Spidr::Page] page
526
+ # The page that the JavaScript path string was found in or on.
527
+ #
528
+ # @example
529
+ # spider.every_javascript_path_string do |path|
530
+ # puts path
531
+ # end
532
+ #
533
+ # @api public
534
+ #
535
+ # @since 0.2.0
536
+ #
537
+ def every_javascript_path_string(&block)
538
+ every_javascript_relative_path_string(&block)
539
+ every_javascript_absolute_path_string(&block)
540
+ end
541
+
542
+ alias every_js_path_string every_javascript_path_string
543
+
544
+ #
545
+ # Passes every JavaScript URL string to the given block.
546
+ #
547
+ # @yield [string]
548
+ # The given block will be passed each JavaScript URL string with the
549
+ # quote marks removed.
550
+ #
551
+ # @yield [string, page]
552
+ # If the block accepts two arguments, the JavaScript URL string and
553
+ # the page that the JavaScript URL string was found on will be passed
554
+ # to the given block.
555
+ #
556
+ # @yieldparam [String] string
557
+ # The parsed contents of a literal JavaScript URL string.
558
+ #
559
+ # @yieldparam [Spidr::Page] page
560
+ # The page that the JavaScript URL string was found in or on.
561
+ #
562
+ # @example
563
+ # spider.every_javascript_url_string do |url|
564
+ # puts url
565
+ # end
566
+ #
567
+ # @api public
568
+ #
569
+ # @since 0.2.0
570
+ #
571
+ def every_javascript_url_string(&block)
572
+ every_javascript_string do |string,page|
573
+ if string =~ Support::Text::Patterns::URL
574
+ if block.arity == 2
575
+ yield string, page
576
+ else
577
+ yield string
578
+ end
579
+ end
580
+ end
581
+ end
582
+
583
+ alias every_js_url_string every_javascript_url_string
584
+
367
585
  #
368
586
  # Passes every JavaScript comment to the given block.
369
587
  #
370
588
  # @yield [comment]
371
589
  # The given block will be passed each JavaScript comment.
372
590
  #
591
+ # @yield [comment, page]
592
+ # If the block accepts two arguments, the JavaScript comment and the
593
+ # page that the JavaScript comment was found on will be passed to the
594
+ # given block.
595
+ #
373
596
  # @yieldparam [String] comment
374
597
  # The contents of a JavaScript comment.
375
598
  #
599
+ # @yieldparam [Spidr::Page] page
600
+ # The page that the JavaScript comment was found in or on.
601
+ #
376
602
  # @example
377
603
  # spider.every_javascript_comment do |comment|
378
604
  # puts comment
@@ -381,8 +607,14 @@ module Ronin
381
607
  # @api public
382
608
  #
383
609
  def every_javascript_comment(&block)
384
- every_javascript do |js|
385
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
610
+ every_javascript do |js,page|
611
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
612
+ if block.arity == 2
613
+ yield comment, page
614
+ else
615
+ yield comment
616
+ end
617
+ end
386
618
  end
387
619
  end
388
620
 
@@ -394,9 +626,17 @@ module Ronin
394
626
  # @yield [comment]
395
627
  # The given block will be passed each HTML or JavaScript comment.
396
628
  #
629
+ # @yield [comment, page]
630
+ # If the block accepts two arguments, the HTML or JavaScript comment
631
+ # and the page that the HTML/JavaScript comment was found on will be
632
+ # passed to the given block.
633
+ #
397
634
  # @yieldparam [String] comment
398
635
  # The contents of an HTML or JavaScript comment.
399
636
  #
637
+ # @yieldparam [Spidr::Page] page
638
+ # The page that the HTML or JavaScript comment was found in or on.
639
+ #
400
640
  # @example
401
641
  # spider.every_comment do |comment|
402
642
  # puts comment
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,7 +22,7 @@ module Ronin
22
22
  module Web
23
23
  module Spider
24
24
  # ronin-web-spider version
25
- VERSION = '0.1.1'
25
+ VERSION = '0.2.0.rc1'
26
26
  end
27
27
  end
28
28
  end
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-20 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr