ronin-web-spider 0.1.1 → 0.2.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
4
- data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
3
+ metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
4
+ data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
5
5
  SHA512:
6
- metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
7
- data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
6
+ metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
7
+ data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
data/ChangeLog.md CHANGED
@@ -1,3 +1,18 @@
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
1
16
  ### 0.1.1 / 2024-06-19
2
17
 
3
18
  * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
data/README.md CHANGED
@@ -304,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
304
304
  end
305
305
  ```
306
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
307
317
  Print every JavaScript comment:
308
318
 
309
319
  ```ruby
@@ -390,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
390
400
 
391
401
  ## License
392
402
 
393
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
394
404
 
395
405
  ronin-web-spider is free software: you can redistribute it and/or modify
396
406
  it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -23,6 +23,7 @@ require 'spidr/agent'
23
23
  require 'ronin/support/network/http'
24
24
  require 'ronin/support/crypto/cert'
25
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
26
27
  require 'ronin/support/encoding/js'
27
28
 
28
29
  module Ronin
@@ -225,10 +226,17 @@ module Ronin
225
226
  # @yield [comment]
226
227
  # The given block will be passed every HTML comment.
227
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
228
233
  # @yieldparam [String] comment
229
234
  # The HTML comment inner text, with leading and trailing whitespace
230
235
  # stripped.
231
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
232
240
  # @example
233
241
  # spider.every_html_comment do |comment|
234
242
  # puts comment
@@ -236,7 +244,7 @@ module Ronin
236
244
  #
237
245
  # @api public
238
246
  #
239
- def every_html_comment
247
+ def every_html_comment(&block)
240
248
  every_html_page do |page|
241
249
  next unless page.doc
242
250
 
@@ -244,7 +252,11 @@ module Ronin
244
252
  comment_text = comment.inner_text.strip
245
253
 
246
254
  unless comment_text.empty?
247
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
248
260
  end
249
261
  end
250
262
  end
@@ -256,9 +268,17 @@ module Ronin
256
268
  # @yield [js]
257
269
  # The given block will be passed every piece of JavaScript source.
258
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
259
276
  # @yieldparam [String] js
260
277
  # The JavaScript source code.
261
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
262
282
  # @example
263
283
  # spider.every_javascript do |js|
264
284
  # puts js
@@ -266,7 +286,7 @@ module Ronin
266
286
  #
267
287
  # @api public
268
288
  #
269
- def every_javascript
289
+ def every_javascript(&block)
270
290
  # yield inner text of every `<script type="text/javascript">` tag
271
291
  # and every `.js` URL.
272
292
  every_html_page do |page|
@@ -277,7 +297,11 @@ module Ronin
277
297
  source.force_encoding(Encoding::UTF_8)
278
298
 
279
299
  unless source.empty?
280
- yield source
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
281
305
  end
282
306
  end
283
307
  end
@@ -286,7 +310,11 @@ module Ronin
286
310
  source = page.body
287
311
  source.force_encoding(Encoding::UTF_8)
288
312
 
289
- yield source
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
290
318
  end
291
319
  end
292
320
 
@@ -331,9 +359,17 @@ module Ronin
331
359
  # The given block will be passed each JavaScript string with the quote
332
360
  # marks removed.
333
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
334
367
  # @yieldparam [String] string
335
368
  # The parsed contents of a JavaScript string.
336
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
337
373
  # @example
338
374
  # spider.every_javascript_string do |str|
339
375
  # puts str
@@ -341,8 +377,8 @@ module Ronin
341
377
  #
342
378
  # @api public
343
379
  #
344
- def every_javascript_string
345
- every_javascript do |js|
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
346
382
  scanner = StringScanner.new(js)
347
383
 
348
384
  until scanner.eos?
@@ -351,8 +387,13 @@ module Ronin
351
387
  case scanner.peek(1)
352
388
  when '"', "'" # beginning of a quoted string
353
389
  js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
354
391
 
355
- yield Support::Encoding::JS.unquote(js_string)
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
356
397
  else
357
398
  scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
358
399
  scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
@@ -364,15 +405,200 @@ module Ronin
364
405
 
365
406
  alias every_js_string every_javascript_string
366
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ JAVASCRIPT_RELATIVE_PATH = %r{
414
+ \A
415
+ (?:
416
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
417
+ |
418
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
419
+ )
420
+ \z
421
+ }x
422
+
423
+ #
424
+ # Passes every JavaScript relative path string to the given block.
425
+ #
426
+ # @yield [string]
427
+ # The given block will be passed each JavaScript relative path string
428
+ # with the quote marks removed.
429
+ #
430
+ # @yield [string, page]
431
+ # If the block accepts two arguments, the JavaScript relative path
432
+ # string and the page that the JavaScript relative path string was
433
+ # found on will be passed to the given block.
434
+ #
435
+ # @yieldparam [String] string
436
+ # The parsed contents of a literal JavaScript relative path string.
437
+ #
438
+ # @yieldparam [Spidr::Page] page
439
+ # The page that the JavaScript relative path string was found in or
440
+ # on.
441
+ #
442
+ # @example
443
+ # spider.every_javascript_relative_path_string do |relative_path|
444
+ # puts relative_path
445
+ # end
446
+ #
447
+ # @api public
448
+ #
449
+ # @since 0.2.0
450
+ #
451
+ def every_javascript_relative_path_string(&block)
452
+ every_javascript_string do |string,page|
453
+ if string =~ JAVASCRIPT_RELATIVE_PATH
454
+ if block.arity == 2
455
+ yield string, page
456
+ else
457
+ yield string
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ alias every_js_relative_path_string every_javascript_relative_path_string
464
+
465
+ # Regular expression that matches absolute paths within JavaScript.
466
+ JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
467
+
468
+ #
469
+ # Passes every JavaScript absolute path string to the given block.
470
+ #
471
+ # @yield [string]
472
+ # The given block will be passed each JavaScript absolute path string
473
+ # with the quote marks removed.
474
+ #
475
+ # @yield [string, page]
476
+ # If the block accepts two arguments, the JavaScript absolute path
477
+ # string and the page that the JavaScript absolute path string was
478
+ # found on will be passed to the given block.
479
+ #
480
+ # @yieldparam [String] string
481
+ # The parsed contents of a literal JavaScript absolute path string.
482
+ #
483
+ # @yieldparam [Spidr::Page] page
484
+ # The page that the JavaScript absolute path string was found in or
485
+ # on.
486
+ #
487
+ # @example
488
+ # spider.every_javascript_absolute_path_string do |absolute_path|
489
+ # puts absolute_path
490
+ # end
491
+ #
492
+ # @api public
493
+ #
494
+ # @since 0.2.0
495
+ #
496
+ def every_javascript_absolute_path_string(&block)
497
+ every_javascript_string do |string,page|
498
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH
499
+ if block.arity == 2
500
+ yield string, page
501
+ else
502
+ yield string
503
+ end
504
+ end
505
+ end
506
+ end
507
+
508
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
509
+
510
+ #
511
+ # Passes every JavaScript path string to the given block.
512
+ #
513
+ # @yield [string]
514
+ # The given block will be passed each JavaScript path string with the
515
+ # quote marks removed.
516
+ #
517
+ # @yield [string, page]
518
+ # If the block accepts two arguments, the JavaScript path string and
519
+ # the page that the JavaScript path string was found on will be
520
+ # passed to the given block.
521
+ #
522
+ # @yieldparam [String] string
523
+ # The parsed contents of a literal JavaScript path string.
524
+ #
525
+ # @yieldparam [Spidr::Page] page
526
+ # The page that the JavaScript path string was found in or on.
527
+ #
528
+ # @example
529
+ # spider.every_javascript_path_string do |path|
530
+ # puts path
531
+ # end
532
+ #
533
+ # @api public
534
+ #
535
+ # @since 0.2.0
536
+ #
537
+ def every_javascript_path_string(&block)
538
+ every_javascript_relative_path_string(&block)
539
+ every_javascript_absolute_path_string(&block)
540
+ end
541
+
542
+ alias every_js_path_string every_javascript_path_string
543
+
544
+ #
545
+ # Passes every JavaScript URL string to the given block.
546
+ #
547
+ # @yield [string]
548
+ # The given block will be passed each JavaScript URL string with the
549
+ # quote marks removed.
550
+ #
551
+ # @yield [string, page]
552
+ # If the block accepts two arguments, the JavaScript URL string and
553
+ # the page that the JavaScript URL string was found on will be passed
554
+ # to the given block.
555
+ #
556
+ # @yieldparam [String] string
557
+ # The parsed contents of a literal JavaScript URL string.
558
+ #
559
+ # @yieldparam [Spidr::Page] page
560
+ # The page that the JavaScript URL string was found in or on.
561
+ #
562
+ # @example
563
+ # spider.every_javascript_url_string do |url|
564
+ # puts url
565
+ # end
566
+ #
567
+ # @api public
568
+ #
569
+ # @since 0.2.0
570
+ #
571
+ def every_javascript_url_string(&block)
572
+ every_javascript_string do |string,page|
573
+ if string =~ Support::Text::Patterns::URL
574
+ if block.arity == 2
575
+ yield string, page
576
+ else
577
+ yield string
578
+ end
579
+ end
580
+ end
581
+ end
582
+
583
+ alias every_js_url_string every_javascript_url_string
584
+
367
585
  #
368
586
  # Passes every JavaScript comment to the given block.
369
587
  #
370
588
  # @yield [comment]
371
589
  # The given block will be passed each JavaScript comment.
372
590
  #
591
+ # @yield [comment, page]
592
+ # If the block accepts two arguments, the JavaScript comment and the
593
+ # page that the JavaScript comment was found on will be passed to the
594
+ # given block.
595
+ #
373
596
  # @yieldparam [String] comment
374
597
  # The contents of a JavaScript comment.
375
598
  #
599
+ # @yieldparam [Spidr::Page] page
600
+ # The page that the JavaScript comment was found in or on.
601
+ #
376
602
  # @example
377
603
  # spider.every_javascript_comment do |comment|
378
604
  # puts comment
@@ -381,8 +607,14 @@ module Ronin
381
607
  # @api public
382
608
  #
383
609
  def every_javascript_comment(&block)
384
- every_javascript do |js|
385
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
610
+ every_javascript do |js,page|
611
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
612
+ if block.arity == 2
613
+ yield comment, page
614
+ else
615
+ yield comment
616
+ end
617
+ end
386
618
  end
387
619
  end
388
620
 
@@ -394,9 +626,17 @@ module Ronin
394
626
  # @yield [comment]
395
627
  # The given block will be passed each HTML or JavaScript comment.
396
628
  #
629
+ # @yield [comment, page]
630
+ # If the block accepts two arguments, the HTML or JavaScript comment
631
+ # and the page that the HTML/JavaScript comment was found on will be
632
+ # passed to the given block.
633
+ #
397
634
  # @yieldparam [String] comment
398
635
  # The contents of a HTML or JavaScript comment.
399
636
  #
637
+ # @yieldparam [Spidr::Page] page
638
+ # The page that the HTML or JavaScript comment was found in or on.
639
+ #
400
640
  # @example
401
641
  # spider.every_comment do |comment|
402
642
  # puts comment
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,7 +22,7 @@ module Ronin
22
22
  module Web
23
23
  module Spider
24
24
  # ronin-web-spider version
25
- VERSION = '0.1.1'
25
+ VERSION = '0.2.0.rc1'
26
26
  end
27
27
  end
28
28
  end
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # ronin-web-spider - A collection of common web spidering routines.
4
4
  #
5
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
6
6
  #
7
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
8
8
  # it under the terms of the GNU Lesser General Public License as published
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-20 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr