ronin-web-spider 0.1.0 → 0.2.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fcb3d69132ae37799758c37282083f3b876e04e76aa3ab9f500f251b7df0984d
4
- data.tar.gz: 04b92b26f1bcd6166530ddfe225cde18a4bbaa8a1eb3b395120ae1e6b41aec4b
3
+ metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
4
+ data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
5
5
  SHA512:
6
- metadata.gz: e5cc4d39ac8e5f9d92edd240e836d5848f0b96798afbcab9c8116f8223142851d835b7bfd3e7a8d94e867951c4b995e0a66736a73b72d6a96f06fee6daf26bc9
7
- data.tar.gz: 4f1facfbdffe1aca7fd0d10ff0c99d6f835b2633e94be49011b46127ca9cc7b76415930d5df0a961516000032b940f00e224c562923c06412c57f2896e50256f
6
+ metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
7
+ data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
@@ -12,11 +12,12 @@ jobs:
12
12
  - '3.0'
13
13
  - '3.1'
14
14
  - '3.2'
15
+ - '3.3'
15
16
  - jruby
16
17
  - truffleruby
17
18
  name: Ruby ${{ matrix.ruby }}
18
19
  steps:
19
- - uses: actions/checkout@v2
20
+ - uses: actions/checkout@v4
20
21
  - name: Set up Ruby
21
22
  uses: ruby/setup-ruby@v1
22
23
  with:
@@ -26,3 +27,17 @@ jobs:
26
27
  run: bundle install --jobs 4 --retry 3
27
28
  - name: Run tests
28
29
  run: bundle exec rake test
30
+
31
+ # rubocop linting
32
+ rubocop:
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+ - name: Set up Ruby
37
+ uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: 3.0
40
+ - name: Install dependencies
41
+ run: bundle install --jobs 4 --retry 3
42
+ - name: Run rubocop
43
+ run: bundle exec rubocop --parallel
data/.rubocop.yml ADDED
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ NewCops: enable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 3.1
5
+
6
+ inherit_gem:
7
+ rubocop-ronin: rubocop.yml
8
+
9
+ #
10
+ # ronin-web-spider specific exceptions
11
+ #
data/ChangeLog.md CHANGED
@@ -1,4 +1,37 @@
1
- ### 0.1.0 / 2023-XX-XX
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
16
+ ### 0.1.1 / 2024-06-19
17
+
18
+ * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
19
+ {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
20
+ header included `text/html` but lacked a response body, causing `page.doc` to
21
+ be `nil`.
22
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
23
+ JavaScript source code strings containing UTF-8 characters were being
24
+ incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
25
+ did not include a `charset=` attribute.
26
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
27
+ inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
28
+ would incorrectly be treated as the beginnings or ends of JavaScript string
29
+ literals. Note that while this greatly improves the accuracy of
30
+ {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
31
+ support parsing JavaScript template literals that may also contain string
32
+ literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
33
+
34
+ ### 0.1.0 / 2023-02-01
2
35
 
3
36
  * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
4
37
  * Relicensed as LGPL-3.0.
@@ -20,3 +53,4 @@
20
53
  * `every_comment` - yields every HTML or JavaScript comment.
21
54
  * Supports archiving spidered pages to a directory or git repository.
22
55
 
56
+ [spidr]: https://github.com/postmodern/spidr#readme
data/Gemfile CHANGED
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  source 'https://rubygems.org'
2
3
 
3
4
  gemspec
@@ -28,4 +29,6 @@ group :development do
28
29
  gem 'dead_end', require: false
29
30
  gem 'sord', require: false, platform: :mri
30
31
  gem 'stackprof', require: false, platform: :mri
32
+ gem 'rubocop', require: false, platform: :mri
33
+ gem 'rubocop-ronin', require: false, platform: :mri
31
34
  end
data/README.md CHANGED
@@ -9,7 +9,6 @@
9
9
  * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
10
10
  * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
11
11
  * [Discord](https://discord.gg/6WAb3PsVX9) |
12
- [Twitter](https://twitter.com/ronin_rb) |
13
12
  [Mastodon](https://infosec.exchange/@ronin_rb)
14
13
 
15
14
  ## Description
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
38
37
  * [every_comment][docs-every_comment] - yields every HTML or JavaScript
39
38
  comment.
40
39
  * Supports archiving spidered pages to a directory or git repository.
41
- * Has 94% documentation coverage.
40
+ * Has 97% documentation coverage.
42
41
  * Has 94% test coverage.
43
42
 
44
43
  [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
@@ -305,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
305
304
  end
306
305
  ```
307
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
308
317
  Print every JavaScript comment:
309
318
 
310
319
  ```ruby
@@ -391,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
391
400
 
392
401
  ## License
393
402
 
394
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
395
404
 
396
405
  ronin-web-spider is free software: you can redistribute it and/or modify
397
406
  it under the terms of the GNU Lesser General Public License as published
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
- require 'rubygems'
1
+ # frozen_string_literal: true
2
2
 
3
3
  begin
4
4
  require 'bundler'
5
5
  rescue LoadError => e
6
6
  warn e.message
7
7
  warn "Run `gem install bundler` to install Bundler"
8
- exit -1
8
+ exit(-1)
9
9
  end
10
10
 
11
11
  begin
data/gemspec.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  name: ronin-web-spider
2
- summary: collection of common web spidering routines
2
+ summary: A collection of common web spidering routines.
3
3
  description:
4
4
  ronin-web-spider is a collection of common web spidering routines using the
5
5
  spidr gem.
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +23,7 @@ require 'spidr/agent'
22
23
  require 'ronin/support/network/http'
23
24
  require 'ronin/support/crypto/cert'
24
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
25
27
  require 'ronin/support/encoding/js'
26
28
 
27
29
  module Ronin
@@ -224,10 +226,17 @@ module Ronin
224
226
  # @yield [comment]
225
227
  # The given block will be passed every HTML comment.
226
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
227
233
  # @yieldparam [String] comment
228
234
  # The HTML comment inner text, with leading and trailing whitespace
229
235
  # stripped.
230
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
231
240
  # @example
232
241
  # spider.every_html_comment do |comment|
233
242
  # puts comment
@@ -235,13 +244,19 @@ module Ronin
235
244
  #
236
245
  # @api public
237
246
  #
238
- def every_html_comment
247
+ def every_html_comment(&block)
239
248
  every_html_page do |page|
249
+ next unless page.doc
250
+
240
251
  page.doc.xpath('//comment()').each do |comment|
241
252
  comment_text = comment.inner_text.strip
242
253
 
243
254
  unless comment_text.empty?
244
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
245
260
  end
246
261
  end
247
262
  end
@@ -253,9 +268,17 @@ module Ronin
253
268
  # @yield [js]
254
269
  # The given block will be passed every piece of JavaScript source.
255
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
256
276
  # @yieldparam [String] js
257
277
  # The JavaScript source code.
258
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
259
282
  # @example
260
283
  # spider.every_javascript do |js|
261
284
  # puts js
@@ -263,24 +286,72 @@ module Ronin
263
286
  #
264
287
  # @api public
265
288
  #
266
- def every_javascript
289
+ def every_javascript(&block)
267
290
  # yield inner text of every `<script type="text/javascript">` tag
268
291
  # and every `.js` URL.
269
292
  every_html_page do |page|
293
+ next unless page.doc
294
+
270
295
  page.doc.xpath('//script[@type="text/javascript"]').each do |script|
271
- unless script.inner_text.empty?
272
- yield script.inner_text
296
+ source = script.inner_text
297
+ source.force_encoding(Encoding::UTF_8)
298
+
299
+ unless source.empty?
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
273
305
  end
274
306
  end
275
307
  end
276
308
 
277
309
  every_javascript_page do |page|
278
- yield page.body
310
+ source = page.body
311
+ source.force_encoding(Encoding::UTF_8)
312
+
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
279
318
  end
280
319
  end
281
320
 
282
321
  alias every_js every_javascript
283
322
 
323
+ # Regex to match and skip JavaScript inline regexes.
324
+ #
325
+ # @api private
326
+ #
327
+ # @since 0.1.1
328
+ JAVASCRIPT_INLINE_REGEX = %r{
329
+ (?# match before the regex to avoid matching division operators )
330
+ (?:[\{\[\(;:,]\s*|=\s*)
331
+ /
332
+ (?# inline regex contents )
333
+ (?:
334
+ \[ (?:\\. | [^\]]) \] (?# [...] ) |
335
+ \\. (?# backslash escaped characters ) |
336
+ [^/] (?# everything else )
337
+ )+
338
+ /[dgimsuvy]* (?# also match any regex flags )
339
+ }mx
340
+
341
+ # Regex to match and skip JavaScript template literals.
342
+ #
343
+ # @note
344
+ # This regex will not properly match nested template literals:
345
+ #
346
+ # ```javascript
347
+ # `foo ${`bar ${1+1}`}`
348
+ # ```
349
+ #
350
+ # @api private
351
+ #
352
+ # @since 0.1.1
353
+ JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
354
+
284
355
  #
285
356
  # Passes every JavaScript string value to the given block.
286
357
  #
@@ -288,35 +359,246 @@ module Ronin
288
359
  # The given block will be passed each JavaScript string with the quote
289
360
  # marks removed.
290
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
291
367
  # @yieldparam [String] string
292
368
  # The parsed contents of a JavaScript string.
293
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
294
373
  # @example
295
374
  # spider.every_javascript_string do |str|
296
- # puts str
297
- # end
375
+ # puts str
376
+ # end
298
377
  #
299
378
  # @api public
300
379
  #
301
- def every_javascript_string
302
- every_javascript do |js|
303
- js.scan(Support::Text::Patterns::STRING) do |js_string|
304
- yield Support::Encoding::JS.unquote(js_string)
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
382
+ scanner = StringScanner.new(js)
383
+
384
+ until scanner.eos?
385
+ # NOTE: this is a naive JavaScript string scanner and should
386
+ # eventually be replaced with a real JavaScript lexer or parser.
387
+ case scanner.peek(1)
388
+ when '"', "'" # beginning of a quoted string
389
+ js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
391
+
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
397
+ else
398
+ scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
399
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
400
+ scanner.getch
401
+ end
305
402
  end
306
403
  end
307
404
  end
308
405
 
309
406
  alias every_js_string every_javascript_string
310
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ JAVASCRIPT_RELATIVE_PATH = %r{
414
+ \A
415
+ (?:
416
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
417
+ |
418
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
419
+ )
420
+ \z
421
+ }x
422
+
423
+ #
424
+ # Passes every JavaScript relative path string to the given block.
425
+ #
426
+ # @yield [string]
427
+ # The given block will be passed each JavaScript relative path string
428
+ # with the quote marks removed.
429
+ #
430
+ # @yield [string, page]
431
+ # If the block accepts two arguments, the JavaScript relative path
432
+ # string and the page that the JavaScript relative path string was
433
+ # found on will be passed to the given block.
434
+ #
435
+ # @yieldparam [String] string
436
+ # The parsed contents of a literal JavaScript relative path string.
437
+ #
438
+ # @yieldparam [Spidr::Page] page
439
+ # The page that the JavaScript relative path string was found in or
440
+ # on.
441
+ #
442
+ # @example
443
+ # spider.every_javascript_relative_path_string do |relative_path|
444
+ # puts relative_path
445
+ # end
446
+ #
447
+ # @api public
448
+ #
449
+ # @since 0.2.0
450
+ #
451
+ def every_javascript_relative_path_string(&block)
452
+ every_javascript_string do |string,page|
453
+ if string =~ JAVASCRIPT_RELATIVE_PATH
454
+ if block.arity == 2
455
+ yield string, page
456
+ else
457
+ yield string
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ alias every_js_relative_path_string every_javascript_relative_path_string
464
+
465
+ # Regular expression that matches absolute paths within JavaScript.
466
+ JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
467
+
468
+ #
469
+ # Passes every JavaScript absolute path string to the given block.
470
+ #
471
+ # @yield [string]
472
+ # The given block will be passed each JavaScript absolute path string
473
+ # with the quote marks removed.
474
+ #
475
+ # @yield [string, page]
476
+ # If the block accepts two arguments, the JavaScript absolute path
477
+ # string and the page that the JavaScript absolute path string was
478
+ # found on will be passed to the given block.
479
+ #
480
+ # @yieldparam [String] string
481
+ # The parsed contents of a literal JavaScript absolute path string.
482
+ #
483
+ # @yieldparam [Spidr::Page] page
484
+ # The page that the JavaScript absolute path string was found in or
485
+ # on.
486
+ #
487
+ # @example
488
+ # spider.every_javascript_absolute_path_string do |absolute_path|
489
+ # puts absolute_path
490
+ # end
491
+ #
492
+ # @api public
493
+ #
494
+ # @since 0.2.0
495
+ #
496
+ def every_javascript_absolute_path_string(&block)
497
+ every_javascript_string do |string,page|
498
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH
499
+ if block.arity == 2
500
+ yield string, page
501
+ else
502
+ yield string
503
+ end
504
+ end
505
+ end
506
+ end
507
+
508
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
509
+
510
+ #
511
+ # Passes every JavaScript path string to the given block.
512
+ #
513
+ # @yield [string]
514
+ # The given block will be passed each JavaScript path string with the
515
+ # quote marks removed.
516
+ #
517
+ # @yield [string, page]
518
+ # If the block accepts two arguments, the JavaScript path string and
519
+ # the page that the JavaScript path string was found on will be
520
+ # passed to the given block.
521
+ #
522
+ # @yieldparam [String] string
523
+ # The parsed contents of a literal JavaScript path string.
524
+ #
525
+ # @yieldparam [Spidr::Page] page
526
+ # The page that the JavaScript path string was found in or on.
527
+ #
528
+ # @example
529
+ # spider.every_javascript_path_string do |path|
530
+ # puts path
531
+ # end
532
+ #
533
+ # @api public
534
+ #
535
+ # @since 0.2.0
536
+ #
537
+ def every_javascript_path_string(&block)
538
+ every_javascript_relative_path_string(&block)
539
+ every_javascript_absolute_path_string(&block)
540
+ end
541
+
542
+ alias every_js_path_string every_javascript_path_string
543
+
544
+ #
545
+ # Passes every JavaScript URL string to the given block.
546
+ #
547
+ # @yield [string]
548
+ # The given block will be passed each JavaScript URL string with the
549
+ # quote marks removed.
550
+ #
551
+ # @yield [string, page]
552
+ # If the block accepts two arguments, the JavaScript URL string and
553
+ # the page that the JavaScript URL string was found on will be passed
554
+ # to the given block.
555
+ #
556
+ # @yieldparam [String] string
557
+ # The parsed contents of a literal JavaScript URL string.
558
+ #
559
+ # @yieldparam [Spidr::Page] page
560
+ # The page that the JavaScript URL string was found in or on.
561
+ #
562
+ # @example
563
+ # spider.every_javascript_url_string do |url|
564
+ # puts url
565
+ # end
566
+ #
567
+ # @api public
568
+ #
569
+ # @since 0.2.0
570
+ #
571
+ def every_javascript_url_string(&block)
572
+ every_javascript_string do |string,page|
573
+ if string =~ Support::Text::Patterns::URL
574
+ if block.arity == 2
575
+ yield string, page
576
+ else
577
+ yield string
578
+ end
579
+ end
580
+ end
581
+ end
582
+
583
+ alias every_js_url_string every_javascript_url_string
584
+
311
585
  #
312
586
  # Passes every JavaScript comment to the given block.
313
587
  #
314
588
  # @yield [comment]
315
589
  # The given block will be passed each JavaScript comment.
316
590
  #
591
+ # @yield [comment, page]
592
+ # If the block accepts two arguments, the JavaScript comment and the
593
+ # page that the JavaScript comment was found on will be passed to the
594
+ # given block.
595
+ #
317
596
  # @yieldparam [String] comment
318
597
  # The contents of a JavaScript comment.
319
598
  #
599
+ # @yieldparam [Spidr::Page] page
600
+ # The page that the JavaScript comment was found in or on.
601
+ #
320
602
  # @example
321
603
  # spider.every_javascript_comment do |comment|
322
604
  # puts comment
@@ -325,8 +607,14 @@ module Ronin
325
607
  # @api public
326
608
  #
327
609
  def every_javascript_comment(&block)
328
- every_javascript do |js|
329
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
610
+ every_javascript do |js,page|
611
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
612
+ if block.arity == 2
613
+ yield comment, page
614
+ else
615
+ yield comment
616
+ end
617
+ end
330
618
  end
331
619
  end
332
620
 
@@ -338,9 +626,17 @@ module Ronin
338
626
  # @yield [comment]
339
627
  # The given block will be passed each HTML or JavaScript comment.
340
628
  #
629
+ # @yield [comment, page]
630
+ # If the block accepts two arguments, the HTML or JavaScript comment
631
+ # and the page that the HTML/JavaScript comment was found on will be
632
+ # passed to the given block.
633
+ #
341
634
  # @yieldparam [String] comment
342
635
  # The contents of a HTML or JavaScript comment.
343
636
  #
637
+ # @yieldparam [Spidr::Page] page
638
+ # The page that the HTML or JavaScript comment was found in or on.
639
+ #
344
640
  # @example
345
641
  # spider.every_comment do |comment|
346
642
  # puts comment
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -31,7 +32,7 @@ module Ronin
31
32
  #
32
33
  # require 'ronin/web/spider'
33
34
  # require 'ronin/web/spider/archive'
34
- #
35
+ #
35
36
  # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
36
37
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
37
38
  # archive.write(page.url,page.body)
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -33,7 +34,7 @@ module Ronin
33
34
  # require 'ronin/web/spider'
34
35
  # require 'ronin/web/spider/git_archive'
35
36
  # require 'date'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
39
  # archive.commit("Updated #{Date.today}") do
39
40
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,7 @@ module Ronin
21
22
  module Web
22
23
  module Spider
23
24
  # ronin-web-spider version
24
- VERSION = '0.1.0'
25
+ VERSION = '0.2.0.rc1'
25
26
  end
26
27
  end
27
28
  end
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -30,136 +31,136 @@ module Ronin
30
31
  # ## Examples
31
32
  #
32
33
  # Spider a host:
33
- #
34
+ #
34
35
  # ```ruby
35
36
  # require 'ronin/web/spider'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
38
39
  # # ...
39
40
  # end
40
41
  # ```
41
- #
42
+ #
42
43
  # Spider a host:
43
- #
44
+ #
44
45
  # ```ruby
45
46
  # Ronin::Web::Spider.host('solnic.eu') do |agent|
46
47
  # # ...
47
48
  # end
48
49
  # ```
49
- #
50
+ #
50
51
  # Spider a domain (and any sub-domains):
51
- #
52
+ #
52
53
  # ```ruby
53
54
  # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
54
55
  # # ...
55
56
  # end
56
57
  # ```
57
- #
58
+ #
58
59
  # Spider a site:
59
- #
60
+ #
60
61
  # ```ruby
61
62
  # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
62
63
  # # ...
63
64
  # end
64
65
  # ```
65
- #
66
+ #
66
67
  # Spider multiple hosts:
67
- #
68
+ #
68
69
  # ```ruby
69
70
  # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
70
71
  # # ...
71
72
  # end
72
73
  # ```
73
- #
74
+ #
74
75
  # Do not spider certain links:
75
- #
76
+ #
76
77
  # ```ruby
77
78
  # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
78
79
  # # ...
79
80
  # end
80
81
  # ```
81
- #
82
+ #
82
83
  # Do not spider links on certain ports:
83
- #
84
+ #
84
85
  # ```ruby
85
86
  # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
86
87
  # # ...
87
88
  # end
88
89
  # ```
89
- #
90
+ #
90
91
  # Do not spider links blacklisted in robots.txt:
91
- #
92
+ #
92
93
  # ```ruby
93
94
  # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
94
95
  # # ...
95
96
  # end
96
97
  # ```
97
- #
98
+ #
98
99
  # Print out visited URLs:
99
- #
100
+ #
100
101
  # ```ruby
101
102
  # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
102
103
  # spider.every_url { |url| puts url }
103
104
  # end
104
105
  # ```
105
- #
106
+ #
106
107
  # Build a URL map of a site:
107
- #
108
+ #
108
109
  # ```ruby
109
110
  # url_map = Hash.new { |hash,key| hash[key] = [] }
110
- #
111
+ #
111
112
  # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
112
113
  # spider.every_link do |origin,dest|
113
114
  # url_map[dest] << origin
114
115
  # end
115
116
  # end
116
117
  # ```
117
- #
118
+ #
118
119
  # Print out the URLs that could not be requested:
119
- #
120
+ #
120
121
  # ```ruby
121
122
  # Ronin::Web::Spider.site('http://company.com/') do |spider|
122
123
  # spider.every_failed_url { |url| puts url }
123
124
  # end
124
125
  # ```
125
- #
126
+ #
126
127
  # Finds all pages which have broken links:
127
- #
128
+ #
128
129
  # ```ruby
129
130
  # url_map = Hash.new { |hash,key| hash[key] = [] }
130
- #
131
+ #
131
132
  # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
132
133
  # spider.every_link do |origin,dest|
133
134
  # url_map[dest] << origin
134
135
  # end
135
136
  # end
136
- #
137
+ #
137
138
  # spider.failures.each do |url|
138
139
  # puts "Broken link #{url} found in:"
139
- #
140
+ #
140
141
  # url_map[url].each { |page| puts " #{page}" }
141
142
  # end
142
143
  # ```
143
- #
144
+ #
144
145
  # Search HTML and XML pages:
145
- #
146
+ #
146
147
  # ```ruby
147
148
  # Ronin::Web::Spider.site('http://company.com/') do |spider|
148
149
  # spider.every_page do |page|
149
150
  # puts ">>> #{page.url}"
150
- #
151
+ #
151
152
  # page.search('//meta').each do |meta|
152
153
  # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
153
154
  # value = meta.attributes['content']
154
- #
155
+ #
155
156
  # puts " #{name} = #{value}"
156
157
  # end
157
158
  # end
158
159
  # end
159
160
  # ```
160
- #
161
+ #
161
162
  # Print out the titles from every page:
162
- #
163
+ #
163
164
  # ```ruby
164
165
  # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
165
166
  # spider.every_html_page do |page|
@@ -167,9 +168,9 @@ module Ronin
167
168
  # end
168
169
  # end
169
170
  # ```
170
- #
171
+ #
171
172
  # Print out every HTTP redirect:
172
- #
173
+ #
173
174
  # ```ruby
174
175
  # Ronin::Web::Spider.host('company.com') do |spider|
175
176
  # spider.every_redirect_page do |page|
@@ -177,21 +178,21 @@ module Ronin
177
178
  # end
178
179
  # end
179
180
  # ```
180
- #
181
+ #
181
182
  # Find what kinds of web servers a host is using, by accessing the headers:
182
- #
183
+ #
183
184
  # ```ruby
184
185
  # servers = Set[]
185
- #
186
+ #
186
187
  # Ronin::Web::Spider.host('company.com') do |spider|
187
188
  # spider.all_headers do |headers|
188
189
  # servers << headers['server']
189
190
  # end
190
191
  # end
191
192
  # ```
192
- #
193
+ #
193
194
  # Pause the spider on a forbidden page:
194
- #
195
+ #
195
196
  # ```ruby
196
197
  # Ronin::Web::Spider.host('company.com') do |spider|
197
198
  # spider.every_forbidden_page do |page|
@@ -199,9 +200,9 @@ module Ronin
199
200
  # end
200
201
  # end
201
202
  # ```
202
- #
203
+ #
203
204
  # Skip the processing of a page:
204
- #
205
+ #
205
206
  # ```ruby
206
207
  # Ronin::Web::Spider.host('company.com') do |spider|
207
208
  # spider.every_missing_page do |page|
@@ -209,9 +210,9 @@ module Ronin
209
210
  # end
210
211
  # end
211
212
  # ```
212
- #
213
+ #
213
214
  # Skip the processing of links:
214
- #
215
+ #
215
216
  # ```ruby
216
217
  # Ronin::Web::Spider.host('company.com') do |spider|
217
218
  # spider.every_url do |url|
@@ -221,9 +222,9 @@ module Ronin
221
222
  # end
222
223
  # end
223
224
  # ```
224
- #
225
+ #
225
226
  # Detect when a new host name is spidered:
226
- #
227
+ #
227
228
  # ```ruby
228
229
  # Ronin::Web::Spider.domain('example.com') do |spider|
229
230
  # spider.every_host do |host|
@@ -231,9 +232,9 @@ module Ronin
231
232
  # end
232
233
  # end
233
234
  # ```
234
- #
235
+ #
235
236
  # Detect when a new SSL/TLS certificate is encountered:
236
- #
237
+ #
237
238
  # ```ruby
238
239
  # Ronin::Web::Spider.domain('example.com') do |spider|
239
240
  # spider.every_cert do |cert|
@@ -241,9 +242,9 @@ module Ronin
241
242
  # end
242
243
  # end
243
244
  # ```
244
- #
245
+ #
245
246
  # Print the MD5 checksum of every `favicon.ico` file:
246
- #
247
+ #
247
248
  # ```ruby
248
249
  # Ronin::Web::Spider.domain('example.com') do |spider|
249
250
  # spider.every_favicon do |page|
@@ -251,9 +252,9 @@ module Ronin
251
252
  # end
252
253
  # end
253
254
  # ```
254
- #
255
+ #
255
256
  # Print every HTML comment:
256
- #
257
+ #
257
258
  # ```ruby
258
259
  # Ronin::Web::Spider.domain('example.com') do |spider|
259
260
  # spider.every_html_comment do |comment|
@@ -261,9 +262,9 @@ module Ronin
261
262
  # end
262
263
  # end
263
264
  # ```
264
- #
265
+ #
265
266
  # Print all JavaScript source code:
266
- #
267
+ #
267
268
  # ```ruby
268
269
  # Ronin::Web::Spider.domain('example.com') do |spider|
269
270
  # spider.every_javascript do |js|
@@ -271,9 +272,9 @@ module Ronin
271
272
  # end
272
273
  # end
273
274
  # ```
274
- #
275
+ #
275
276
  # Print every JavaScript string literal:
276
- #
277
+ #
277
278
  # ```ruby
278
279
  # Ronin::Web::Spider.domain('example.com') do |spider|
279
280
  # spider.every_javascript_string do |str|
@@ -281,9 +282,9 @@ module Ronin
281
282
  # end
282
283
  # end
283
284
  # ```
284
- #
285
+ #
285
286
  # Print every JavaScript comment:
286
- #
287
+ #
287
288
  # ```ruby
288
289
  # Ronin::Web::Spider.domain('example.com') do |spider|
289
290
  # spider.every_javascript_comment do |comment|
@@ -291,9 +292,9 @@ module Ronin
291
292
  # end
292
293
  # end
293
294
  # ```
294
- #
295
+ #
295
296
  # Print every HTML and JavaScript comment:
296
- #
297
+ #
297
298
  # ```ruby
298
299
  # Ronin::Web::Spider.domain('example.com') do |spider|
299
300
  # spider.every_comment do |comment|
@@ -301,7 +302,7 @@ module Ronin
301
302
  # end
302
303
  # end
303
304
  # ```
304
- #
305
+ #
305
306
  module Spider
306
307
  #
307
308
  # Creates a new agent and begin spidering at the given URL.
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'yaml'
4
4
 
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
22
22
  gem.homepage = gemspec['homepage']
23
23
  gem.metadata = gemspec['metadata'] if gemspec['metadata']
24
24
 
25
- glob = lambda { |patterns| gem.files & Dir[*patterns] }
25
+ glob = ->(patterns) { gem.files & Dir[*patterns] }
26
26
 
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
46
46
  gem.required_rubygems_version = gemspec['required_rubygems_version']
47
47
  gem.post_install_message = gemspec['post_install_message']
48
48
 
49
- split = lambda { |string| string.split(/,\s*/) }
49
+ split = ->(string) { string.split(/,\s*/) }
50
50
 
51
51
  if gemspec['dependencies']
52
52
  gemspec['dependencies'].each do |name,versions|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-01 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -66,6 +66,7 @@ files:
66
66
  - ".github/workflows/ruby.yml"
67
67
  - ".gitignore"
68
68
  - ".rspec"
69
+ - ".rubocop.yml"
69
70
  - ".ruby-version"
70
71
  - ".yardopts"
71
72
  - COPYING.txt
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
106
  - !ruby/object:Gem::Version
106
107
  version: '0'
107
108
  requirements: []
108
- rubygems_version: 3.3.26
109
+ rubygems_version: 3.3.27
109
110
  signing_key:
110
111
  specification_version: 4
111
- summary: collection of common web spidering routines
112
+ summary: A collection of common web spidering routines.
112
113
  test_files: []