ronin-web-spider 0.1.0 → 0.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fcb3d69132ae37799758c37282083f3b876e04e76aa3ab9f500f251b7df0984d
4
- data.tar.gz: 04b92b26f1bcd6166530ddfe225cde18a4bbaa8a1eb3b395120ae1e6b41aec4b
3
+ metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
4
+ data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
5
5
  SHA512:
6
- metadata.gz: e5cc4d39ac8e5f9d92edd240e836d5848f0b96798afbcab9c8116f8223142851d835b7bfd3e7a8d94e867951c4b995e0a66736a73b72d6a96f06fee6daf26bc9
7
- data.tar.gz: 4f1facfbdffe1aca7fd0d10ff0c99d6f835b2633e94be49011b46127ca9cc7b76415930d5df0a961516000032b940f00e224c562923c06412c57f2896e50256f
6
+ metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
7
+ data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d
@@ -12,11 +12,12 @@ jobs:
12
12
  - '3.0'
13
13
  - '3.1'
14
14
  - '3.2'
15
+ - '3.3'
15
16
  - jruby
16
17
  - truffleruby
17
18
  name: Ruby ${{ matrix.ruby }}
18
19
  steps:
19
- - uses: actions/checkout@v2
20
+ - uses: actions/checkout@v4
20
21
  - name: Set up Ruby
21
22
  uses: ruby/setup-ruby@v1
22
23
  with:
@@ -26,3 +27,17 @@ jobs:
26
27
  run: bundle install --jobs 4 --retry 3
27
28
  - name: Run tests
28
29
  run: bundle exec rake test
30
+
31
+ # rubocop linting
32
+ rubocop:
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+ - name: Set up Ruby
37
+ uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: '3.0'
40
+ - name: Install dependencies
41
+ run: bundle install --jobs 4 --retry 3
42
+ - name: Run rubocop
43
+ run: bundle exec rubocop --parallel
data/.rubocop.yml ADDED
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ NewCops: enable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 3.1
5
+
6
+ inherit_gem:
7
+ rubocop-ronin: rubocop.yml
8
+
9
+ #
10
+ # ronin-web-spider specific exceptions
11
+ #
data/ChangeLog.md CHANGED
@@ -1,4 +1,37 @@
1
- ### 0.1.0 / 2023-XX-XX
1
+ ### 0.2.0 / 2024-XX-XX
2
+
3
+ * Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
4
+ * Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
5
+ * Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
6
+ * Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
7
+ * Allow {Ronin::Web::Spider::Agent#every_html_comment},
8
+ {Ronin::Web::Spider::Agent#every_javascript every_javascript},
9
+ {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
10
+ {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
11
+ {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
12
+ {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
13
+ {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
14
+ to also yield a `Spidr::Page` block argument for additional context.
15
+
16
+ ### 0.1.1 / 2024-06-19
17
+
18
+ * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
19
+ {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
20
+ header included `text/html` but lacked a response body, causing `page.doc` to
21
+ be `nil`.
22
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
23
+ JavaScript source code strings containing UTF-8 characters were being
24
+ incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
25
+ did not include a `charset=` attribute.
26
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
27
+ inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
28
+ would incorrectly be treated as the beginning or ends of JavaScript string
29
+ literals. Note that while this greatly improves the accuracy of
30
+ {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
31
+ support parsing JavaScript template literals that may also contain string
32
+ literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
33
+
34
+ ### 0.1.0 / 2023-02-01
2
35
 
3
36
  * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
4
37
  * Relicensed as LGPL-3.0.
@@ -20,3 +53,4 @@
20
53
  * `every_comment` - yields every HTML or JavaScript comment.
21
54
  * Supports archiving spidered pages to a directory or git repository.
22
55
 
56
+ [spidr]: https://github.com/postmodern/spidr#readme
data/Gemfile CHANGED
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  source 'https://rubygems.org'
2
3
 
3
4
  gemspec
@@ -28,4 +29,6 @@ group :development do
28
29
  gem 'dead_end', require: false
29
30
  gem 'sord', require: false, platform: :mri
30
31
  gem 'stackprof', require: false, platform: :mri
32
+ gem 'rubocop', require: false, platform: :mri
33
+ gem 'rubocop-ronin', require: false, platform: :mri
31
34
  end
data/README.md CHANGED
@@ -9,7 +9,6 @@
9
9
  * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
10
10
  * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
11
11
  * [Discord](https://discord.gg/6WAb3PsVX9) |
12
- [Twitter](https://twitter.com/ronin_rb) |
13
12
  [Mastodon](https://infosec.exchange/@ronin_rb)
14
13
 
15
14
  ## Description
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
38
37
  * [every_comment][docs-every_comment] - yields every HTML or JavaScript
39
38
  comment.
40
39
  * Supports archiving spidered pages to a directory or git repository.
41
- * Has 94% documentation coverage.
40
+ * Has 97% documentation coverage.
42
41
  * Has 94% test coverage.
43
42
 
44
43
  [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
@@ -305,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
305
304
  end
306
305
  ```
307
306
 
307
+ Print every JavaScript URL string literal:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_url_string do |url|
312
+ puts url
313
+ end
314
+ end
315
+ ```
316
+
308
317
  Print every JavaScript comment:
309
318
 
310
319
  ```ruby
@@ -391,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
391
400
 
392
401
  ## License
393
402
 
394
- Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
403
+ Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
395
404
 
396
405
  ronin-web-spider is free software: you can redistribute it and/or modify
397
406
  it under the terms of the GNU Lesser General Public License as published
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
- require 'rubygems'
1
+ # frozen_string_literal: true
2
2
 
3
3
  begin
4
4
  require 'bundler'
5
5
  rescue LoadError => e
6
6
  warn e.message
7
7
  warn "Run `gem install bundler` to install Bundler"
8
- exit -1
8
+ exit(-1)
9
9
  end
10
10
 
11
11
  begin
data/gemspec.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  name: ronin-web-spider
2
- summary: collection of common web spidering routines
2
+ summary: A collection of common web spidering routines.
3
3
  description:
4
4
  ronin-web-spider is a collection of common web spidering routines using the
5
5
  spidr gem.
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +23,7 @@ require 'spidr/agent'
22
23
  require 'ronin/support/network/http'
23
24
  require 'ronin/support/crypto/cert'
24
25
  require 'ronin/support/text/patterns/source_code'
26
+ require 'ronin/support/text/patterns/network'
25
27
  require 'ronin/support/encoding/js'
26
28
 
27
29
  module Ronin
@@ -224,10 +226,17 @@ module Ronin
224
226
  # @yield [comment]
225
227
  # The given block will be passed every HTML comment.
226
228
  #
229
+ # @yield [comment, page]
230
+ # If the block accepts two arguments, the HTML comment and the page
231
+ # that the comment was found on will be passed to the given block.
232
+ #
227
233
  # @yieldparam [String] comment
228
234
  # The HTML comment inner text, with leading and trailing whitespace
229
235
  # stripped.
230
236
  #
237
+ # @yieldparam [Spidr::Page] page
238
+ # The page that the HTML comment exists on.
239
+ #
231
240
  # @example
232
241
  # spider.every_html_comment do |comment|
233
242
  # puts comment
@@ -235,13 +244,19 @@ module Ronin
235
244
  #
236
245
  # @api public
237
246
  #
238
- def every_html_comment
247
+ def every_html_comment(&block)
239
248
  every_html_page do |page|
249
+ next unless page.doc
250
+
240
251
  page.doc.xpath('//comment()').each do |comment|
241
252
  comment_text = comment.inner_text.strip
242
253
 
243
254
  unless comment_text.empty?
244
- yield comment_text
255
+ if block.arity == 2
256
+ yield comment_text, page
257
+ else
258
+ yield comment_text
259
+ end
245
260
  end
246
261
  end
247
262
  end
@@ -253,9 +268,17 @@ module Ronin
253
268
  # @yield [js]
254
269
  # The given block will be passed every piece of JavaScript source.
255
270
  #
271
+ # @yield [js, page]
272
+ # If the block accepts two arguments, the JavaScript source and the
273
+ # page that the JavaScript source was found on will be passed to the
274
+ # given block.
275
+ #
256
276
  # @yieldparam [String] js
257
277
  # The JavaScript source code.
258
278
  #
279
+ # @yieldparam [Spidr::Page] page
280
+ # The page that the JavaScript source was found in or on.
281
+ #
259
282
  # @example
260
283
  # spider.every_javascript do |js|
261
284
  # puts js
@@ -263,24 +286,72 @@ module Ronin
263
286
  #
264
287
  # @api public
265
288
  #
266
- def every_javascript
289
+ def every_javascript(&block)
267
290
  # yield inner text of every `<script type="text/javascript">` tag
268
291
  # and every `.js` URL.
269
292
  every_html_page do |page|
293
+ next unless page.doc
294
+
270
295
  page.doc.xpath('//script[@type="text/javascript"]').each do |script|
271
- unless script.inner_text.empty?
272
- yield script.inner_text
296
+ source = script.inner_text
297
+ source.force_encoding(Encoding::UTF_8)
298
+
299
+ unless source.empty?
300
+ if block.arity == 2
301
+ yield source, page
302
+ else
303
+ yield source
304
+ end
273
305
  end
274
306
  end
275
307
  end
276
308
 
277
309
  every_javascript_page do |page|
278
- yield page.body
310
+ source = page.body
311
+ source.force_encoding(Encoding::UTF_8)
312
+
313
+ if block.arity == 2
314
+ yield source, page
315
+ else
316
+ yield source
317
+ end
279
318
  end
280
319
  end
281
320
 
282
321
  alias every_js every_javascript
283
322
 
323
+ # Regex to match and skip JavaScript inline regexes.
324
+ #
325
+ # @api private
326
+ #
327
+ # @since 0.1.1
328
+ JAVASCRIPT_INLINE_REGEX = %r{
329
+ (?# match before the regex to avoid matching division operators )
330
+ (?:[\{\[\(;:,]\s*|=\s*)
331
+ /
332
+ (?# inline regex contents )
333
+ (?:
334
+ \[ (?:\\. | [^\]]) \] (?# [...] ) |
335
+ \\. (?# backslash escaped characters ) |
336
+ [^/] (?# everything else )
337
+ )+
338
+ /[dgimsuvy]* (?# also match any regex flags )
339
+ }mx
340
+
341
+ # Regex to match and skip JavaScript template literals.
342
+ #
343
+ # @note
344
+ # This regex will not properly match nested template literals:
345
+ #
346
+ # ```javascript
347
+ # `foo ${`bar ${1+1}`}`
348
+ # ```
349
+ #
350
+ # @api private
351
+ #
352
+ # @since 0.1.1
353
+ JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
354
+
284
355
  #
285
356
  # Passes every JavaScript string value to the given block.
286
357
  #
@@ -288,35 +359,246 @@ module Ronin
288
359
  # The given block will be passed each JavaScript string with the quote
289
360
  # marks removed.
290
361
  #
362
+ # @yield [string, page]
363
+ # If the block accepts two arguments, the JavaScript string and the
364
+ # page that the JavaScript string was found on will be passed to the
365
+ # given block.
366
+ #
291
367
  # @yieldparam [String] string
292
368
  # The parsed contents of a JavaScript string.
293
369
  #
370
+ # @yieldparam [Spidr::Page] page
371
+ # The page that the JavaScript string was found in or on.
372
+ #
294
373
  # @example
295
374
  # spider.every_javascript_string do |str|
296
- # puts str
297
- # end
375
+ # puts str
376
+ # end
298
377
  #
299
378
  # @api public
300
379
  #
301
- def every_javascript_string
302
- every_javascript do |js|
303
- js.scan(Support::Text::Patterns::STRING) do |js_string|
304
- yield Support::Encoding::JS.unquote(js_string)
380
+ def every_javascript_string(&block)
381
+ every_javascript do |js,page|
382
+ scanner = StringScanner.new(js)
383
+
384
+ until scanner.eos?
385
+ # NOTE: this is a naive JavaScript string scanner and should
386
+ # eventually be replaced with a real JavaScript lexer or parser.
387
+ case scanner.peek(1)
388
+ when '"', "'" # beginning of a quoted string
389
+ js_string = scanner.scan(Support::Text::Patterns::STRING)
390
+ string = Support::Encoding::JS.unquote(js_string)
391
+
392
+ if block.arity == 2
393
+ yield string, page
394
+ else
395
+ yield string
396
+ end
397
+ else
398
+ scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
399
+ scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
400
+ scanner.getch
401
+ end
305
402
  end
306
403
  end
307
404
  end
308
405
 
309
406
  alias every_js_string every_javascript_string
310
407
 
408
+ # Regular expression that matches relative paths within JavaScript.
409
+ #
410
+ # @note
411
+ # This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
412
+ # but *not* `/foo`, `foo`, or `foo.`.
413
+ JAVASCRIPT_RELATIVE_PATH = %r{
414
+ \A
415
+ (?:
416
+ [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
417
+ |
418
+ [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
419
+ )
420
+ \z
421
+ }x
422
+
423
+ #
424
+ # Passes every JavaScript relative path string to the given block.
425
+ #
426
+ # @yield [string]
427
+ # The given block will be passed each JavaScript relative path string
428
+ # with the quote marks removed.
429
+ #
430
+ # @yield [string, page]
431
+ # If the block accepts two arguments, the JavaScript relative path
432
+ # string and the page that the JavaScript relative path string was
433
+ # found on will be passed to the given block.
434
+ #
435
+ # @yieldparam [String] string
436
+ # The parsed contents of a literal JavaScript relative path string.
437
+ #
438
+ # @yieldparam [Spidr::Page] page
439
+ # The page that the JavaScript relative path string was found in or
440
+ # on.
441
+ #
442
+ # @example
443
+ # spider.every_javascript_relative_path_string do |relative_path|
444
+ # puts relative_path
445
+ # end
446
+ #
447
+ # @api public
448
+ #
449
+ # @since 0.2.0
450
+ #
451
+ def every_javascript_relative_path_string(&block)
452
+ every_javascript_string do |string,page|
453
+ if string =~ JAVASCRIPT_RELATIVE_PATH
454
+ if block.arity == 2
455
+ yield string, page
456
+ else
457
+ yield string
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ alias every_js_relative_path_string every_javascript_relative_path_string
464
+
465
+ # Regular expression that matches absolute paths within JavaScript.
466
+ JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
467
+
468
+ #
469
+ # Passes every JavaScript absolute path string to the given block.
470
+ #
471
+ # @yield [string]
472
+ # The given block will be passed each JavaScript absolute path string
473
+ # with the quote marks removed.
474
+ #
475
+ # @yield [string, page]
476
+ # If the block accepts two arguments, the JavaScript absolute path
477
+ # string and the page that the JavaScript absolute path string was
478
+ # found on will be passed to the given block.
479
+ #
480
+ # @yieldparam [String] string
481
+ # The parsed contents of a literal JavaScript absolute path string.
482
+ #
483
+ # @yieldparam [Spidr::Page] page
484
+ # The page that the JavaScript absolute path string was found in or
485
+ # on.
486
+ #
487
+ # @example
488
+ # spider.every_javascript_absolute_path_string do |absolute_path|
489
+ # puts absolute_path
490
+ # end
491
+ #
492
+ # @api public
493
+ #
494
+ # @since 0.2.0
495
+ #
496
+ def every_javascript_absolute_path_string(&block)
497
+ every_javascript_string do |string,page|
498
+ if string =~ JAVASCRIPT_ABSOLUTE_PATH
499
+ if block.arity == 2
500
+ yield string, page
501
+ else
502
+ yield string
503
+ end
504
+ end
505
+ end
506
+ end
507
+
508
+ alias every_js_absolute_path_string every_javascript_absolute_path_string
509
+
510
+ #
511
+ # Passes every JavaScript path string to the given block.
512
+ #
513
+ # @yield [string]
514
+ # The given block will be passed each JavaScript path string with the
515
+ # quote marks removed.
516
+ #
517
+ # @yield [string, page]
518
+ # If the block accepts two arguments, the JavaScript path string and
519
+ # the page that the JavaScript path string was found on will be
520
+ # passed to the given block.
521
+ #
522
+ # @yieldparam [String] string
523
+ # The parsed contents of a literal JavaScript path string.
524
+ #
525
+ # @yieldparam [Spidr::Page] page
526
+ # The page that the JavaScript path string was found in or on.
527
+ #
528
+ # @example
529
+ # spider.every_javascript_path_string do |path|
530
+ # puts path
531
+ # end
532
+ #
533
+ # @api public
534
+ #
535
+ # @since 0.2.0
536
+ #
537
+ def every_javascript_path_string(&block)
538
+ every_javascript_relative_path_string(&block)
539
+ every_javascript_absolute_path_string(&block)
540
+ end
541
+
542
+ alias every_js_path_string every_javascript_path_string
543
+
544
+ #
545
+ # Passes every JavaScript URL string to the given block.
546
+ #
547
+ # @yield [string]
548
+ # The given block will be passed each JavaScript URL string with the
549
+ # quote marks removed.
550
+ #
551
+ # @yield [string, page]
552
+ # If the block accepts two arguments, the JavaScript URL string and
553
+ # the page that the JavaScript URL string was found on will be passed
554
+ # to the given block.
555
+ #
556
+ # @yieldparam [String] string
557
+ # The parsed contents of a literal JavaScript URL string.
558
+ #
559
+ # @yieldparam [Spidr::Page] page
560
+ # The page that the JavaScript URL string was found in or on.
561
+ #
562
+ # @example
563
+ # spider.every_javascript_url_string do |url|
564
+ # puts url
565
+ # end
566
+ #
567
+ # @api public
568
+ #
569
+ # @since 0.2.0
570
+ #
571
+ def every_javascript_url_string(&block)
572
+ every_javascript_string do |string,page|
573
+ if string =~ Support::Text::Patterns::URL
574
+ if block.arity == 2
575
+ yield string, page
576
+ else
577
+ yield string
578
+ end
579
+ end
580
+ end
581
+ end
582
+
583
+ alias every_js_url_string every_javascript_url_string
584
+
311
585
  #
312
586
  # Passes every JavaScript comment to the given block.
313
587
  #
314
588
  # @yield [comment]
315
589
  # The given block will be passed each JavaScript comment.
316
590
  #
591
+ # @yield [comment, page]
592
+ # If the block accepts two arguments, the JavaScript comment and the
593
+ # page that the JavaScript comment was found on will be passed to the
594
+ # given block.
595
+ #
317
596
  # @yieldparam [String] comment
318
597
  # The contents of a JavaScript comment.
319
598
  #
599
+ # @yieldparam [Spidr::Page] page
600
+ # The page that the JavaScript comment was found in or on.
601
+ #
320
602
  # @example
321
603
  # spider.every_javascript_comment do |comment|
322
604
  # puts comment
@@ -325,8 +607,14 @@ module Ronin
325
607
  # @api public
326
608
  #
327
609
  def every_javascript_comment(&block)
328
- every_javascript do |js|
329
- js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
610
+ every_javascript do |js,page|
611
+ js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
612
+ if block.arity == 2
613
+ yield comment, page
614
+ else
615
+ yield comment
616
+ end
617
+ end
330
618
  end
331
619
  end
332
620
 
@@ -338,9 +626,17 @@ module Ronin
338
626
  # @yield [comment]
339
627
  # The given block will be passed each HTML or JavaScript comment.
340
628
  #
629
+ # @yield [comment, page]
630
+ # If the block accepts two arguments, the HTML or JavaScript comment
631
+ # and the page that the HTML/JavaScript comment was found on will be
632
+ # passed to the given block.
633
+ #
341
634
  # @yieldparam [String] comment
342
635
  # The contents of a HTML or JavaScript comment.
343
636
  #
637
+ # @yieldparam [Spidr::Page] page
638
+ # The page that the HTML or JavaScript comment was found in or on.
639
+ #
344
640
  # @example
345
641
  # spider.every_comment do |comment|
346
642
  # puts comment
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -31,7 +32,7 @@ module Ronin
31
32
  #
32
33
  # require 'ronin/web/spider'
33
34
  # require 'ronin/web/spider/archive'
34
- #
35
+ #
35
36
  # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
36
37
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
37
38
  # archive.write(page.url,page.body)
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
@@ -33,7 +34,7 @@ module Ronin
33
34
  # require 'ronin/web/spider'
34
35
  # require 'ronin/web/spider/git_archive'
35
36
  # require 'date'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
39
  # archive.commit("Updated #{Date.today}") do
39
40
  # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,7 @@ module Ronin
21
22
  module Web
22
23
  module Spider
23
24
  # ronin-web-spider version
24
- VERSION = '0.1.0'
25
+ VERSION = '0.2.0.rc1'
25
26
  end
26
27
  end
27
28
  end
@@ -1,7 +1,8 @@
1
+ # frozen_string_literal: true
1
2
  #
2
3
  # ronin-web-spider - A collection of common web spidering routines.
3
4
  #
4
- # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ # Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
5
6
  #
6
7
  # ronin-web-spider is free software: you can redistribute it and/or modify
7
8
  # it under the terms of the GNU Lesser General Public License as published
@@ -30,136 +31,136 @@ module Ronin
30
31
  # ## Examples
31
32
  #
32
33
  # Spider a host:
33
- #
34
+ #
34
35
  # ```ruby
35
36
  # require 'ronin/web/spider'
36
- #
37
+ #
37
38
  # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
38
39
  # # ...
39
40
  # end
40
41
  # ```
41
- #
42
+ #
42
43
  # Spider a host:
43
- #
44
+ #
44
45
  # ```ruby
45
46
  # Ronin::Web::Spider.host('solnic.eu') do |agent|
46
47
  # # ...
47
48
  # end
48
49
  # ```
49
- #
50
+ #
50
51
  # Spider a domain (and any sub-domains):
51
- #
52
+ #
52
53
  # ```ruby
53
54
  # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
54
55
  # # ...
55
56
  # end
56
57
  # ```
57
- #
58
+ #
58
59
  # Spider a site:
59
- #
60
+ #
60
61
  # ```ruby
61
62
  # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
62
63
  # # ...
63
64
  # end
64
65
  # ```
65
- #
66
+ #
66
67
  # Spider multiple hosts:
67
- #
68
+ #
68
69
  # ```ruby
69
70
  # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
70
71
  # # ...
71
72
  # end
72
73
  # ```
73
- #
74
+ #
74
75
  # Do not spider certain links:
75
- #
76
+ #
76
77
  # ```ruby
77
78
  # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
78
79
  # # ...
79
80
  # end
80
81
  # ```
81
- #
82
+ #
82
83
  # Do not spider links on certain ports:
83
- #
84
+ #
84
85
  # ```ruby
85
86
  # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
86
87
  # # ...
87
88
  # end
88
89
  # ```
89
- #
90
+ #
90
91
  # Do not spider links blacklisted in robots.txt:
91
- #
92
+ #
92
93
  # ```ruby
93
94
  # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
94
95
  # # ...
95
96
  # end
96
97
  # ```
97
- #
98
+ #
98
99
  # Print out visited URLs:
99
- #
100
+ #
100
101
  # ```ruby
101
102
  # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
102
103
  # spider.every_url { |url| puts url }
103
104
  # end
104
105
  # ```
105
- #
106
+ #
106
107
  # Build a URL map of a site:
107
- #
108
+ #
108
109
  # ```ruby
109
110
  # url_map = Hash.new { |hash,key| hash[key] = [] }
110
- #
111
+ #
111
112
  # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
112
113
  # spider.every_link do |origin,dest|
113
114
  # url_map[dest] << origin
114
115
  # end
115
116
  # end
116
117
  # ```
117
- #
118
+ #
118
119
  # Print out the URLs that could not be requested:
119
- #
120
+ #
120
121
  # ```ruby
121
122
  # Ronin::Web::Spider.site('http://company.com/') do |spider|
122
123
  # spider.every_failed_url { |url| puts url }
123
124
  # end
124
125
  # ```
125
- #
126
+ #
126
127
  # Finds all pages which have broken links:
127
- #
128
+ #
128
129
  # ```ruby
129
130
  # url_map = Hash.new { |hash,key| hash[key] = [] }
130
- #
131
+ #
131
132
  # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
132
133
  # spider.every_link do |origin,dest|
133
134
  # url_map[dest] << origin
134
135
  # end
135
136
  # end
136
- #
137
+ #
137
138
  # spider.failures.each do |url|
138
139
  # puts "Broken link #{url} found in:"
139
- #
140
+ #
140
141
  # url_map[url].each { |page| puts " #{page}" }
141
142
  # end
142
143
  # ```
143
- #
144
+ #
144
145
  # Search HTML and XML pages:
145
- #
146
+ #
146
147
  # ```ruby
147
148
  # Ronin::Web::Spider.site('http://company.com/') do |spider|
148
149
  # spider.every_page do |page|
149
150
  # puts ">>> #{page.url}"
150
- #
151
+ #
151
152
  # page.search('//meta').each do |meta|
152
153
  # name = (meta.attributes['name'] || meta.attributes['http-equiv'])
153
154
  # value = meta.attributes['content']
154
- #
155
+ #
155
156
  # puts " #{name} = #{value}"
156
157
  # end
157
158
  # end
158
159
  # end
159
160
  # ```
160
- #
161
+ #
161
162
  # Print out the titles from every page:
162
- #
163
+ #
163
164
  # ```ruby
164
165
  # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
165
166
  # spider.every_html_page do |page|
@@ -167,9 +168,9 @@ module Ronin
167
168
  # end
168
169
  # end
169
170
  # ```
170
- #
171
+ #
171
172
  # Print out every HTTP redirect:
172
- #
173
+ #
173
174
  # ```ruby
174
175
  # Ronin::Web::Spider.host('company.com') do |spider|
175
176
  # spider.every_redirect_page do |page|
@@ -177,21 +178,21 @@ module Ronin
177
178
  # end
178
179
  # end
179
180
  # ```
180
- #
181
+ #
181
182
  # Find what kinds of web servers a host is using, by accessing the headers:
182
- #
183
+ #
183
184
  # ```ruby
184
185
  # servers = Set[]
185
- #
186
+ #
186
187
  # Ronin::Web::Spider.host('company.com') do |spider|
187
188
  # spider.all_headers do |headers|
188
189
  # servers << headers['server']
189
190
  # end
190
191
  # end
191
192
  # ```
192
- #
193
+ #
193
194
  # Pause the spider on a forbidden page:
194
- #
195
+ #
195
196
  # ```ruby
196
197
  # Ronin::Web::Spider.host('company.com') do |spider|
197
198
  # spider.every_forbidden_page do |page|
@@ -199,9 +200,9 @@ module Ronin
199
200
  # end
200
201
  # end
201
202
  # ```
202
- #
203
+ #
203
204
  # Skip the processing of a page:
204
- #
205
+ #
205
206
  # ```ruby
206
207
  # Ronin::Web::Spider.host('company.com') do |spider|
207
208
  # spider.every_missing_page do |page|
@@ -209,9 +210,9 @@ module Ronin
209
210
  # end
210
211
  # end
211
212
  # ```
212
- #
213
+ #
213
214
  # Skip the processing of links:
214
- #
215
+ #
215
216
  # ```ruby
216
217
  # Ronin::Web::Spider.host('company.com') do |spider|
217
218
  # spider.every_url do |url|
@@ -221,9 +222,9 @@ module Ronin
221
222
  # end
222
223
  # end
223
224
  # ```
224
- #
225
+ #
225
226
  # Detect when a new host name is spidered:
226
- #
227
+ #
227
228
  # ```ruby
228
229
  # Ronin::Web::Spider.domain('example.com') do |spider|
229
230
  # spider.every_host do |host|
@@ -231,9 +232,9 @@ module Ronin
231
232
  # end
232
233
  # end
233
234
  # ```
234
- #
235
+ #
235
236
  # Detect when a new SSL/TLS certificate is encountered:
236
- #
237
+ #
237
238
  # ```ruby
238
239
  # Ronin::Web::Spider.domain('example.com') do |spider|
239
240
  # spider.every_cert do |cert|
@@ -241,9 +242,9 @@ module Ronin
241
242
  # end
242
243
  # end
243
244
  # ```
244
- #
245
+ #
245
246
  # Print the MD5 checksum of every `favicon.ico` file:
246
- #
247
+ #
247
248
  # ```ruby
248
249
  # Ronin::Web::Spider.domain('example.com') do |spider|
249
250
  # spider.every_favicon do |page|
@@ -251,9 +252,9 @@ module Ronin
251
252
  # end
252
253
  # end
253
254
  # ```
254
- #
255
+ #
255
256
  # Print every HTML comment:
256
- #
257
+ #
257
258
  # ```ruby
258
259
  # Ronin::Web::Spider.domain('example.com') do |spider|
259
260
  # spider.every_html_comment do |comment|
@@ -261,9 +262,9 @@ module Ronin
261
262
  # end
262
263
  # end
263
264
  # ```
264
- #
265
+ #
265
266
  # Print all JavaScript source code:
266
- #
267
+ #
267
268
  # ```ruby
268
269
  # Ronin::Web::Spider.domain('example.com') do |spider|
269
270
  # spider.every_javascript do |js|
@@ -271,9 +272,9 @@ module Ronin
271
272
  # end
272
273
  # end
273
274
  # ```
274
- #
275
+ #
275
276
  # Print every JavaScript string literal:
276
- #
277
+ #
277
278
  # ```ruby
278
279
  # Ronin::Web::Spider.domain('example.com') do |spider|
279
280
  # spider.every_javascript_string do |str|
@@ -281,9 +282,9 @@ module Ronin
281
282
  # end
282
283
  # end
283
284
  # ```
284
- #
285
+ #
285
286
  # Print every JavaScript comment:
286
- #
287
+ #
287
288
  # ```ruby
288
289
  # Ronin::Web::Spider.domain('example.com') do |spider|
289
290
  # spider.every_javascript_comment do |comment|
@@ -291,9 +292,9 @@ module Ronin
291
292
  # end
292
293
  # end
293
294
  # ```
294
- #
295
+ #
295
296
  # Print every HTML and JavaScript comment:
296
- #
297
+ #
297
298
  # ```ruby
298
299
  # Ronin::Web::Spider.domain('example.com') do |spider|
299
300
  # spider.every_comment do |comment|
@@ -301,7 +302,7 @@ module Ronin
301
302
  # end
302
303
  # end
303
304
  # ```
304
- #
305
+ #
305
306
  module Spider
306
307
  #
307
308
  # Creates a new agent and begins spidering at the given URL.
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
3
  require 'yaml'
4
4
 
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
22
22
  gem.homepage = gemspec['homepage']
23
23
  gem.metadata = gemspec['metadata'] if gemspec['metadata']
24
24
 
25
- glob = lambda { |patterns| gem.files & Dir[*patterns] }
25
+ glob = ->(patterns) { gem.files & Dir[*patterns] }
26
26
 
27
27
  gem.files = `git ls-files`.split($/)
28
28
  gem.files = glob[gemspec['files']] if gemspec['files']
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
46
46
  gem.required_rubygems_version = gemspec['required_rubygems_version']
47
47
  gem.post_install_message = gemspec['post_install_message']
48
48
 
49
- split = lambda { |string| string.split(/,\s*/) }
49
+ split = ->(string) { string.split(/,\s*/) }
50
50
 
51
51
  if gemspec['dependencies']
52
52
  gemspec['dependencies'].each do |name,versions|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ronin-web-spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-01 00:00:00.000000000 Z
11
+ date: 2024-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: spidr
@@ -66,6 +66,7 @@ files:
66
66
  - ".github/workflows/ruby.yml"
67
67
  - ".gitignore"
68
68
  - ".rspec"
69
+ - ".rubocop.yml"
69
70
  - ".ruby-version"
70
71
  - ".yardopts"
71
72
  - COPYING.txt
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
106
  - !ruby/object:Gem::Version
106
107
  version: '0'
107
108
  requirements: []
108
- rubygems_version: 3.3.26
109
+ rubygems_version: 3.3.27
109
110
  signing_key:
110
111
  specification_version: 4
111
- summary: collection of common web spidering routines
112
+ summary: A collection of common web spidering routines.
112
113
  test_files: []