RubyGems - ronin-web-spider - Versions diffs - 0.1.0 → 0.2.0.rc1 - Mend

ronin-web-spider 0.1.0 → 0.2.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +16 -1
data/.rubocop.yml +11 -0
data/ChangeLog.md +35 -1
data/Gemfile +3 -0
data/README.md +12 -3
data/Rakefile +2 -2
data/gemspec.yml +1 -1
data/lib/ronin/web/spider/agent.rb +311 -15
data/lib/ronin/web/spider/archive.rb +2 -1
data/lib/ronin/web/spider/exceptions.rb +2 -1
data/lib/ronin/web/spider/git_archive.rb +2 -1
data/lib/ronin/web/spider/version.rb +3 -2
data/lib/ronin/web/spider.rb +64 -63
data/ronin-web-spider.gemspec +3 -3
metadata +5 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fcb3d69132ae37799758c37282083f3b876e04e76aa3ab9f500f251b7df0984d
-  data.tar.gz: 04b92b26f1bcd6166530ddfe225cde18a4bbaa8a1eb3b395120ae1e6b41aec4b
+  metadata.gz: e1637185a37e17f587cab3bb0ec451dfa763ab58b167404ec5b1161a4a80316f
+  data.tar.gz: ea11da89c3c232feaca90c103c45cdb653eac9b94c6c97be34998d6b5089e896
 SHA512:
-  metadata.gz: e5cc4d39ac8e5f9d92edd240e836d5848f0b96798afbcab9c8116f8223142851d835b7bfd3e7a8d94e867951c4b995e0a66736a73b72d6a96f06fee6daf26bc9
-  data.tar.gz: 4f1facfbdffe1aca7fd0d10ff0c99d6f835b2633e94be49011b46127ca9cc7b76415930d5df0a961516000032b940f00e224c562923c06412c57f2896e50256f
+  metadata.gz: 4d2f5ac7b650096856b87f5e37cf42044a12db416eab2375715a39073606a1f4c30103fda27cb1faaf9a53533bdf323ae9912787078f24f120366fb519a3b4a1
+  data.tar.gz: b2ae98ce51187a5a65cd2355816b98d1916edb0d731f158e83a9366ec70261ef2e3eae9d5f3abe95b311a766c489064f855b4ee6afa4606592a5f9b560fbcb9d

data/.github/workflows/ruby.yml CHANGED Viewed

@@ -12,11 +12,12 @@ jobs:
           - '3.0'
           - '3.1'
           - '3.2'
+          - '3.3'
           - jruby
           - truffleruby
     name: Ruby ${{ matrix.ruby }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
@@ -26,3 +27,17 @@ jobs:
         run: bundle install --jobs 4 --retry 3
       - name: Run tests
         run: bundle exec rake test
+  # rubocop linting
+  rubocop:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.0
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run rubocop
+        run: bundle exec rubocop --parallel

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,11 @@
+AllCops:
+  NewCops: enable
+  SuggestExtensions: false
+  TargetRubyVersion: 3.1
+inherit_gem:
+  rubocop-ronin: rubocop.yml
+#
+# ronin-web-spider specific exceptions
+#

data/ChangeLog.md CHANGED Viewed

@@ -1,4 +1,37 @@
-### 0.1.0 / 2023-XX-XX
+### 0.2.0 / 2024-XX-XX
+* Added {Ronin::Web::Spider::Agent#every_javascript_url_string}.
+* Added {Ronin::Web::Spider::Agent#every_javascript_relative_path_string}.
+* Added {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string}.
+* Added {Ronin::Web::Spider::Agent#every_javascript_path_string}.
+* Allow {Ronin::Web::Spider::Agent#every_html_comment},
+  {Ronin::Web::Spider::Agent#every_javascript every_javascript},
+  {Ronin::Web::Spider::Agent#every_javascript_string every_javascript_string},
+  {Ronin::Web::Spider::Agent#every_javascript_relative_path_string every_javascript_relative_path_string},
+  {Ronin::Web::Spider::Agent#every_javascript_absolute_path_string every_javascript_absolute_path_string},
+  {Ronin::Web::Spider::Agent#every_javascript_url_string every_javascript_url_string}, and
+  {Ronin::Web::Spider::Agent#every_javascript_comment every_javascript_comment}
+  to also yield a `Spidr::Page` block argument for additional context.
+### 0.1.1 / 2024-06-19
+* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
+  {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
+  header included `text/html` but lacked a response body, causing `page.doc` to
+  be `nil`.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
+  JavaScript source code strings containing UTF-8 characters were being
+  incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
+  did not include a `charset=` attribute.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
+  inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
+  would incorrectly be treated as the beginning or ends of JavaScript string
+  literals. Note that while this greatly improves the accuracy of
+  {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
+  support parsing JavaScript template literals that may also contain string
+  literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
+### 0.1.0 / 2023-02-01
 * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
 * Relicensed as LGPL-3.0.
@@ -20,3 +53,4 @@
     * `every_comment` - yields every HTML or JavaScript comment.
   * Supports archiving spidered pages to a directory or git repository.
+[spidr]: https://github.com/postmodern/spidr#readme

data/Gemfile CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 source 'https://rubygems.org'
 gemspec
@@ -28,4 +29,6 @@ group :development do
   gem 'dead_end',        require: false
   gem 'sord',            require: false, platform: :mri
   gem 'stackprof',       require: false, platform: :mri
+  gem 'rubocop',         require: false, platform: :mri
+  gem 'rubocop-ronin',   require: false, platform: :mri
 end

data/README.md CHANGED Viewed

@@ -9,7 +9,6 @@
 * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
 * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
 * [Discord](https://discord.gg/6WAb3PsVX9) |
-  [Twitter](https://twitter.com/ronin_rb) |
   [Mastodon](https://infosec.exchange/@ronin_rb)
 ## Description
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
   * [every_comment][docs-every_comment] - yields every HTML or JavaScript
     comment.
 * Supports archiving spidered pages to a directory or git repository.
-* Has 94% documentation coverage.
+* Has 97% documentation coverage.
 * Has 94% test coverage.
 [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
@@ -305,6 +304,16 @@ Ronin::Web::Spider.domain('example.com') do |spider|
 end
 ```
+Print every JavaScript URL string literal:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_javascript_url_string do |url|
+    puts url
+  end
+end
+```
 Print every JavaScript comment:
 ```ruby
@@ -391,7 +400,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
 ## License
-Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 ronin-web-spider is free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published

data/Rakefile CHANGED Viewed

@@ -1,11 +1,11 @@
-require 'rubygems'
+# frozen_string_literal: true
 begin
   require 'bundler'
 rescue LoadError => e
   warn e.message
   warn "Run `gem install bundler` to install Bundler"
-  exit -1
+  exit(-1)
 end
 begin

data/gemspec.yml CHANGED Viewed

@@ -1,5 +1,5 @@
 name: ronin-web-spider
-summary: collection of common web spidering routines
+summary: A collection of common web spidering routines.
 description:
   ronin-web-spider is a collection of common web spidering routines using the
   spidr gem.

data/lib/ronin/web/spider/agent.rb CHANGED Viewed

@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # ronin-web-spider is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published
@@ -22,6 +23,7 @@ require 'spidr/agent'
 require 'ronin/support/network/http'
 require 'ronin/support/crypto/cert'
 require 'ronin/support/text/patterns/source_code'
+require 'ronin/support/text/patterns/network'
 require 'ronin/support/encoding/js'
 module Ronin
@@ -224,10 +226,17 @@ module Ronin
         # @yield [comment]
         #   The given block will be passed every HTML comment.
         #
+        # @yield [comment, page]
+        #   If the block accepts two arguments, the HTML comment and the page
+        #   that the comment was found on will be passed to the given block.
+        #
         # @yieldparam [String] comment
         #   The HTML comment inner text, with leading and trailing whitespace
         #   stripped.
         #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the HTML comment exists on.
+        #
         # @example
         #   spider.every_html_comment do |comment|
         #     puts comment
@@ -235,13 +244,19 @@ module Ronin
         #
         # @api public
         #
-        def every_html_comment
+        def every_html_comment(&block)
           every_html_page do |page|
+            next unless page.doc
             page.doc.xpath('//comment()').each do |comment|
               comment_text = comment.inner_text.strip
               unless comment_text.empty?
-                yield comment_text
+                if block.arity == 2
+                  yield comment_text, page
+                else
+                  yield comment_text
+                end
               end
             end
           end
@@ -253,9 +268,17 @@ module Ronin
         # @yield [js]
         #   The given block will be passed every piece of JavaScript source.
         #
+        # @yield [js, page]
+        #   If the block accepts two arguments, the JavaScript source and the
+        #   page that the JavaScript source was found on will be passed to the
+        #   given block.
+        #
         # @yieldparam [String] js
         #   The JavaScript source code.
         #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript source was found in or on.
+        #
         # @example
         #   spider.every_javascript do |js|
         #     puts js
@@ -263,24 +286,72 @@ module Ronin
         #
         # @api public
         #
-        def every_javascript
+        def every_javascript(&block)
           # yield inner text of every `<script type="text/javascript">` tag
           # and every `.js` URL.
           every_html_page do |page|
+            next unless page.doc
             page.doc.xpath('//script[@type="text/javascript"]').each do |script|
-              unless script.inner_text.empty?
-                yield script.inner_text
+              source = script.inner_text
+              source.force_encoding(Encoding::UTF_8)
+              unless source.empty?
+                if block.arity == 2
+                  yield source, page
+                else
+                  yield source
+                end
               end
             end
           end
           every_javascript_page do |page|
-            yield page.body
+            source = page.body
+            source.force_encoding(Encoding::UTF_8)
+            if block.arity == 2
+              yield source, page
+            else
+              yield source
+            end
           end
         end
         alias every_js every_javascript
+        # Regex to match and skip JavaScript inline regexes.
+        #
+        # @api private
+        #
+        # @since 0.1.1
+        JAVASCRIPT_INLINE_REGEX = %r{
+          (?# match before the regex to avoid matching division operators )
+          (?:[\{\[\(;:,]\s*|=\s*)
+          /
+            (?# inline regex contents )
+            (?:
+              \[ (?:\\. | [^\]]) \] (?# [...] ) |
+              \\.                   (?# backslash escaped characters ) |
+              [^/]                  (?# everything else )
+            )+
+          /[dgimsuvy]* (?# also match any regex flags )
+        }mx
+        # Regex to match and skip JavaScript template literals.
+        #
+        # @note
+        #   This regex will not properly match nested template literals:
+        #
+        #   ```javascript
+        #   `foo ${`bar ${1+1}`}`
+        #   ```
+        #
+        # @api private
+        #
+        # @since 0.1.1
+        JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
         #
         # Passes every JavaScript string value to the given block.
         #
@@ -288,35 +359,246 @@ module Ronin
         #   The given block will be passed each JavaScript string with the quote
         #   marks removed.
         #
+        # @yield [string, page]
+        #   If the block accepts two arguments, the JavaScript string and the
+        #   page that the JavaScript string was found on will be passed to the
+        #   given block.
+        #
         # @yieldparam [String] string
         #   The parsed contents of a JavaScript string.
         #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript string was found in or on.
+        #
         # @example
         #   spider.every_javascript_string do |str|
-        #    puts str
-        #  end
+        #     puts str
+        #   end
         #
         # @api public
         #
-        def every_javascript_string
-          every_javascript do |js|
-            js.scan(Support::Text::Patterns::STRING) do |js_string|
-              yield Support::Encoding::JS.unquote(js_string)
+        def every_javascript_string(&block)
+          every_javascript do |js,page|
+            scanner = StringScanner.new(js)
+            until scanner.eos?
+              # NOTE: this is a naive JavaScript string scanner and should
+              # eventually be replaced with a real JavaScript lexer or parser.
+              case scanner.peek(1)
+              when '"', "'" # beginning of a quoted string
+                js_string = scanner.scan(Support::Text::Patterns::STRING)
+                string    = Support::Encoding::JS.unquote(js_string)
+                if block.arity == 2
+                  yield string, page
+                else
+                  yield string
+                end
+              else
+                scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
+                  scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
+                  scanner.getch
+              end
             end
           end
         end
         alias every_js_string every_javascript_string
+        # Regular expression that matches relative paths within JavaScript.
+        #
+        # @note
+        #   This matches `foo/bar`, `foo/bar.ext`, `../foo`, and `foo.ext`,
+        #   but *not* `/foo`, `foo`, or `foo.`.
+        JAVASCRIPT_RELATIVE_PATH = %r{
+          \A
+            (?:
+               [^/\\. ]+\.[a-z0-9]+ (?# filename.ext)
+               |
+               [^/\\ ]+(?:/[^/\\ ]+)+ (?# dir/filename or dir/filename.ext)
+            )
+          \z
+        }x
+        #
+        # Passes every JavaScript relative path string to the given block.
+        #
+        # @yield [string]
+        #   The given block will be passed each JavaScript relative path string
+        #   with the quote marks removed.
+        #
+        # @yield [string, page]
+        #   If the block accepts two arguments, the JavaScript relative path
+        #   string and the page that the JavaScript relative path string was
+        #   found on will be passed to the given block.
+        #
+        # @yieldparam [String] string
+        #   The parsed contents of a literal JavaScript relative path string.
+        #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript relative path string was found in or
+        #   on.
+        #
+        # @example
+        #   spider.every_javascript_relative_path_string do |relative_path|
+        #     puts relative_path
+        #   end
+        #
+        # @api public
+        #
+        # @since 0.2.0
+        #
+        def every_javascript_relative_path_string(&block)
+          every_javascript_string do |string,page|
+            if string =~ JAVASCRIPT_RELATIVE_PATH
+              if block.arity == 2
+                yield string, page
+              else
+                yield string
+              end
+            end
+          end
+        end
+        alias every_js_relative_path_string every_javascript_relative_path_string
+        # Regular expression that matches absolute paths within JavaScript.
+        JAVASCRIPT_ABSOLUTE_PATH = %r{\A(?:/[^/\\ ]+)+\z}
+        #
+        # Passes every JavaScript absolute path string to the given block.
+        #
+        # @yield [string]
+        #   The given block will be passed each JavaScript absolute path string
+        #   with the quote marks removed.
+        #
+        # @yield [string, page]
+        #   If the block accepts two arguments, the JavaScript absolute path
+        #   string and the page that the JavaScript absolute path string was
+        #   found on will be passed to the given block.
+        #
+        # @yieldparam [String] string
+        #   The parsed contents of a literal JavaScript absolute path string.
+        #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript absolute path string was found in or
+        #   on.
+        #
+        # @example
+        #   spider.every_javascript_absolute_path_string do |absolute_path|
+        #     puts absolute_path
+        #   end
+        #
+        # @api public
+        #
+        # @since 0.2.0
+        #
+        def every_javascript_absolute_path_string(&block)
+          every_javascript_string do |string,page|
+            if string =~ JAVASCRIPT_ABSOLUTE_PATH
+              if block.arity == 2
+                yield string, page
+              else
+                yield string
+              end
+            end
+          end
+        end
+        alias every_js_absolute_path_string every_javascript_absolute_path_string
+        #
+        # Passes every JavaScript path string to the given block.
+        #
+        # @yield [string]
+        #   The given block will be passed each JavaScript path string with the
+        #   quote marks removed.
+        #
+        # @yield [string, page]
+        #   If the block accepts two arguments, the JavaScript path string and
+        #   the page that the JavaScript path string was found on will be
+        #   passed to the given block.
+        #
+        # @yieldparam [String] string
+        #   The parsed contents of a literal JavaScript path string.
+        #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript path string was found in or on.
+        #
+        # @example
+        #   spider.every_javascript_path_string do |path|
+        #     puts path
+        #   end
+        #
+        # @api public
+        #
+        # @since 0.2.0
+        #
+        def every_javascript_path_string(&block)
+          every_javascript_relative_path_string(&block)
+          every_javascript_absolute_path_string(&block)
+        end
+        alias every_js_path_string every_javascript_path_string
+        #
+        # Passes every JavaScript URL string to the given block.
+        #
+        # @yield [string]
+        #   The given block will be passed each JavaScript URL string with the
+        #   quote marks removed.
+        #
+        # @yield [string, page]
+        #   If the block accepts two arguments, the JavaScript URL string and
+        #   the page that the JavaScript URL string was found on will be passed
+        #   to the given block.
+        #
+        # @yieldparam [String] string
+        #   The parsed contents of a literal JavaScript URL string.
+        #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript URL string was found in or on.
+        #
+        # @example
+        #   spider.every_javascript_url_string do |url|
+        #     puts url
+        #   end
+        #
+        # @api public
+        #
+        # @since 0.2.0
+        #
+        def every_javascript_url_string(&block)
+          every_javascript_string do |string,page|
+            if string =~ Support::Text::Patterns::URL
+              if block.arity == 2
+                yield string, page
+              else
+                yield string
+              end
+            end
+          end
+        end
+        alias every_js_url_string every_javascript_url_string
         #
         # Passes every JavaScript comment to the given block.
         #
         # @yield [comment]
         #   The given block will be passed each JavaScript comment.
         #
+        # @yield [comment, page]
+        #   If the block accepts two arguments, the JavaScript comment and the
+        #   page that the JavaScript comment was found on will be passed to the
+        #   given block.
+        #
         # @yieldparam [String] comment
         #   The contents of a JavaScript comment.
         #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the JavaScript comment was found in or on.
+        #
         # @example
         #   spider.every_javascript_comment do |comment|
         #     puts comment
@@ -325,8 +607,14 @@ module Ronin
         # @api public
         #
         def every_javascript_comment(&block)
-          every_javascript do |js|
-            js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
+          every_javascript do |js,page|
+            js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT) do |comment|
+              if block.arity == 2
+                yield comment, page
+              else
+                yield comment
+              end
+            end
           end
         end
@@ -338,9 +626,17 @@ module Ronin
         # @yield [comment]
         #   The given block will be passed each HTML or JavaScript comment.
         #
+        # @yield [comment, page]
+        #   If the block accepts two arguments, the HTML or JavaScript comment
+        #   and the page that the HTML/JavaScript comment was found on will be
+        #   passed to the given block.
+        #
         # @yieldparam [String] comment
         #   The contents of a HTML or JavaScript comment.
         #
+        # @yieldparam [Spidr::Page] page
+        #   The page that the HTML or JavaScript comment was found in or on.
+        #
         # @example
         #   spider.every_comment do |comment|
         #     puts comment

data/lib/ronin/web/spider/archive.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -31,7 +32,7 @@ module Ronin
       #
       #     require 'ronin/web/spider'
       #     require 'ronin/web/spider/archive'
-      #
+      #
       #     Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
       #       Ronin::Web::Spider.every_page(host: 'example.com') do |page|
       #         archive.write(page.url,page.body)

data/lib/ronin/web/spider/exceptions.rb CHANGED Viewed

@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # ronin-web-spider is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published

data/lib/ronin/web/spider/git_archive.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -33,7 +34,7 @@ module Ronin
       #     require 'ronin/web/spider'
       #     require 'ronin/web/spider/git_archive'
       #     require 'date'
-      #
+      #
       #     Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
       #       archive.commit("Updated #{Date.today}") do
       #         Ronin::Web::Spider.every_page(host: 'example.com') do |page|

data/lib/ronin/web/spider/version.rb CHANGED Viewed

@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # ronin-web-spider is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,7 @@ module Ronin
   module Web
     module Spider
       # ronin-web-spider version
-      VERSION = '0.1.0'
+      VERSION = '0.2.0.rc1'
     end
   end
 end

data/lib/ronin/web/spider.rb CHANGED Viewed

@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # ronin-web-spider is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published
@@ -30,136 +31,136 @@ module Ronin
     # ## Examples
     #
     # Spider a host:
-    #
+    #
     # ```ruby
     # require 'ronin/web/spider'
-    #
+    #
     # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Spider a host:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.host('solnic.eu') do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Spider a domain (and any sub-domains):
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Spider a site:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Spider multiple hosts:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Do not spider certain links:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Do not spider links on certain ports:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Do not spider links blacklisted in robots.txt:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
     #   # ...
     # end
     # ```
-    #
+    #
     # Print out visited URLs:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
     #   spider.every_url { |url| puts url }
     # end
     # ```
-    #
+    #
     # Build a URL map of a site:
-    #
+    #
     # ```ruby
     # url_map = Hash.new { |hash,key| hash[key] = [] }
-    #
+    #
     # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
     #   spider.every_link do |origin,dest|
     #     url_map[dest] << origin
     #   end
     # end
     # ```
-    #
+    #
     # Print out the URLs that could not be requested:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://company.com/') do |spider|
     #   spider.every_failed_url { |url| puts url }
     # end
     # ```
-    #
+    #
     # Finds all pages which have broken links:
-    #
+    #
     # ```ruby
     # url_map = Hash.new { |hash,key| hash[key] = [] }
-    #
+    #
     # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
     #   spider.every_link do |origin,dest|
     #     url_map[dest] << origin
     #   end
     # end
-    #
+    #
     # spider.failures.each do |url|
     #   puts "Broken link #{url} found in:"
-    #
+    #
     #   url_map[url].each { |page| puts "  #{page}" }
     # end
     # ```
-    #
+    #
     # Search HTML and XML pages:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('http://company.com/') do |spider|
     #   spider.every_page do |page|
     #     puts ">>> #{page.url}"
-    #
+    #
     #     page.search('//meta').each do |meta|
     #       name = (meta.attributes['name'] || meta.attributes['http-equiv'])
     #       value = meta.attributes['content']
-    #
+    #
     #       puts "  #{name} = #{value}"
     #     end
     #   end
     # end
     # ```
-    #
+    #
     # Print out the titles from every page:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
     #   spider.every_html_page do |page|
@@ -167,9 +168,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print out every HTTP redirect:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.host('company.com') do |spider|
     #   spider.every_redirect_page do |page|
@@ -177,21 +178,21 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Find what kinds of web servers a host is using, by accessing the headers:
-    #
+    #
     # ```ruby
     # servers = Set[]
-    #
+    #
     # Ronin::Web::Spider.host('company.com') do |spider|
     #   spider.all_headers do |headers|
     #     servers << headers['server']
     #   end
     # end
     # ```
-    #
+    #
     # Pause the spider on a forbidden page:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.host('company.com') do |spider|
     #   spider.every_forbidden_page do |page|
@@ -199,9 +200,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Skip the processing of a page:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.host('company.com') do |spider|
     #   spider.every_missing_page do |page|
@@ -209,9 +210,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Skip the processing of links:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.host('company.com') do |spider|
     #   spider.every_url do |url|
@@ -221,9 +222,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Detect when a new host name is spidered:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_host do |host|
@@ -231,9 +232,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Detect when a new SSL/TLS certificate is encountered:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_cert do |cert|
@@ -241,9 +242,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print the MD5 checksum of every `favicon.ico` file:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_favicon do |page|
@@ -251,9 +252,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print every HTML comment:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_html_comment do |comment|
@@ -261,9 +262,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print all JavaScript source code:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_javascript do |js|
@@ -271,9 +272,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print every JavaScript string literal:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_javascript_string do |str|
@@ -281,9 +282,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print every JavaScript comment:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_javascript_comment do |comment|
@@ -291,9 +292,9 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     # Print every HTML and JavaScript comment:
-    #
+    #
     # ```ruby
     # Ronin::Web::Spider.domain('example.com') do |spider|
     #   spider.every_comment do |comment|
@@ -301,7 +302,7 @@ module Ronin
     #   end
     # end
     # ```
-    #
+    #
     module Spider
       #
       # Creates a new agent and begin spidering at the given URL.

data/ronin-web-spider.gemspec CHANGED Viewed

@@ -1,4 +1,4 @@
-# encoding: utf-8
+# frozen_string_literal: true
 require 'yaml'
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
   gem.homepage    = gemspec['homepage']
   gem.metadata    = gemspec['metadata'] if gemspec['metadata']
-  glob = lambda { |patterns| gem.files & Dir[*patterns] }
+  glob = ->(patterns) { gem.files & Dir[*patterns] }
   gem.files  = `git ls-files`.split($/)
   gem.files  = glob[gemspec['files']] if gemspec['files']
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
   gem.required_rubygems_version = gemspec['required_rubygems_version']
   gem.post_install_message      = gemspec['post_install_message']
-  split = lambda { |string| string.split(/,\s*/) }
+  split = ->(string) { string.split(/,\s*/) }
   if gemspec['dependencies']
     gemspec['dependencies'].each do |name,versions|

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ronin-web-spider
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0.rc1
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-01 00:00:00.000000000 Z
+date: 2024-06-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -66,6 +66,7 @@ files:
 - ".github/workflows/ruby.yml"
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".ruby-version"
 - ".yardopts"
 - COPYING.txt
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.3.26
+rubygems_version: 3.3.27
 signing_key:
 specification_version: 4
-summary: collection of common web spidering routines
+summary: A collection of common web spidering routines.
 test_files: []