RubyGems - ronin-web-spider - Versions diffs - 0.1.0.beta1 - Mend

ronin-web-spider 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +7 -0
data/.document +5 -0
data/.github/workflows/ruby.yml +31 -0
data/.gitignore +13 -0
data/.rspec +1 -0
data/.ruby-version +1 -0
data/.yardopts +1 -0
data/COPYING.txt +165 -0
data/ChangeLog.md +19 -0
data/Gemfile +31 -0
data/README.md +139 -0
data/Rakefile +31 -0
data/gemspec.yml +27 -0
data/lib/ronin/web/spider/agent.rb +302 -0
data/lib/ronin/web/spider/archive.rb +116 -0
data/lib/ronin/web/spider/exceptions.rb +36 -0
data/lib/ronin/web/spider/git_archive.rb +194 -0
data/lib/ronin/web/spider/version.rb +27 -0
data/lib/ronin/web/spider.rb +115 -0
data/ronin-web-spider.gemspec +61 -0
data/spec/agent_spec.rb +585 -0
data/spec/archive_spec.rb +91 -0
data/spec/example_app.rb +27 -0
data/spec/git_archive_spec.rb +137 -0
data/spec/spec_helper.rb +4 -0
data/spec/spider_spec.rb +252 -0
metadata +122 -0

data/lib/ronin/web/spider/agent.rb ADDED Viewed

@@ -0,0 +1,302 @@
+#
+# ronin-web-spider - A collection of common web spidering routines.
+#
+# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# ronin-web-spider is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ronin-web-spider is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with ronin-web-spider.  If not, see <https://www.gnu.org/licenses/>.
+#
+require 'spidr/agent'
+require 'ronin/support/network/http'
+require 'ronin/support/crypto/cert'
+require 'ronin/support/text/patterns/source_code'
+require 'ronin/support/encoding/js'
+module Ronin
+  module Web
+    module Spider
+      #
+      # Extends [Spidr::Agent](https://rubydoc.info/gems/spidr/Agent).
+      #
+      class Agent < Spidr::Agent
+        #
+        # Creates a new Spider object.
+        #
+        # @param [Spidr::Proxy, Addressable::URI, URI::HTTP, Hash, String, nil] proxy
+        #   The proxy to use while spidering.
+        #
+        # @param [String, nil] user_agent
+        #   The User-Agent string to send.
+        #
+        # @param [Hash{Symbol => Object}] kwargs
+        #   Additional keyword arguments for `Spidr::Agent#initialize`.
+        #
+        # @option kwargs [String, nil] :referer
+        #   The referer URL to send.
+        #
+        # @option kwargs [Integer] :delay (0)
+        #   Duration in seconds to pause between spidering each link.
+        #
+        # @option kwargs [Array] :schemes (['http', 'https'])
+        #   The list of acceptable URI schemes to visit.
+        #   The `https` scheme will be ignored if `net/https` cannot be
+        #   loaded.
+        #
+        # @option kwargs [String, nil] :host
+        #   The host-name to visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :hosts
+        #   The patterns which match the host-names to visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :ignore_hosts
+        #   The patterns which match the host-names to not visit.
+        #
+        # @option kwargs [Array<Integer, Regexp, Proc>] :ports
+        #   The patterns which match the ports to visit.
+        #
+        # @option kwargs [Array<Integer, Regexp, Proc>] :ignore_ports
+        #   The patterns which match the ports to not visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :links
+        #   The patterns which match the links to visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :ignore_links
+        #   The patterns which match the links to not visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :exts
+        #   The patterns which match the URI path extensions to visit.
+        #
+        # @option kwargs [Array<String, Regexp, Proc>] :ignore_exts
+        #   The patterns which match the URI path extensions to not visit.
+        #
+        # @yield [agent]
+        #   If a block is given, it will be passed the newly created web spider
+        #   agent.
+        #
+        # @yieldparam [Agent] agent
+        #   The newly created web spider agent.
+        #
+        # @see https://rubydoc.info/gems/spidr/Spidr/Agent#initialize-instance_method
+        #
+        # @api public
+        #
+        def initialize(proxy:      Support::Network::HTTP.proxy,
+                       user_agent: Support::Network::HTTP.user_agent,
+                       **kwargs,
+                       &block)
+          proxy = case proxy
+                  when Addressable::URI
+                    Spidr::Proxy.new(
+                      host:     proxy.host,
+                      port:     proxy.port,
+                      user:     proxy.user,
+                      password: proxy.password
+                    )
+                  else
+                    proxy
+                  end
+          user_agent = case user_agent
+                       when Symbol
+                         Support::Network::HTTP::UserAgents[user_agent]
+                       else
+                         user_agent
+                       end
+          super(proxy: proxy, user_agent: user_agent, **kwargs,&block)
+        end
+        # The visited host names.
+        #
+        # @return [Set<String>, nil]
+        attr_reader :visited_hosts
+        #
+        # Passes every unique host name that the agent visits to the given
+        # block and populates {#visited_hosts}.
+        #
+        # @yield [host]
+        #
+        # @yieldparam [String] host
+        #
+        def every_host
+          @visited_hosts ||= Set.new
+          every_page do |page|
+            host = page.url.host
+            if @visited_hosts.add?(host)
+              yield host
+            end
+          end
+        end
+        # All certificates encountered while spidering.
+        #
+        # @return [Array<Ronin::Support::Crypto::Cert>]
+        attr_reader :collected_certs
+        #
+        # Passes every unique TLS certificate to the given block and populates
+        # {#collected_certs}.
+        #
+        # @yield [cert]
+        #
+        # @yieldparam [Ronin::Support::Crypto::Cert]
+        #
+        def every_cert
+          @collected_certs ||= []
+          serials = Set.new
+          every_page do |page|
+            if page.url.scheme == 'https'
+              cert = sessions[page.url].peer_cert
+              if serials.add?(cert.serial)
+                cert = Support::Crypto::Cert(cert)
+                @collected_certs << cert
+                yield cert
+              end
+            end
+          end
+        end
+        #
+        # Pass every favicon from every page to the given block.
+        #
+        # @yield [favicon]
+        #   The given block will be passed every encountered `.ico` file.
+        #
+        # @yieldparam [Spidr::Page] favicon
+        #   An encountered `.ico` file.
+        #
+        # @see https://rubydoc.info/gems/spidr/Spidr/Page
+        #
+        def every_favicon
+          every_page do |page|
+            yield page if page.icon?
+          end
+        end
+        #
+        # Passes every non-empty HTML comment to the given block.
+        #
+        # @yield [comment]
+        #   The given block will be passevery HTML comment.
+        #
+        # @yieldparam [String] comment
+        #   The HTML comment inner text, with leading and trailing whitespace
+        #   stripped.
+        #
+        def every_html_comment
+          every_html_page do |page|
+            page.doc.xpath('//comment()').each do |comment|
+              comment_text = comment.inner_text.strip
+              unless comment_text.empty?
+                yield comment_text
+              end
+            end
+          end
+        end
+        #
+        # Passes every piece of JavaScript to the given block.
+        #
+        # @yield [js]
+        #   The given block will be passed every piece of JavaScript source.
+        #
+        # @yieldparam [String] js
+        #   The JavaScript source code.
+        #
+        def every_javascript
+          # yield inner text of every `<script type="text/javascript">` tag
+          # and every `.js` URL.
+          every_html_page do |page|
+            page.doc.xpath('//script[@type="text/javascript"]').each do |script|
+              unless script.inner_text.empty?
+                yield script.inner_text
+              end
+            end
+          end
+          every_javascript_page do |page|
+            yield page.body
+          end
+        end
+        alias every_js every_javascript
+        #
+        # Passes every JavaScript string value to the given block.
+        #
+        # @yield [string]
+        #   The given block will be passed each JavaScript string with the quote
+        #   marks removed.
+        #
+        # @yieldparam [String] string
+        #   The parsed contents of a JavaScript string.
+        #
+        def every_javascript_string
+          every_javascript do |js|
+            js.scan(Support::Text::Patterns::STRING) do |js_string|
+              yield Support::Encoding::JS.unquote(js_string)
+            end
+          end
+        end
+        alias every_js_string every_javascript_string
+        #
+        # Passes every JavaScript comment to the given block.
+        #
+        # @yield [comment]
+        #   The given block will be passed each JavaScript comment.
+        #
+        # @yieldparam [String] comment
+        #   The contents of a JavaScript comment.
+        #
+        def every_javascript_comment(&block)
+          every_javascript do |js|
+            js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
+          end
+        end
+        alias every_js_comment every_javascript_comment
+        #
+        # Passes every HTML and JavaScript comment to the given block.
+        #
+        # @yield [comment]
+        #   The given block will be passed each HTML or JavaScript comment.
+        #
+        # @yieldparam [String] comment
+        #   The contents of a HTML or JavaScript comment.
+        #
+        # @see #every_html_comment
+        # @see #every_javascript_comment
+        #
+        def every_comment(&block)
+          every_html_comment(&block)
+          every_javascript_comment(&block)
+        end
+      end
+    end
+  end
+end

data/lib/ronin/web/spider/archive.rb ADDED Viewed

@@ -0,0 +1,116 @@
+#
+# ronin-web-spider - A collection of common web spidering routines.
+#
+# Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# ronin-web-spider is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ronin-web-spider is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with ronin-web-spider.  If not, see <https://www.gnu.org/licenses/>.
+#
+require 'fileutils'
+module Ronin
+  module Web
+    module Spider
+      #
+      # Represents a web archive directory.
+      #
+      # ## Example
+      #
+      # Spider a host and archive every web page:
+      #
+      #     Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
+      #       Ronin::Web::Spider.every_page(host: 'example.com') do |page|
+      #         archive.write(page.url,page.body)
+      #       end
+      #     end
+      #
+      class Archive
+        # The path to the archive root directory.
+        #
+        # @return [String]
+        attr_reader :root
+        #
+        # Initializes the archive.
+        #
+        # @param [String] root
+        #   The path to the root directory.
+        #
+        def initialize(root)
+          @root = File.expand_path(root)
+        end
+        #
+        # Creates the archive and the archive's directory, if it already does
+        # not exist.
+        #
+        # @param [String] root
+        #   The path to the new archive.
+        #
+        # @yield [archive]
+        #   If a block is given, it will be passed the newly created archive.
+        #
+        # @yieldparam [Archive] archive
+        #   The newly created archive.
+        #
+        # @return [GitArchive]
+        #   The newly created archive.
+        #
+        def self.open(root)
+          archive = new(root)
+          FileUtils.mkdir_p(archive.root)
+          yield archive if block_given?
+          return archive
+        end
+        #
+        # Archives a webpage.
+        #
+        # @param [URI::HTTP] url
+        #   The URL of the response.
+        #
+        # @param [String] body
+        #   The response body to save.
+        #
+        # @return [String]
+        #   The full path to the archived page.
+        #
+        def write(url,body)
+          absolute_path = File.join(@root,url.request_uri[1..])
+          absolute_path << 'index.html' if absolute_path.end_with?('/')
+          parent_dir = File.dirname(absolute_path)
+          FileUtils.mkdir_p(parent_dir) unless File.directory?(parent_dir)
+          File.write(absolute_path,body)
+          return absolute_path
+        end
+        #
+        # Converts the archive to a String.
+        #
+        # @return [String]
+        #   The path of the archive directory.
+        #
+        def to_s
+          @root
+        end
+      end
+    end
+  end
+end

data/lib/ronin/web/spider/exceptions.rb ADDED Viewed

@@ -0,0 +1,36 @@
+#
+# ronin-web-spider - A collection of common web spidering routines.
+#
+# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# ronin-web-spider is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ronin-web-spider is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with ronin-web-spider.  If not, see <https://www.gnu.org/licenses/>.
+#
+module Ronin
+  module Web
+    module Spider
+      #
+      # Error for a failed `git` command; also the superclass of {GitNotInstalled}.
+      #
+      class GitError < RuntimeError
+      end
+      #
+      # Error for when the `git` command is not installed; a kind of {GitError}.
+      #
+      class GitNotInstalled < GitError
+      end
+    end
+  end
+end

data/lib/ronin/web/spider/git_archive.rb ADDED Viewed

@@ -0,0 +1,194 @@
+#
+# ronin-web-spider - A collection of common web spidering routines.
+#
+# Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# ronin-web-spider is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ronin-web-spider is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with ronin-web-spider.  If not, see <https://www.gnu.org/licenses/>.
+#
+require 'ronin/web/spider/archive'
+require 'ronin/web/spider/exceptions'
+module Ronin
+  module Web
+    module Spider
+      #
+      # Represents a web archive directory that is backed by Git.
+      #
+      # ## Example
+      #
+      # Spider a host and archive every web page to a Git repository:
+      #
+      #     require 'ronin/web/spider/git_archive'
+      #     require 'ronin/web/spider'
+      #     require 'date'
+      #
+      #     Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
+      #       archive.commit("Updated #{Date.today}") do
+      #         Ronin::Web::Spider.every_page(host: 'example.com') do |page|
+      #           archive.write(page.url,page.body)
+      #         end
+      #       end
+      #     end
+      #
+      class GitArchive < Archive
+        #
+        # Creates the Git archive, if it already does not exist.
+        #
+        # @param [String] root
+        #   The path to the new Git archive.
+        #
+        # @yield [archive]
+        #   If a block is given, it will be passed the newly created Git
+        #   archive.
+        #
+        # @yieldparam [GitArchive] archive
+        #   The newly created Git archive.
+        #
+        # @return [GitArchive]
+        #   The newly created Git archive.
+        #
+        def self.open(root)
+          super(root) do |archive|
+            archive.init unless archive.git?
+            yield archive if block_given?
+          end
+        end
+        #
+        # Determines if the git repository has been initialized.
+        #
+        # @return [Boolean]
+        #
+        def git?
+          File.directory?(File.join(@root,'.git'))
+        end
+        #
+        # Initializes the Git repository.
+        #
+        # @return [true]
+        #   Indicates the Git repository was successfully initialized.
+        #
+        # @raise [GitError]
+        #   Indicates that the `git` command exited with an error.
+        #
+        # @raise [GitNotInstalled]
+        #   Indicates that `git` was not installed or could not be found in the
+        #   `$PATH` environment variable.
+        #
+        def init
+          git('init')
+        end
+        #
+        # Saves a webpage to the Git archive.
+        #
+        # @param [URI::HTTP] url
+        #   The URL of the response.
+        #
+        # @param [String] body
+        #   The response body to save.
+        #
+        # @return [String]
+        #   The full path to the archived page.
+        #
+        # @raise [GitError]
+        #   Indicates that the `git` command exited with an error.
+        #
+        # @raise [GitNotInstalled]
+        #   Indicates that `git` was not installed or could not be found in the
+        #   `$PATH` environment variable.
+        #
+        def write(url,body)
+          absolute_path = super(url,body)
+          git('add',absolute_path)
+          return absolute_path
+        end
+        #
+        # Commits changes to the Git archive.
+        #
+        # @param [String] message
+        #   The commit message.
+        #
+        # @yield [self]
+        #   If a block is given it will be called before committing any changes.
+        #
+        # @return [true]
+        #   Indicates whether the changes were successfully committed.
+        #
+        # @raise [GitError]
+        #   Indicates the `git` command exited with an error.
+        #
+        # @raise [GitNotInstalled]
+        #   Indicates that `git` was not installed or could not be found in the
+        #   `$PATH` environment variable.
+        #
+        # @example
+        #   archive.write(url,response.body)
+        #   archive.commit "Updated #{Date.today}"
+        #
+        # @example with a block:
+        #   archive.commit("Updated #{Date.today}") do
+        #     Ronin::Web::Spider.every_page(host: 'example.com') do |page|
+        #       archive.write(page.url,page.body)
+        #     end
+        #   end
+        #
+        def commit(message)
+          yield self if block_given?
+          git('commit','-m',message.to_s)
+        end
+        private
+        #
+        # Executes a `git` command in the archive root directory..
+        #
+        # @param [Array<String>] args
+        #   Additional arguments for the `git` command.
+        #
+        # @return [true]
+        #   Indicates that the `git` command executed successfully.
+        #
+        # @raise [GitError]
+        #   Indicates that the `git` command exited with an error.
+        #
+        # @raise [GitNotInstalled]
+        #   Indicates that `git` was not installed or could not be found in the
+        #   `$PATH` environment variable.
+        #
+        def git(*args)
+          command = ['git', '-C', @root]
+          command.concat(args)
+          case system(*command)
+          when false
+            raise(GitError,"git command failed: #{command.join(' ')}")
+          when nil
+            raise(GitNotInstalled,"the git command was not found")
+          else
+            true
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ronin/web/spider/version.rb ADDED Viewed

@@ -0,0 +1,27 @@
+#
+# ronin-web-spider - A collection of common web spidering routines.
+#
+# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+#
+# ronin-web-spider is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ronin-web-spider is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with ronin-web-spider.  If not, see <https://www.gnu.org/licenses/>.
+#
+module Ronin
+  module Web
+    module Spider
+      # The ronin-web-spider version string (currently a `beta1` pre-release).
+      VERSION = '0.1.0.beta1'
+    end
+  end
+end