ronin-web-spider 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +16 -1
- data/.rubocop.yml +11 -0
- data/ChangeLog.md +20 -1
- data/Gemfile +3 -0
- data/README.md +1 -2
- data/Rakefile +2 -2
- data/gemspec.yml +1 -1
- data/lib/ronin/web/spider/agent.rb +63 -7
- data/lib/ronin/web/spider/archive.rb +2 -1
- data/lib/ronin/web/spider/exceptions.rb +1 -0
- data/lib/ronin/web/spider/git_archive.rb +2 -1
- data/lib/ronin/web/spider/version.rb +2 -1
- data/lib/ronin/web/spider.rb +63 -62
- data/ronin-web-spider.gemspec +3 -3
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
+  data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
+  data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
data/.github/workflows/ruby.yml
CHANGED
@@ -12,11 +12,12 @@ jobs:
           - '3.0'
           - '3.1'
           - '3.2'
+          - '3.3'
           - jruby
           - truffleruby
     name: Ruby ${{ matrix.ruby }}
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
@@ -26,3 +27,17 @@ jobs:
         run: bundle install --jobs 4 --retry 3
       - name: Run tests
         run: bundle exec rake test
+
+  # rubocop linting
+  rubocop:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.0
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run rubocop
+        run: bundle exec rubocop --parallel
data/.rubocop.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,4 +1,22 @@
-### 0.1.
+### 0.1.1 / 2024-06-19
+
+* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
+  {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
+  header included `text/html` but lacked a response body, causing `page.doc` to
+  be `nil`.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
+  JavaScript source code strings containing UTF-8 characters were being
+  incorrectly encoded as ASCII-8bit strings if the page's `Content-Type` header
+  did not include a `charset=` attribute.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
+  inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
+  would incorrectly be treated as the beginning or end of JavaScript string
+  literals. Note that while this greatly improves the accuracy of
+  {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
+  support parsing JavaScript template literals that may also contain string
+  literals (ex: `` `Hello \"World\"` `` or `` `Hello ${myFunc("string literal")}` ``).
+
+### 0.1.0 / 2023-02-01
 
 * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
 * Relicensed as LGPL-3.0.
@@ -20,3 +38,4 @@
 * `every_comment` - yields every HTML or JavaScript comment.
 * Supports archiving spidered pages to a directory or git repository.
 
+[spidr]: https://github.com/postmodern/spidr#readme
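The third fix above can be exercised with the same usage pattern the library documents for `every_javascript_string`; the sketch below assumes a reachable target (`example.com` is only a placeholder) and simply prints what the improved scanner yields:

```ruby
require 'ronin/web/spider'

# Print every JavaScript string literal found while spidering a domain.
# As of 0.1.1, inline regexes such as /["'=]/ and template literals are
# skipped instead of being mistaken for string delimiters.
Ronin::Web::Spider.domain('example.com') do |spider|
  spider.every_javascript_string do |str|
    puts str
  end
end
```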
data/Gemfile
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 source 'https://rubygems.org'
 
 gemspec
@@ -28,4 +29,6 @@ group :development do
   gem 'dead_end', require: false
   gem 'sord', require: false, platform: :mri
   gem 'stackprof', require: false, platform: :mri
+  gem 'rubocop', require: false, platform: :mri
+  gem 'rubocop-ronin', require: false, platform: :mri
 end
data/README.md
CHANGED
@@ -9,7 +9,6 @@
 * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
 * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
 * [Discord](https://discord.gg/6WAb3PsVX9) |
-  [Twitter](https://twitter.com/ronin_rb) |
   [Mastodon](https://infosec.exchange/@ronin_rb)
 
 ## Description
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
 * [every_comment][docs-every_comment] - yields every HTML or JavaScript
   comment.
 * Supports archiving spidered pages to a directory or git repository.
-* Has
+* Has 97% documentation coverage.
 * Has 94% test coverage.
 
 [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
data/lib/ronin/web/spider/agent.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -237,6 +238,8 @@ module Ronin
       #
       def every_html_comment
         every_html_page do |page|
+          next unless page.doc
+
           page.doc.xpath('//comment()').each do |comment|
             comment_text = comment.inner_text.strip
 
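The added `next unless page.doc` guard covers the case called out in the ChangeLog: a response whose `Content-Type` includes `text/html` but whose body is empty leaves `page.doc` as `nil`. A minimal usage sketch, mirroring the doc-comment examples elsewhere in this diff (the spidered domain is a placeholder):

```ruby
require 'ronin/web/spider'

# Print every HTML comment seen while spidering a domain. Empty text/html
# responses are now skipped by the guard instead of raising on a nil document.
Ronin::Web::Spider.domain('example.com') do |spider|
  spider.every_html_comment do |comment|
    puts comment
  end
end
```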
@@ -267,20 +270,60 @@ module Ronin
         # yield inner text of every `<script type="text/javascript">` tag
         # and every `.js` URL.
         every_html_page do |page|
+          next unless page.doc
+
           page.doc.xpath('//script[@type="text/javascript"]').each do |script|
-
-
+            source = script.inner_text
+            source.force_encoding(Encoding::UTF_8)
+
+            unless source.empty?
+              yield source
             end
           end
         end
 
         every_javascript_page do |page|
-
+          source = page.body
+          source.force_encoding(Encoding::UTF_8)
+
+          yield source
         end
       end
 
       alias every_js every_javascript
 
+      # Regex to match and skip JavaScript inline regexes.
+      #
+      # @api private
+      #
+      # @since 0.1.1
+      JAVASCRIPT_INLINE_REGEX = %r{
+        (?# match before the regex to avoid matching division operators )
+        (?:[\{\[\(;:,]\s*|=\s*)
+        /
+        (?# inline regex contents )
+        (?:
+          \[ (?:\\. | [^\]]) \] (?# [...] ) |
+          \\.                   (?# backslash escaped characters ) |
+          [^/]                  (?# everything else )
+        )+
+        /[dgimsuvy]* (?# also match any regex flags )
+      }mx
+
+      # Regex to match and skip JavaScript template literals.
+      #
+      # @note
+      #   This regex will not properly match nested template literals:
+      #
+      #   ```javascript
+      #   `foo ${`bar ${1+1}`}`
+      #   ```
+      #
+      # @api private
+      #
+      # @since 0.1.1
+      JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
+
       #
       # Passes every JavaScript string value to the given block.
       #
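The two `force_encoding(Encoding::UTF_8)` calls address the encoding bug from the ChangeLog: when a response declares no `charset=`, Ruby tags the body as `ASCII-8BIT`, so multi-byte UTF-8 characters surface as raw bytes. A small, spider-independent illustration of what the relabeling does:

```ruby
# Bytes of "café" as they might arrive in a body with no declared charset;
# Ruby tags such strings as ASCII-8BIT (binary).
body = "caf\xC3\xA9".b
body.encoding.name #=> "ASCII-8BIT"

# force_encoding relabels the string in place without transcoding its bytes,
# which is what the agent now does before yielding JavaScript source.
body.force_encoding(Encoding::UTF_8)
body.encoding.name #=> "UTF-8"
body               #=> "café"
```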
@@ -293,15 +336,28 @@ module Ronin
       #
       # @example
       #   spider.every_javascript_string do |str|
-      #
-      #
+      #     puts str
+      #   end
       #
       # @api public
       #
       def every_javascript_string
         every_javascript do |js|
-
-
+          scanner = StringScanner.new(js)
+
+          until scanner.eos?
+            # NOTE: this is a naive JavaScript string scanner and should
+            # eventually be replaced with a real JavaScript lexer or parser.
+            case scanner.peek(1)
+            when '"', "'" # beginning of a quoted string
+              js_string = scanner.scan(Support::Text::Patterns::STRING)
+
+              yield Support::Encoding::JS.unquote(js_string)
+            else
+              scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
+                scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
+                scanner.getch
+            end
           end
         end
       end
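To make the scanning strategy concrete, here is a self-contained sketch of the same idea built only on Ruby's `strscan`. The string pattern and the bare quote-stripping below are simplified stand-ins for ronin-support's `Support::Text::Patterns::STRING` and `Support::Encoding::JS.unquote`, so treat it as an approximation of the technique rather than the gem's exact implementation:

```ruby
require 'strscan'

# Simplified stand-ins for the patterns the agent uses (assumptions, not the
# ronin-support definitions).
JS_STRING        = /"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'/
INLINE_REGEX     = %r{(?:[\{\[\(;:,]\s*|=\s*)/(?:\\.|\[(?:\\.|[^\]])*\]|[^/\n])+/[dgimsuvy]*}
TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m

# Yields every quoted string literal in `js`, skipping inline regexes and
# template literals so quotes inside them are not mistaken for delimiters.
def each_js_string(js)
  scanner = StringScanner.new(js)

  until scanner.eos?
    case scanner.peek(1)
    when '"', "'"
      if (literal = scanner.scan(JS_STRING))
        yield literal[1..-2] # strip the surrounding quotes (no unescaping here)
      else
        scanner.getch # unterminated string; skip the quote character
      end
    else
      scanner.skip(INLINE_REGEX) ||
        scanner.skip(TEMPLATE_LITERAL) ||
        scanner.getch
    end
  end
end

js = %{var sep = /["'=]/; var msg = "hello"; var name = 'world';}
each_js_string(js) { |str| puts str } # prints "hello" then "world"
```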
data/lib/ronin/web/spider/archive.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -31,7 +32,7 @@ module Ronin
 #
 # require 'ronin/web/spider'
 # require 'ronin/web/spider/archive'
-#
+#
 # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
 #   Ronin::Web::Spider.every_page(host: 'example.com') do |page|
 #     archive.write(page.url,page.body)
data/lib/ronin/web/spider/git_archive.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -33,7 +34,7 @@ module Ronin
 # require 'ronin/web/spider'
 # require 'ronin/web/spider/git_archive'
 # require 'date'
-#
+#
 # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
 #   archive.commit("Updated #{Date.today}") do
 #     Ronin::Web::Spider.every_page(host: 'example.com') do |page|
data/lib/ronin/web/spider/version.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -21,7 +22,7 @@ module Ronin
   module Web
     module Spider
       # ronin-web-spider version
-      VERSION = '0.1.0'
+      VERSION = '0.1.1'
     end
   end
 end
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -30,136 +31,136 @@ module Ronin
 # ## Examples
 #
 # Spider a host:
-#
+#
 # ```ruby
 # require 'ronin/web/spider'
-#
+#
 # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a host:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('solnic.eu') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a domain (and any sub-domains):
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a site:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider multiple hosts:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider certain links:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider links on certain ports:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider links blacklisted in robots.txt:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Print out visited URLs:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
 #   spider.every_url { |url| puts url }
 # end
 # ```
-#
+#
 # Build a URL map of a site:
-#
+#
 # ```ruby
 # url_map = Hash.new { |hash,key| hash[key] = [] }
-#
+#
 # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
 #   spider.every_link do |origin,dest|
 #     url_map[dest] << origin
 #   end
 # end
 # ```
-#
+#
 # Print out the URLs that could not be requested:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/') do |spider|
 #   spider.every_failed_url { |url| puts url }
 # end
 # ```
-#
+#
 # Finds all pages which have broken links:
-#
+#
 # ```ruby
 # url_map = Hash.new { |hash,key| hash[key] = [] }
-#
+#
 # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
 #   spider.every_link do |origin,dest|
 #     url_map[dest] << origin
 #   end
 # end
-#
+#
 # spider.failures.each do |url|
 #   puts "Broken link #{url} found in:"
-#
+#
 #   url_map[url].each { |page| puts "  #{page}" }
 # end
 # ```
-#
+#
 # Search HTML and XML pages:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/') do |spider|
 #   spider.every_page do |page|
 #     puts ">>> #{page.url}"
-#
+#
 #     page.search('//meta').each do |meta|
 #       name = (meta.attributes['name'] || meta.attributes['http-equiv'])
 #       value = meta.attributes['content']
-#
+#
 #       puts "  #{name} = #{value}"
 #     end
 #   end
 # end
 # ```
-#
+#
 # Print out the titles from every page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
 #   spider.every_html_page do |page|
@@ -167,9 +168,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print out every HTTP redirect:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_redirect_page do |page|
@@ -177,21 +178,21 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Find what kinds of web servers a host is using, by accessing the headers:
-#
+#
 # ```ruby
 # servers = Set[]
-#
+#
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.all_headers do |headers|
 #     servers << headers['server']
 #   end
 # end
 # ```
-#
+#
 # Pause the spider on a forbidden page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_forbidden_page do |page|
@@ -199,9 +200,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Skip the processing of a page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_missing_page do |page|
@@ -209,9 +210,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Skip the processing of links:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_url do |url|
@@ -221,9 +222,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Detect when a new host name is spidered:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_host do |host|
@@ -231,9 +232,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Detect when a new SSL/TLS certificate is encountered:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_cert do |cert|
@@ -241,9 +242,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print the MD5 checksum of every `favicon.ico` file:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_favicon do |page|
@@ -251,9 +252,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every HTML comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_html_comment do |comment|
@@ -261,9 +262,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print all JavaScript source code:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript do |js|
@@ -271,9 +272,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every JavaScript string literal:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript_string do |str|
@@ -281,9 +282,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every JavaScript comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript_comment do |comment|
@@ -291,9 +292,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every HTML and JavaScript comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_comment do |comment|
@@ -301,7 +302,7 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 module Spider
   #
   # Creates a new agent and begin spidering at the given URL.
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
-#
+# frozen_string_literal: true
 
 require 'yaml'
 
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
   gem.homepage = gemspec['homepage']
   gem.metadata = gemspec['metadata'] if gemspec['metadata']
 
-  glob =
+  glob = ->(patterns) { gem.files & Dir[*patterns] }
 
   gem.files = `git ls-files`.split($/)
   gem.files = glob[gemspec['files']] if gemspec['files']
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
   gem.required_rubygems_version = gemspec['required_rubygems_version']
   gem.post_install_message = gemspec['post_install_message']
 
-  split =
+  split = ->(string) { string.split(/,\s*/) }
 
   if gemspec['dependencies']
     gemspec['dependencies'].each do |name,versions|
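The two lambdas introduced here replace earlier helper definitions whose bodies are not captured in this diff. They are invoked with `[]`, which is shorthand for `#call`, as in `glob[gemspec['files']]` above. A tiny illustration of the `split` helper with a made-up version string:

```ruby
# `split` as defined in the gemspec: break a comma-separated version
# constraint string into its parts.
split = ->(string) { string.split(/,\s*/) }

split['>= 0.1.0, < 2.0'] #=> [">= 0.1.0", "< 2.0"]
```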
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ronin-web-spider
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2024-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -66,6 +66,7 @@ files:
 - ".github/workflows/ruby.yml"
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".ruby-version"
 - ".yardopts"
 - COPYING.txt
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.3.
+rubygems_version: 3.3.27
 signing_key:
 specification_version: 4
-summary: collection of common web spidering routines
+summary: A collection of common web spidering routines.
 test_files: []