RubyGems - ronin-web-spider - Versions diffs - 0.1.0.beta2 → 0.1.1 - Mend

ronin-web-spider 0.1.0.beta2 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +17 -5
data/.rubocop.yml +11 -0
data/.yardopts +1 -1
data/ChangeLog.md +23 -1
data/Gemfile +3 -0
data/README.md +303 -32
data/Rakefile +2 -2
data/gemspec.yml +4 -4
data/lib/ronin/web/spider/agent.rb +123 -7
data/lib/ronin/web/spider/archive.rb +4 -0
data/lib/ronin/web/spider/exceptions.rb +2 -1
data/lib/ronin/web/spider/git_archive.rb +3 -2
data/lib/ronin/web/spider/version.rb +3 -2
data/lib/ronin/web/spider.rb +290 -1
data/ronin-web-spider.gemspec +5 -4
metadata +10 -19
data/spec/agent_spec.rb +0 -585
data/spec/archive_spec.rb +0 -91
data/spec/example_app.rb +0 -27
data/spec/git_archive_spec.rb +0 -137
data/spec/spec_helper.rb +0 -4
data/spec/spider_spec.rb +0 -252

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fe9c4af84eeeb8d8c8c46e8f1c0544ec3e92f4f3ff789e71ec495bae7bdc01ca
-  data.tar.gz: 23efe74dd0e37281fd701ebf86e55213c75d54c5f70ddc7e8abfeeec4608b8be
+  metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
+  data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
 SHA512:
-  metadata.gz: 758dace33195064f8742496b3e39408ffa64fa92324d42201f291c906b72f835245eae7259fc8aba8f8160eba3ed9041b00b91117c88144c8af72de62237875f
-  data.tar.gz: f76c6a3d6150519fa91958183e0475a1211fa495d2d341e6a049fe6037dedcb00019e2230fdfd75194a5ab4b66d3c1b71e7226535d123b5a3476afe53fe26a0c
+  metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
+  data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81

data/.github/workflows/ruby.yml CHANGED Viewed

@@ -12,20 +12,32 @@ jobs:
           - '3.0'
           - '3.1'
           - '3.2'
+          - '3.3'
           - jruby
           - truffleruby
     name: Ruby ${{ matrix.ruby }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
           ruby-version: ${{ matrix.ruby }}
-      - name: Install libsqlite3
-        run: |
-          sudo apt update -y && \
-          sudo apt install -y --no-install-recommends --no-install-suggests libsqlite3-dev
+          bundler-cache: true
       - name: Install dependencies
         run: bundle install --jobs 4 --retry 3
       - name: Run tests
         run: bundle exec rake test
+  # rubocop linting
+  rubocop:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.0'
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run rubocop
+        run: bundle exec rubocop --parallel

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,11 @@
+AllCops:
+  NewCops: enable
+  SuggestExtensions: false
+  TargetRubyVersion: 3.1
+inherit_gem:
+  rubocop-ronin: rubocop.yml
+#
+# ronin-web-spider specific exceptions
+#

data/.yardopts CHANGED Viewed

	@@ -1 +1 @@
1	- --markup markdown --title 'Ronin ~~FIXME~~ Documentation' --protected
1	+ --markup markdown --title 'Ronin::Web::Spider Documentation' --protected

data/ChangeLog.md CHANGED Viewed

@@ -1,6 +1,27 @@
-### 0.1.0 / 2023-XX-XX
+### 0.1.1 / 2024-06-19
+* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
+  {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
+  header included `text/html` but lacked a response body, causing `page.doc` to
+  be `nil`.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
+  JavaScript source code strings containing UTF-8 characters where being
+  incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
+  did not include a `charset=` attribute.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
+  inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
+  would incorrectly be treated as the beginning or ends of JavaScript string
+  literals. Note that while this greatly improves the accuracy of
+  {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
+  support parsing JavaScript template literals that may also contain string
+  literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
+### 0.1.0 / 2023-02-01
+* Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
+* Relicensed as LGPL-3.0.
 * Initial release:
+  * Requires `ruby` >= 3.0.0.
   * Built on top of the battle tested and versatile [spidr] gem.
   * Provides additional callback methods:
     * `every_host` - yields every unique host name that's spidered.
@@ -17,3 +38,4 @@
     * `every_comment` - yields every HTML or JavaScript comment.
   * Supports archiving spidered pages to a directory or git repository.
+[spidr]: https://github.com/postmodern/spidr#readme

data/Gemfile CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 source 'https://rubygems.org'
 gemspec
@@ -28,4 +29,6 @@ group :development do
   gem 'dead_end',        require: false
   gem 'sord',            require: false, platform: :mri
   gem 'stackprof',       require: false, platform: :mri
+  gem 'rubocop',         require: false, platform: :mri
+  gem 'rubocop-ronin',   require: false, platform: :mri
 end

data/README.md CHANGED Viewed

@@ -2,13 +2,13 @@
 [![CI](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml/badge.svg)](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml)
 [![Code Climate](https://codeclimate.com/github/ronin-rb/ronin-web-spider.svg)](https://codeclimate.com/github/ronin-rb/ronin-web-spider)
+[![Gem Version](https://badge.fury.io/rb/ronin-web-spider.svg)](https://badge.fury.io/rb/ronin-web-spider)
 * [Website](https://ronin-rb.dev/)
 * [Source](https://github.com/ronin-rb/ronin-web-spider)
 * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
 * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
 * [Discord](https://discord.gg/6WAb3PsVX9) |
-  [Twitter](https://twitter.com/ronin_rb) |
   [Mastodon](https://infosec.exchange/@ronin_rb)
 ## Description
@@ -20,22 +20,35 @@ ronin-web-spider is a collection of common web spidering routines using the
 * Built on top of the battle tested and versatile [spidr] gem.
 * Provides additional callback methods:
-  * `every_host` - yields every unique host name that's spidered.
-  * `every_cert` - yields every unique SSL/TLS certificate encountered while
-    spidering.
-  * `every_favicon` - yields every favicon file that's encountered while
-    spidering.
-  * `every_html_comment` - yields every HTML comment.
-  * `every_javascript` - yields all JavaScript source code from either inline
-    `<script>` or `.js` files.
-  * `every_javascript_string` - yields every single-quoted or double-quoted
-    String literal from all JavaScript source code.
-  * `every_javascript_comment` - yields every JavaScript comment.
-  * `every_comment` - yields every HTML or JavaScript comment.
+  * [every_host][docs-every_host] - yields every unique host name that's
+    spidered.
+  * [every_cert][docs-every_cert] - yields every unique SSL/TLS certificate
+    encountered while spidering.
+  * [every_favicon][docs-every_favicon] - yields every favicon file that's
+    encountered while spidering.
+  * [every_html_comment][docs-every_html_comment] - yields every HTML comment.
+  * [every_javascript][docs-every_javascript] - yields all JavaScript source
+    code from either inline `<script>` or `.js` files.
+  * [every_javascript_string][docs-every_javascript_string] - yields every
+    single-quoted or double-quoted String literal from all JavaScript source
+    code.
+  * [every_javascript_comment][docs-every_javascript_comment] - yields every
+    JavaScript comment.
+  * [every_comment][docs-every_comment] - yields every HTML or JavaScript
+    comment.
 * Supports archiving spidered pages to a directory or git repository.
-* Has 94% documentation coverage.
+* Has 97% documentation coverage.
 * Has 94% test coverage.
+[docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
+[docs-every_cert]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_cert-instance_method
+[docs-every_favicon]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_favicon-instance_method
+[docs-every_html_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_html_comment-instance_method
+[docs-every_javascript]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript-instance_method
+[docs-every_javascript_string]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_string-instance_method
+[docs-every_javascript_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_comment-instance_method
+[docs-every_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_comment-instance_method
 ## Examples
 Start spidering from a URL:
@@ -43,41 +56,299 @@ Spider a host:
 ```ruby
 require 'ronin/web/spider'
-Ronin::Web::Spider.host('www.example.com') do |agent|
-  agent.ever_url do |url|
-    # ...
+Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
+Spider a host:
+```ruby
+Ronin::Web::Spider.host('solnic.eu') do |agent|
+  # ...
+end
+```
+Spider a domain (and any sub-domains):
+```ruby
+Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
+Spider a site:
+```ruby
+Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
+Spider multiple hosts:
+```ruby
+Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
+Do not spider certain links:
+```ruby
+Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
+  # ...
+end
+```
+Do not spider links on certain ports:
+```ruby
+Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
+Do not spider links blacklisted in robots.txt:
+```ruby
+Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
+Print out visited URLs:
+```ruby
+Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
+Build a URL map of a site:
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
+Ronin::Web::Spider.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
+Print out the URLs that could not be requested:
+```ruby
+Ronin::Web::Spider.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
+Find all pages which have broken links:
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
+spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
+Search HTML and XML pages:
+```ruby
+Ronin::Web::Spider.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
+      puts "  #{name} = #{value}"
+    end
+  end
+end
+```
+Print out the titles from every page:
+```ruby
+Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+Print out every HTTP redirect:
+```ruby
+Ronin::Web::Spider.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
   end
+end
+```
+Find what kinds of web servers a host is using, by accessing the headers:
+```ruby
+servers = Set[]
-  agent.every_url_like(/.../) do |url|
-    # ...
+Ronin::Web::Spider.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
   end
+end
+```
-  agent.every_page do |page|
-    # ...
+Pause the spider on a forbidden page:
+```ruby
+Ronin::Web::Spider.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
   end
 end
 ```
-See [Spidr::Agent] documentation for more agent methods.
+Skip the processing of a page:
-[Spidr::Agent]: https://rubydoc.info/gems/spidr/Spidr/Agent
+```ruby
+Ronin::Web::Spider.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
-Spider a domain:
+Skip the processing of links:
 ```ruby
-Ronin::Web::Spider.domain('example.com') do |agent|
-  agent.every_page do |page|
-    # ...
+Ronin::Web::Spider.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
+    end
   end
 end
 ```
-Spider a website:
+Detect when a new host name is spidered:
 ```ruby
-Ronin::Web::Spider.site('https://www.example.com/index.html') do |agent|
-  agent.every_page do |page|
-    # ...
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_host do |host|
+    puts "Spidering #{host} ..."
+  end
+end
+```
+Detect when a new SSL/TLS certificate is encountered:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_cert do |cert|
+    puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
+  end
+end
+```
+Print the MD5 checksum of every `favicon.ico` file:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_favicon do |page|
+    puts "#{page.url}: #{page.body.md5}"
+  end
+end
+```
+Print every HTML comment:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_html_comment do |comment|
+    puts comment
+  end
+end
+```
+Print all JavaScript source code:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_javascript do |js|
+    puts js
+  end
+end
+```
+Print every JavaScript string literal:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_javascript_string do |str|
+    puts str
+  end
+end
+```
+Print every JavaScript comment:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_javascript_comment do |comment|
+    puts comment
+  end
+end
+```
+Print every HTML and JavaScript comment:
+```ruby
+Ronin::Web::Spider.domain('example.com') do |spider|
+  spider.every_comment do |comment|
+    puts comment
+  end
+end
+```
+Spider a host and archive every web page:
+```ruby
+require 'ronin/web/spider'
+require 'ronin/web/spider/archive'
+Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
+  Ronin::Web::Spider.every_page(host: 'example.com') do |page|
+    archive.write(page.url,page.body)
+  end
+end
+```
+Spider a host and archive every web page to a Git repository:
+```ruby
+require 'ronin/web/spider/git_archive'
+require 'ronin/web/spider'
+require 'date'
+Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
+  archive.commit("Updated #{Date.today}") do
+    Ronin::Web::Spider.every_page(host: 'example.com') do |page|
+      archive.write(page.url,page.body)
+    end
   end
 end
 ```
@@ -119,7 +390,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
 ## License
-Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
+Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
 ronin-web-spider is free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published

data/Rakefile CHANGED Viewed

@@ -1,11 +1,11 @@
-require 'rubygems'
+# frozen_string_literal: true
 begin
   require 'bundler'
 rescue LoadError => e
   warn e.message
   warn "Run `gem install bundler` to install Bundler"
-  exit -1
+  exit(-1)
 end
 begin

data/gemspec.yml CHANGED Viewed

@@ -1,5 +1,5 @@
 name: ronin-web-spider
-summary: collection of common web spidering routines
+summary: A collection of common web spidering routines.
 description:
   ronin-web-spider is a collection of common web spidering routines using the
   spidr gem.
@@ -11,17 +11,17 @@ homepage: https://ronin-rb.dev/
 has_yard: true
 metadata:
-  documentation_uri: https://rubydoc.info/gems/ronin-web-spider
+  documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
   source_code_uri:   https://github.com/ronin-rb/ronin-web-spider
   bug_tracker_uri:   https://github.com/ronin-rb/ronin-web-spider/issues
-  changelog_uri:     https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
+  changelog_uri:     https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
   rubygems_mfa_required: 'true'
 required_ruby_version: ">= 3.0.0"
 dependencies:
   spidr: ~> 0.7
-  ronin-support: ~> 1.0.0.beta1
+  ronin-support: ~> 1.0
 development_dependencies:
   bundler: ~> 2.0