spidr 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
+  data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
+  data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
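These digests cover the `metadata.gz` and `data.tar.gz` archives packed inside the `.gem` file. As a rough sketch (assuming the gem has been unpacked first, e.g. with `tar -xf spidr-0.7.1.gem`), they can be re-checked with Ruby's standard `digest` library:

```ruby
require 'digest'

# SHA256 of data.tar.gz as published in checksums.yaml above.
expected = '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
actual   = Digest::SHA256.file('data.tar.gz').hexdigest

puts(actual == expected ? 'data.tar.gz: OK' : 'data.tar.gz: MISMATCH')
```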
data/.github/workflows/ruby.yml
ADDED
@@ -0,0 +1,26 @@
+name: CI
+
+on: [ push, pull_request ]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - '3.0'
+          - '3.1'
+          - '3.2'
+          - '3.3'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+          bundler-cache: true
+      - name: Run tests
+        run: bundle exec rake test
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,6 +1,24 @@
+### 0.7.1 / 2024-01-25
+
+* Switched to using `require_relative` to improve load-times.
+* Added `# frozen_string_literal: true` to all files.
+* Use keyword arguments for {Spidr.domain}.
+* Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
+  {Spidr::Page#to_absolute}.
+
+### 0.7.0 / 2022-12-31
+
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
+
 ### 0.6.1 / 2019-10-24
 
-* Check for opaque component of URIs before attempting to set the path
+* Check for the opaque component of URIs before attempting to set the path
   component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
   opaque` exceptions.
 * Fix `@robots` instance variable warning (@spk).
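The 0.7.x entries above center on the new `Spidr.domain` entry point and the `Spidr::Page` content-type helpers; a minimal sketch of both together (the URL is illustrative, not from the diff):

```ruby
require 'spidr'

# Spidr.domain (added in 0.7.0) spiders a domain and its sub-domains;
# since 0.7.1 its extra options are real keyword arguments.
Spidr.domain('ruby-lang.org') do |agent|
  agent.every_page do |page|
    # png?/jpeg?/gif?/ico? were added to Spidr::Page in 0.7.0.
    puts "image: #{page.url}" if page.png? || page.jpeg?
  end
end
```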
data/Gemfile
CHANGED
@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
 
-  gem 'rspec',
-  gem 'webmock',
-  gem 'sinatra',
+  gem 'rspec', '~> 3.0'
+  gem 'webmock', '~> 3.0'
+  gem 'sinatra', '~> 2.0'
+  gem 'simplecov', '~> 0.20'
 
   gem 'kramdown'
-  gem '
+  gem 'redcarpet', platform: :mri
+  gem 'yard', '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,11 @@
 # Spidr
 
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
+
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 
 ## Description
 
@@ -49,137 +49,194 @@ and easy to use.
 
 Start spidering from a URL:
 
-
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 
 Spider a host:
 
-
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+
+Spider a domain (and any sub-domains):
+
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 
 Spider a site:
 
-
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 
 Spider multiple hosts:
 
-
-
-
-
-
-    ]
-    )
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 
 Do not spider certain links:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
+  # ...
+end
+```
 
 Do not spider links on certain ports:
 
-
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 
 Do not spider links blacklisted in robots.txt:
 
-
-
-
-
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 
 Print out visited URLs:
 
-
-
-
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 
 Build a URL map of a site:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 
 Print out the URLs that could not be requested:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 
 Finds all pages which have broken links:
 
-
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
 
-
-
-
-
-
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
 
-
-
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
 
-
-
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 
 Search HTML and XML pages:
 
-
-
-
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
 
-
-
-
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
 
-
-      end
-    end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 
 Print out the titles from every page:
 
-
-
-
-
-
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+
+Print out every HTTP redirect:
+
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 
 Find what kinds of web servers a host is using, by accessing the headers:
 
-
+```ruby
+servers = Set[]
 
-
-
-
-
-
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 
 Pause the spider on a forbidden page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 
 Skip the processing of a page:
 
-
-
-
-
-
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 
 Skip the processing of links:
 
-
-
-
-
-
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 
 ## Requirements
 
@@ -188,12 +245,12 @@ Skip the processing of links:
 
 ## Install
 
-
+```shell
+$ gem install spidr
+```
 
 ## License
 
-Copyright (c) 2008-2016 Hal Brodigan
-
 See {file:LICENSE.txt} for license information.
 
 [ruby]: https://www.ruby-lang.org/
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
 
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri: https://github.com/postmodern/spidr.rb
+  bug_tracker_uri: https://github.com/postmodern/spidr.rb/issues
+  changelog_uri: https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
+
 required_ruby_version: ">= 2.0.0"
 
 dependencies:
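The new `metadata` mapping is embedded in the built gem and surfaces on its `Gem::Specification`; a small sketch of reading it back from an installed copy:

```ruby
require 'rubygems'

spec = Gem::Specification.find_by_name('spidr')

# Prints documentation_uri, source_code_uri, etc. declared above.
spec.metadata.each { |key, value| puts "#{key}: #{value}" }
```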
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 module Spidr
   class Agent
     #
@@ -520,7 +522,7 @@ module Spidr
 
     protected
 
-    def initialize_events
+    def initialize_events
       @every_url_blocks = []
       @every_failed_url_blocks = []
       @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
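Because `initialize_events` stores its hooks in arrays, registered callbacks accumulate rather than replace one another; a short sketch (host name hypothetical):

```ruby
require 'spidr'

visited = []

Spidr.host('example.com') do |agent|
  # Both blocks are appended to @every_url_blocks, so both run per URL.
  agent.every_url { |url| puts url }
  agent.every_url { |url| visited << url }
end
```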
data/lib/spidr/agent/filters.rb
CHANGED
@@ -1,4 +1,6 @@
-require 'spidr/rules'
+# frozen_string_literal: true
+
+require_relative '../rules'
 
 module Spidr
   class Agent
@@ -170,7 +172,7 @@ module Spidr
     #
     # @yieldparam [String] link
     #   A link to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_links_like(pattern=nil,&block)
@@ -238,7 +240,7 @@ module Spidr
     #
     # @yieldparam [URI::HTTP, URI::HTTPS] url
     #   A URL to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_urls_like(pattern=nil,&block)
@@ -356,89 +358,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-
+    def initialize_filters(schemes: self.class.default_schemes,
+                           host: nil,
+                           hosts: nil,
+                           ignore_hosts: nil,
+                           ports: nil,
+                           ignore_ports: nil,
+                           links: nil,
+                           ignore_links: nil,
+                           urls: nil,
+                           ignore_urls: nil,
+                           exts: nil,
+                           ignore_exts: nil)
+      @schemes = schemes.map(&:to_s)
+
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules = Rules.new(accept: urls, reject: ignore_urls)
+      @ext_rules = Rules.new(accept: exts, reject: ignore_exts)
+
+      visit_hosts_like(host) if host
+    end
 
-
-
-
-
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.6.2
+    #
+    def self.default_schemes
+      schemes = ['http']
 
-
-
+      begin
+        require 'net/https'
 
-
-
-
-
-
-      end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
 
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
 
     #
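The hunk above replaces the 0.6.x options hash with explicit keyword arguments; the same names are still accepted through the public entry points, as in this sketch (hosts, ports, and URL are illustrative):

```ruby
require 'spidr'

Spidr.site('http://company.com/',
           hosts:        ['company.com', /cdn\d+\.company\.com/],
           ignore_ports: [8080],
           ignore_exts:  ['zip', 'exe']) do |agent|
  agent.every_url { |url| puts url }
end
```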
data/lib/spidr/agent/robots.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'uri'
 
 module Spidr
data/lib/spidr/agent/sanitizers.rb
CHANGED
@@ -34,20 +36,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query = strip_query
     end
 
   end
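A short sketch of the sanitizer keywords defined above, passed through `Spidr.site` (URL hypothetical); `strip_fragments` defaults to `true` and `strip_query` to `false`:

```ruby
require 'spidr'

# With strip_query: true, /page?a=1 and /page collapse to the same URL
# before links are enqueued; fragments are stripped by default.
Spidr.site('http://example.com/', strip_query: true) do |agent|
  agent.every_url { |url| puts url }
end
```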