RubyGems - spidr - Versions diffs - 0.6.1 → 0.7.1 - Mend

spidr 0.6.1 → 0.7.1

Files changed (47)

checksums.yaml +4 -4
data/.editorconfig +11 -0
data/.github/workflows/ruby.yml +26 -0
data/.gitignore +4 -5
data/ChangeLog.md +19 -1
data/Gemfile +7 -4
data/LICENSE.txt +1 -1
data/README.md +136 -79
data/Rakefile +1 -0
data/gemspec.yml +7 -0
data/lib/spidr/agent/actions.rb +3 -1
data/lib/spidr/agent/events.rb +3 -1
data/lib/spidr/agent/filters.rb +57 -56
data/lib/spidr/agent/robots.rb +2 -0
data/lib/spidr/agent/sanitizers.rb +7 -8
data/lib/spidr/agent.rb +232 -108
data/lib/spidr/auth_credential.rb +2 -0
data/lib/spidr/auth_store.rb +9 -7
data/lib/spidr/cookie_jar.rb +7 -5
data/lib/spidr/extensions/uri.rb +3 -1
data/lib/spidr/extensions.rb +3 -1
data/lib/spidr/page/content_types.rb +53 -0
data/lib/spidr/page/cookies.rb +2 -0
data/lib/spidr/page/html.rb +21 -20
data/lib/spidr/page/status_codes.rb +15 -11
data/lib/spidr/page.rb +3 -1
data/lib/spidr/proxy.rb +8 -14
data/lib/spidr/rules.rb +7 -8
data/lib/spidr/session_cache.rb +26 -22
data/lib/spidr/settings/proxy.rb +22 -6
data/lib/spidr/settings/timeouts.rb +2 -0
data/lib/spidr/settings/user_agent.rb +2 -0
data/lib/spidr/settings.rb +5 -3
data/lib/spidr/spidr.rb +22 -11
data/lib/spidr/version.rb +3 -1
data/lib/spidr.rb +5 -3
data/spec/agent_spec.rb +356 -7
data/spec/example_page.rb +2 -0
data/spec/page/content_types_spec.rb +22 -0
data/spec/page/html_spec.rb +255 -51
data/spec/page/status_codes_spec.rb +4 -4
data/spec/proxy_spec.rb +2 -2
data/spec/settings/proxy_examples.rb +31 -11
data/spec/spec_helper.rb +3 -0
data/spidr.gemspec +1 -4
metadata +8 -7
data/.travis.yml +0 -16

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e2202e0ce389cbbc6f88360d7e7b430328bc6da973b69929ebc54b1d92c104bb
-  data.tar.gz: 0777e972ef2cb1d540ee138a7703ee67f88b482260074c13e8c08a8da963aa77
+  metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
+  data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
 SHA512:
-  metadata.gz: 45e923ad3aa59812de4af67cc3a1739d7751749b3aa3205c5979e1f29c302abd43316cd74ad7c61befda2d31b0ff080872047e9713537d3bfe2509a8d555156a
-  data.tar.gz: deae3dbd7d9566723ca8760064c715de8795b1d4e4be1ae0866497d697a411af3e8bbe2a677028a5bf57bba8e681bd9d712cef3a2d0d123bc903af28f5e32a77
+  metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
+  data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5

data/.editorconfig ADDED Viewed

@@ -0,0 +1,11 @@
+root = true
+[*]
+end_of_line = lf
+insert_final_newline = true
+tab_width = 8
+trim_trailing_whitespace = true
+[{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
+indent_style = space
+indent_size = 2

data/.github/workflows/ruby.yml ADDED Viewed

@@ -0,0 +1,26 @@
+name: CI
+on: [ push, pull_request ]
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - '3.0'
+          - '3.1'
+          - '3.2'
+          - '3.3'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+          bundler-cache: true
+      - name: Run tests
+        run: bundle exec rake test

data/.gitignore CHANGED Viewed

@@ -1,8 +1,7 @@
-pkg
-doc
-web
-tmp
-Gemfile.lock
+/Gemfile.lock
+/coverage
+/doc
+/pkg
 .DS_Store
 .bundle
 .yardoc

data/ChangeLog.md CHANGED Viewed

@@ -1,6 +1,24 @@
+### 0.7.1 / 2024-01-25
+* Switched to using `require_relative` to improve load-times.
+* Added `# frozen_string_literal: true` to all files.
+* Use keyword arguments for {Spidr.domain}.
+* Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
+  {Spidr::Page#to_absolute}.
+### 0.7.0 / 2022-12-31
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
 ### 0.6.1 / 2019-10-24
-* Check for opaque component of URIs before attempting to set the path
+* Check for the opaque component of URIs before attempting to set the path
   component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
   opaque` exceptions.
 * Fix `@robots` instance variable warning (@spk).

data/Gemfile CHANGED Viewed

@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
-  gem 'rspec',    '~> 3.0'
-  gem 'webmock',  '~> 3.0'
-  gem 'sinatra',  '~> 1.0'
+  gem 'rspec',     '~> 3.0'
+  gem 'webmock',   '~> 3.0'
+  gem 'sinatra',   '~> 2.0'
+  gem 'simplecov', '~> 0.20'
   gem 'kramdown'
-  gem 'yard',     '~> 0.9'
+  gem 'redcarpet', platform: :mri
+  gem 'yard',      '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end

data/LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2008-2016 Hal Brodigan
+Copyright (c) 2008-2024 Hal Brodigan
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 # Spidr
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 ## Description
@@ -49,137 +49,194 @@ and easy to use.
 Start spidering from a URL:
-    Spidr.start_at('http://tenderlovemaking.com/')
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 Spider a host:
-    Spidr.host('solnic.eu')
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+Spider a domain (and any sub-domains):
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 Spider a site:
-    Spidr.site('http://www.rubyflow.com/')
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 Spider multiple hosts:
-    Spidr.start_at(
-      'http://company.com/',
-      hosts: [
-        'company.com',
-        /host[\d]+\.company\.com/
-      ]
-    )
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 Do not spider certain links:
-    Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
+  # ...
+end
+```
 Do not spider links on certain ports:
-    Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 Do not spider links blacklisted in robots.txt:
-    Spidr.site(
-      'http://company.com/',
-      robots: true
-    )
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 Print out visited URLs:
-    Spidr.site('http://www.rubyinside.com/') do |spider|
-      spider.every_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 Build a URL map of a site:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 Print out the URLs that could not be requested:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_failed_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 Find all pages which have broken links:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    spider = Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
-    spider.failures.each do |url|
-      puts "Broken link #{url} found in:"
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
-      url_map[url].each { |page| puts "  #{page}" }
-    end
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 Search HTML and XML pages:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_page do |page|
-        puts ">>> #{page.url}"
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
-        page.search('//meta').each do |meta|
-          name = (meta.attributes['name'] || meta.attributes['http-equiv'])
-          value = meta.attributes['content']
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
-          puts "  #{name} = #{value}"
-        end
-      end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 Print out the titles from every page:
-    Spidr.site('https://www.ruby-lang.org/') do |spider|
-      spider.every_html_page do |page|
-        puts page.title
-      end
-    end
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+Print out every HTTP redirect:
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 Find what kinds of web servers a host is using, by accessing the headers:
-    servers = Set[]
+```ruby
+servers = Set[]
-    Spidr.host('company.com') do |spider|
-      spider.all_headers do |headers|
-        servers << headers['server']
-      end
-    end
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 Pause the spider on a forbidden page:
-    Spidr.host('company.com') do |spider|
-      spider.every_forbidden_page do |page|
-        spider.pause!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 Skip the processing of a page:
-    Spidr.host('company.com') do |spider|
-      spider.every_missing_page do |page|
-        spider.skip_page!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 Skip the processing of links:
-    Spidr.host('company.com') do |spider|
-      spider.every_url do |url|
-        if url.path.split('/').find { |dir| dir.to_i > 1000 }
-          spider.skip_link!
-        end
-      end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 ## Requirements
@@ -188,12 +245,12 @@ Skip the processing of links:
 ## Install
-    $ gem install spidr
+```shell
+$ gem install spidr
+```
 ## License
-Copyright (c) 2008-2016 Hal Brodigan
 See {file:LICENSE.txt} for license information.
 [ruby]: https://www.ruby-lang.org/

data/Rakefile CHANGED Viewed

@@ -12,6 +12,7 @@ Gem::Tasks.new
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new
+task :test    => :spec
 task :default => :spec
 require 'yard'

data/gemspec.yml CHANGED Viewed

@@ -11,6 +11,13 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri:   https://github.com/postmodern/spidr
+  bug_tracker_uri:   https://github.com/postmodern/spidr/issues
+  changelog_uri:     https://github.com/postmodern/spidr/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
 required_ruby_version: ">= 2.0.0"
 dependencies:

data/lib/spidr/agent/actions.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module Spidr
   class Agent
     module Actions
@@ -96,7 +98,7 @@ module Spidr
     protected
-    def initialize_actions(options={})
+    def initialize_actions
       @paused = false
     end
   end

data/lib/spidr/agent/events.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module Spidr
   class Agent
     #
@@ -520,7 +522,7 @@ module Spidr
     protected
-    def initialize_events(options={})
+    def initialize_events
       @every_url_blocks        = []
       @every_failed_url_blocks = []
       @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }

data/lib/spidr/agent/filters.rb CHANGED Viewed

@@ -1,4 +1,6 @@
-require 'spidr/rules'
+# frozen_string_literal: true
+require_relative '../rules'
 module Spidr
   class Agent
@@ -170,7 +172,7 @@ module Spidr
     #
     # @yieldparam [String] link
     #   A link to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_links_like(pattern=nil,&block)
@@ -238,7 +240,7 @@ module Spidr
     #
     # @yieldparam [URI::HTTP, URI::HTTPS] url
     #   A URL to accept or reject.
-    #
+    #
     # @since 0.2.4
     #
     def visit_urls_like(pattern=nil,&block)
@@ -356,89 +358,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-      @schemes = []
+    def initialize_filters(schemes:      self.class.default_schemes,
+                           host:         nil,
+                           hosts:        nil,
+                           ignore_hosts: nil,
+                           ports:        nil,
+                           ignore_ports: nil,
+                           links:        nil,
+                           ignore_links: nil,
+                           urls:         nil,
+                           ignore_urls:  nil,
+                           exts:         nil,
+                           ignore_exts:  nil)
+      @schemes = schemes.map(&:to_s)
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
+      @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)
+      visit_hosts_like(host) if host
+    end
-      if options[:schemes]
-        self.schemes = options[:schemes]
-      else
-        @schemes << 'http'
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.6.2
+    #
+    def self.default_schemes
+      schemes = ['http']
-        begin
-          require 'net/https'
+      begin
+        require 'net/https'
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          warn "Warning: cannot load 'net/https', https support disabled"
-        end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
     #

data/lib/spidr/agent/robots.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 begin
   require 'robots'
 rescue LoadError

data/lib/spidr/agent/sanitizers.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'uri'
 module Spidr
@@ -34,20 +36,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query     = strip_query
     end
   end