RubyGems - spidr - Versions diffs - 0.6.0 → 0.7.0 - Mend

spidr 0.6.0 → 0.7.0

Files changed (35) hide show

checksums.yaml +5 -5
data/.editorconfig +11 -0
data/.github/workflows/ruby.yml +26 -0
data/.gitignore +4 -5
data/ChangeLog.md +17 -0
data/Gemfile +8 -5
data/LICENSE.txt +1 -1
data/README.md +137 -78
data/Rakefile +1 -0
data/gemspec.yml +8 -1
data/lib/spidr/agent/actions.rb +1 -1
data/lib/spidr/agent/events.rb +1 -1
data/lib/spidr/agent/filters.rb +55 -56
data/lib/spidr/agent/sanitizers.rb +6 -9
data/lib/spidr/agent.rb +230 -120
data/lib/spidr/auth_store.rb +10 -6
data/lib/spidr/page/content_types.rb +51 -0
data/lib/spidr/page/html.rb +17 -19
data/lib/spidr/page/status_codes.rb +12 -10
data/lib/spidr/proxy.rb +6 -14
data/lib/spidr/rules.rb +5 -8
data/lib/spidr/session_cache.rb +23 -21
data/lib/spidr/settings/proxy.rb +19 -5
data/lib/spidr/spidr.rb +16 -6
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +357 -10
data/spec/example_page.rb +2 -0
data/spec/page/content_types_spec.rb +22 -0
data/spec/page/html_spec.rb +255 -51
data/spec/page/status_codes_spec.rb +4 -4
data/spec/proxy_spec.rb +2 -2
data/spec/settings/proxy_examples.rb +31 -11
data/spec/spec_helper.rb +3 -0
metadata +19 -19
data/.travis.yml +0 -14

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
-  data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
+SHA256:
+  metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
+  data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
 SHA512:
-  metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
-  data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
+  metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
+  data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c

data/.editorconfig ADDED Viewed

@@ -0,0 +1,11 @@
+root = true
+[*]
+end_of_line = lf
+insert_final_newline = true
+tab_width = 8
+trim_trailing_whitespace = true
+[{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
+indent_style = space
+indent_size = 2

data/.github/workflows/ruby.yml ADDED Viewed

@@ -0,0 +1,26 @@
+name: CI
+on: [ push, pull_request ]
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.7
+          - '3.0'
+          - '3.1'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run tests
+        run: bundle exec rake test

data/.gitignore CHANGED Viewed

@@ -1,8 +1,7 @@
-pkg
-doc
-web
-tmp
-Gemfile.lock
+/Gemfile.lock
+/coverage
+/doc
+/pkg
 .DS_Store
 .bundle
 .yardoc

data/ChangeLog.md CHANGED Viewed

@@ -1,3 +1,20 @@
+### 0.7.0 / 2022-12-31
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
+### 0.6.1 / 2019-10-24
+* Check for the opaque component of URIs before attempting to set the path
+  component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
+  opaque` exceptions.
+* Fix `@robots` instance variable warning (@spk).
 ### 0.6.0 / 2016-08-04
 * Added {Spidr::Proxy}.

data/Gemfile CHANGED Viewed

@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
-  gem 'rspec',    '~> 3.0'
-  gem 'webmock',  '~> 2.0'
-  gem 'sinatra',  '~> 1.0'
+  gem 'rspec',     '~> 3.0'
+  gem 'webmock',   '~> 3.0'
+  gem 'sinatra',   '~> 2.0'
+  gem 'simplecov', '~> 0.20'
-  gem 'kramdown', '~> 0.12'
-  gem 'yard',     '~> 0.8'
+  gem 'kramdown'
+  gem 'redcarpet', platform: :mri
+  gem 'yard',      '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end

data/LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2008-2016 Hal Brodigan
+Copyright (c) 2008-2022 Hal Brodigan
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 # Spidr
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 ## Description
@@ -49,137 +49,194 @@ and easy to use.
 Start spidering from a URL:
-    Spidr.start_at('http://tenderlovemaking.com/')
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 Spider a host:
-    Spidr.host('solnic.eu')
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+Spider a domain (and any sub-domains):
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 Spider a site:
-    Spidr.site('http://www.rubyflow.com/')
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 Spider multiple hosts:
-    Spidr.start_at(
-      'http://company.com/',
-      hosts: [
-        'company.com',
-        /host[\d]+\.company\.com/
-      ]
-    )
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 Do not spider certain links:
-    Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
+  # ...
+end
+```
 Do not spider links on certain ports:
-    Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 Do not spider links blacklisted in robots.txt:
-    Spidr.site(
-      'http://company.com/',
-      robots: true
-    )
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 Print out visited URLs:
-    Spidr.site('http://www.rubyinside.com/') do |spider|
-      spider.every_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 Build a URL map of a site:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 Print out the URLs that could not be requested:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_failed_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 Find all pages which have broken links:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    spider = Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
-    spider.failures.each do |url|
-      puts "Broken link #{url} found in:"
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
-      url_map[url].each { |page| puts "  #{page}" }
-    end
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 Search HTML and XML pages:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_page do |page|
-        puts ">>> #{page.url}"
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
-        page.search('//meta').each do |meta|
-          name = (meta.attributes['name'] || meta.attributes['http-equiv'])
-          value = meta.attributes['content']
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
-          puts "  #{name} = #{value}"
-        end
-      end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 Print out the titles from every page:
-    Spidr.site('https://www.ruby-lang.org/') do |spider|
-      spider.every_html_page do |page|
-        puts page.title
-      end
-    end
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+Print out every HTTP redirect:
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 Find what kinds of web servers a host is using, by accessing the headers:
-    servers = Set[]
+```ruby
+servers = Set[]
-    Spidr.host('company.com') do |spider|
-      spider.all_headers do |headers|
-        servers << headers['server']
-      end
-    end
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 Pause the spider on a forbidden page:
-    spider = Spidr.host('company.com') do |spider|
-      spider.every_forbidden_page do |page|
-        spider.pause!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 Skip the processing of a page:
-    Spidr.host('company.com') do |spider|
-      spider.every_missing_page do |page|
-        spider.skip_page!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 Skip the processing of links:
-    Spidr.host('company.com') do |spider|
-      spider.every_url do |url|
-        if url.path.split('/').find { |dir| dir.to_i > 1000 }
-          spider.skip_link!
-        end
-      end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 ## Requirements
@@ -188,11 +245,13 @@ Skip the processing of links:
 ## Install
-    $ gem install spidr
+```shell
+$ gem install spidr
+```
 ## License
-Copyright (c) 2008-2016 Hal Brodigan
+Copyright (c) 2008-2022 Hal Brodigan
 See {file:LICENSE.txt} for license information.

data/Rakefile CHANGED Viewed

@@ -12,6 +12,7 @@ Gem::Tasks.new
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new
+task :test    => :spec
 task :default => :spec
 require 'yard'

data/gemspec.yml CHANGED Viewed

@@ -11,10 +11,17 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri:   https://github.com/postmodern/spidr
+  bug_tracker_uri:   https://github.com/postmodern/spidr/issues
+  changelog_uri:     https://github.com/postmodern/spidr/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
 required_ruby_version: ">= 2.0.0"
 dependencies:
   nokogiri: ~> 1.3
 development_dependencies:
-  bundler: ~> 1.0
+  bundler: ~> 2.0

data/lib/spidr/agent/actions.rb CHANGED Viewed

@@ -96,7 +96,7 @@ module Spidr
     protected
-    def initialize_actions(options={})
+    def initialize_actions
       @paused = false
     end
   end

data/lib/spidr/agent/events.rb CHANGED Viewed

@@ -520,7 +520,7 @@ module Spidr
     protected
-    def initialize_events(options={})
+    def initialize_events
       @every_url_blocks        = []
       @every_failed_url_blocks = []
       @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }

data/lib/spidr/agent/filters.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Spidr
     #   agent.schemes = ['http']
     #
     def schemes=(new_schemes)
-      @schemes = new_schemes.map { |scheme| scheme.to_s }
+      @schemes = new_schemes.map(&:to_s)
     end
     #
@@ -356,89 +356,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-      @schemes = []
+    def initialize_filters(schemes:      self.class.default_schemes,
+                           host:         nil,
+                           hosts:        nil,
+                           ignore_hosts: nil,
+                           ports:        nil,
+                           ignore_ports: nil,
+                           links:        nil,
+                           ignore_links: nil,
+                           urls:         nil,
+                           ignore_urls:  nil,
+                           exts:         nil,
+                           ignore_exts:  nil)
+      @schemes = schemes.map(&:to_s)
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
+      @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)
+      visit_hosts_like(host) if host
+    end
-      if options[:schemes]
-        self.schemes = options[:schemes]
-      else
-        @schemes << 'http'
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.7.0
+    #
+    def self.default_schemes
+      schemes = ['http']
-        begin
-          require 'net/https'
+      begin
+        require 'net/https'
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          warn "Warning: cannot load 'net/https', https support disabled"
-        end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
     #
@@ -452,9 +451,9 @@ module Spidr
     #
     def visit_scheme?(scheme)
       if scheme
-        return @schemes.include?(scheme)
+        @schemes.include?(scheme)
       else
-        return true
+        true
       end
     end

data/lib/spidr/agent/sanitizers.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module Spidr
     # @since 0.2.2
     #
     def sanitize_url(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       url.fragment = nil if @strip_fragments
       url.query    = nil if @strip_query
@@ -34,20 +34,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query     = strip_query
     end
   end