RubyGems - spidr - Versions diffs - 0.6.0 → 0.7.0 - Mend

spidr 0.6.0 → 0.7.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +5 -5
data/.editorconfig +11 -0
data/.github/workflows/ruby.yml +26 -0
data/.gitignore +4 -5
data/ChangeLog.md +17 -0
data/Gemfile +8 -5
data/LICENSE.txt +1 -1
data/README.md +137 -78
data/Rakefile +1 -0
data/gemspec.yml +8 -1
data/lib/spidr/agent/actions.rb +1 -1
data/lib/spidr/agent/events.rb +1 -1
data/lib/spidr/agent/filters.rb +55 -56
data/lib/spidr/agent/sanitizers.rb +6 -9
data/lib/spidr/agent.rb +230 -120
data/lib/spidr/auth_store.rb +10 -6
data/lib/spidr/page/content_types.rb +51 -0
data/lib/spidr/page/html.rb +17 -19
data/lib/spidr/page/status_codes.rb +12 -10
data/lib/spidr/proxy.rb +6 -14
data/lib/spidr/rules.rb +5 -8
data/lib/spidr/session_cache.rb +23 -21
data/lib/spidr/settings/proxy.rb +19 -5
data/lib/spidr/spidr.rb +16 -6
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +357 -10
data/spec/example_page.rb +2 -0
data/spec/page/content_types_spec.rb +22 -0
data/spec/page/html_spec.rb +255 -51
data/spec/page/status_codes_spec.rb +4 -4
data/spec/proxy_spec.rb +2 -2
data/spec/settings/proxy_examples.rb +31 -11
data/spec/spec_helper.rb +3 -0
metadata +19 -19
data/.travis.yml +0 -14

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
-  data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
+SHA256:
+  metadata.gz: 46a2f2ad2ca789b83fac0e2519294403734e2ad6d647fbc3a612d429e57c1b43
+  data.tar.gz: b72f561e337c6a0fcdbca9f59562e06f0b5854b15d321f90be1a4168b352faca
 SHA512:
-  metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
-  data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
+  metadata.gz: ced221d8cdbeaf95df12d6c038de6539a5148657209137433cc82c5abc69779a13376a7e6becdf423d2f2bdd9ebfaf8c7b94a51dda70ffcbab932da4fc5260b3
+  data.tar.gz: f54bedf3648dd033b8a37388413ae4ab71b4b09f16cc508b8e43e72f2ef870c59fe325e3f36a841791d9d843acb08bb02009469168e9b231a9835a0249b55b6c

data/.editorconfig ADDED Viewed

@@ -0,0 +1,11 @@
+root = true
+[*]
+end_of_line = lf
+insert_final_newline = true
+tab_width = 8
+trim_trailing_whitespace = true
+[{Gemfile,Rakefile,*.rb,*.gemspec,*.yml}]
+indent_style = space
+indent_size = 2

data/.github/workflows/ruby.yml ADDED Viewed

@@ -0,0 +1,26 @@
+name: CI
+on: [ push, pull_request ]
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.7
+          - '3.0'
+          - '3.1'
+          - jruby
+    name: Ruby ${{ matrix.ruby }}
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run tests
+        run: bundle exec rake test

data/.gitignore CHANGED Viewed

@@ -1,8 +1,7 @@
-pkg
-doc
-web
-tmp
-Gemfile.lock
+/Gemfile.lock
+/coverage
+/doc
+/pkg
 .DS_Store
 .bundle
 .yardoc

data/ChangeLog.md CHANGED Viewed

@@ -1,3 +1,20 @@
+### 0.7.0 / 2022-12-31
+* Added {Spidr.domain} and {Spidr::Agent.domain}.
+* Added {Spidr::Page#gif?}.
+* Added {Spidr::Page#jpeg?}.
+* Added {Spidr::Page#icon?} and {Spidr::Page#ico?}.
+* Added {Spidr::Page#png?}.
+* {Spidr.proxy=} and {Spidr::Agent#proxy=} can now accept a `String` or a
+  `URI::HTTP` object.
+### 0.6.1 / 2019-10-24
+* Check for the opaque component of URIs before attempting to set the path
+  component (@kyaroch). This fixes `URI::InvalidURIError: path conflicts with
+  opaque` exceptions.
+* Fix `@robots` instance variable warning (@spk).
 ### 0.6.0 / 2016-08-04
 * Added {Spidr::Proxy}.

data/Gemfile CHANGED Viewed

@@ -12,10 +12,13 @@ group :development do
   gem 'rake'
   gem 'rubygems-tasks', '~> 0.2'
-  gem 'rspec',    '~> 3.0'
-  gem 'webmock',  '~> 2.0'
-  gem 'sinatra',  '~> 1.0'
+  gem 'rspec',     '~> 3.0'
+  gem 'webmock',   '~> 3.0'
+  gem 'sinatra',   '~> 2.0'
+  gem 'simplecov', '~> 0.20'
-  gem 'kramdown', '~> 0.12'
-  gem 'yard',     '~> 0.8'
+  gem 'kramdown'
+  gem 'redcarpet', platform: :mri
+  gem 'yard',      '~> 0.9'
+  gem 'yard-spellcheck', require: false
 end

data/LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2008-2016 Hal Brodigan
+Copyright (c) 2008-2022 Hal Brodigan
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 # Spidr
+[![CI](https://github.com/postmodern/spidr/actions/workflows/ruby.yml/badge.svg)](https://github.com/postmodern/spidr/actions/workflows/ruby.yml)
 * [Homepage](https://github.com/postmodern/spidr#readme)
 * [Source](https://github.com/postmodern/spidr)
 * [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
-* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
-* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
 ## Description
@@ -49,137 +49,194 @@ and easy to use.
 Start spidering from a URL:
-    Spidr.start_at('http://tenderlovemaking.com/')
+```ruby
+Spidr.start_at('http://tenderlovemaking.com/') do |agent|
+  # ...
+end
+```
 Spider a host:
-    Spidr.host('solnic.eu')
+```ruby
+Spidr.host('solnic.eu') do |agent|
+  # ...
+end
+```
+Spider a domain (and any sub-domains):
+```ruby
+Spidr.domain('ruby-lang.org') do |agent|
+  # ...
+end
+```
 Spider a site:
-    Spidr.site('http://www.rubyflow.com/')
+```ruby
+Spidr.site('http://www.rubyflow.com/') do |agent|
+  # ...
+end
+```
 Spider multiple hosts:
-    Spidr.start_at(
-      'http://company.com/',
-      hosts: [
-        'company.com',
-        /host[\d]+\.company\.com/
-      ]
-    )
+```ruby
+Spidr.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
+  # ...
+end
+```
 Do not spider certain links:
-    Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
+```ruby
+Spidr.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
+  # ...
+end
+```
 Do not spider links on certain ports:
-    Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+```ruby
+Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
+  # ...
+end
+```
 Do not spider links blacklisted in robots.txt:
-    Spidr.site(
-      'http://company.com/',
-      robots: true
-    )
+```ruby
+Spidr.site('http://company.com/', robots: true) do |agent|
+  # ...
+end
+```
 Print out visited URLs:
-    Spidr.site('http://www.rubyinside.com/') do |spider|
-      spider.every_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://www.rubyinside.com/') do |spider|
+  spider.every_url { |url| puts url }
+end
+```
 Build a URL map of a site:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
+```
 Print out the URLs that could not be requested:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_failed_url { |url| puts url }
-    end
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_failed_url { |url| puts url }
+end
+```
 Finds all pages which have broken links:
-    url_map = Hash.new { |hash,key| hash[key] = [] }
+```ruby
+url_map = Hash.new { |hash,key| hash[key] = [] }
-    spider = Spidr.site('http://intranet.com/') do |spider|
-      spider.every_link do |origin,dest|
-        url_map[dest] << origin
-      end
-    end
+spider = Spidr.site('http://intranet.com/') do |spider|
+  spider.every_link do |origin,dest|
+    url_map[dest] << origin
+  end
+end
-    spider.failures.each do |url|
-      puts "Broken link #{url} found in:"
+spider.failures.each do |url|
+  puts "Broken link #{url} found in:"
-      url_map[url].each { |page| puts "  #{page}" }
-    end
+  url_map[url].each { |page| puts "  #{page}" }
+end
+```
 Search HTML and XML pages:
-    Spidr.site('http://company.com/') do |spider|
-      spider.every_page do |page|
-        puts ">>> #{page.url}"
+```ruby
+Spidr.site('http://company.com/') do |spider|
+  spider.every_page do |page|
+    puts ">>> #{page.url}"
-        page.search('//meta').each do |meta|
-          name = (meta.attributes['name'] || meta.attributes['http-equiv'])
-          value = meta.attributes['content']
+    page.search('//meta').each do |meta|
+      name = (meta.attributes['name'] || meta.attributes['http-equiv'])
+      value = meta.attributes['content']
-          puts "  #{name} = #{value}"
-        end
-      end
+      puts "  #{name} = #{value}"
     end
+  end
+end
+```
 Print out the titles from every page:
-    Spidr.site('https://www.ruby-lang.org/') do |spider|
-      spider.every_html_page do |page|
-        puts page.title
-      end
-    end
+```ruby
+Spidr.site('https://www.ruby-lang.org/') do |spider|
+  spider.every_html_page do |page|
+    puts page.title
+  end
+end
+```
+Print out every HTTP redirect:
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_redirect_page do |page|
+    puts "#{page.url} -> #{page.headers['Location']}"
+  end
+end
+```
 Find what kinds of web servers a host is using, by accessing the headers:
-    servers = Set[]
+```ruby
+servers = Set[]
-    Spidr.host('company.com') do |spider|
-      spider.all_headers do |headers|
-        servers << headers['server']
-      end
-    end
+Spidr.host('company.com') do |spider|
+  spider.all_headers do |headers|
+    servers << headers['server']
+  end
+end
+```
 Pause the spider on a forbidden page:
-    spider = Spidr.host('company.com') do |spider|
-      spider.every_forbidden_page do |page|
-        spider.pause!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_forbidden_page do |page|
+    spider.pause!
+  end
+end
+```
 Skip the processing of a page:
-    Spidr.host('company.com') do |spider|
-      spider.every_missing_page do |page|
-        spider.skip_page!
-      end
-    end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_missing_page do |page|
+    spider.skip_page!
+  end
+end
+```
 Skip the processing of links:
-    Spidr.host('company.com') do |spider|
-      spider.every_url do |url|
-        if url.path.split('/').find { |dir| dir.to_i > 1000 }
-          spider.skip_link!
-        end
-      end
+```ruby
+Spidr.host('company.com') do |spider|
+  spider.every_url do |url|
+    if url.path.split('/').find { |dir| dir.to_i > 1000 }
+      spider.skip_link!
     end
+  end
+end
+```
 ## Requirements
@@ -188,11 +245,13 @@ Skip the processing of links:
 ## Install
-    $ gem install spidr
+```shell
+$ gem install spidr
+```
 ## License
-Copyright (c) 2008-2016 Hal Brodigan
+Copyright (c) 2008-2022 Hal Brodigan
 See {file:LICENSE.txt} for license information.

data/Rakefile CHANGED Viewed

@@ -12,6 +12,7 @@ Gem::Tasks.new
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new
+task :test    => :spec
 task :default => :spec
 require 'yard'

data/gemspec.yml CHANGED Viewed

@@ -11,10 +11,17 @@ email: postmodern.mod3@gmail.com
 homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
+metadata:
+  documentation_uri: https://rubydoc.info/gems/spidr
+  source_code_uri:   https://github.com/postmodern/spidr.rb
+  bug_tracker_uri:   https://github.com/postmodern/spidr.rb/issues
+  changelog_uri:     https://github.com/postmodern/spidr.rb/blob/master/ChangeLog.md
+  rubygems_mfa_required: 'true'
 required_ruby_version: ">= 2.0.0"
 dependencies:
   nokogiri: ~> 1.3
 development_dependencies:
-  bundler: ~> 1.0
+  bundler: ~> 2.0

data/lib/spidr/agent/actions.rb CHANGED Viewed

@@ -96,7 +96,7 @@ module Spidr
     protected
-    def initialize_actions(options={})
+    def initialize_actions
       @paused = false
     end
   end

data/lib/spidr/agent/events.rb CHANGED Viewed

@@ -520,7 +520,7 @@ module Spidr
     protected
-    def initialize_events(options={})
+    def initialize_events
       @every_url_blocks        = []
       @every_failed_url_blocks = []
       @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }

data/lib/spidr/agent/filters.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Spidr
     #   agent.schemes = ['http']
     #
     def schemes=(new_schemes)
-      @schemes = new_schemes.map { |scheme| scheme.to_s }
+      @schemes = new_schemes.map(&:to_s)
     end
     #
@@ -356,89 +356,88 @@ module Spidr
     #
     # Initializes filtering rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Array] :schemes (['http', 'https'])
+    # @param [Array<String>] schemes
     #   The list of acceptable URI schemes to visit.
     #   The `https` scheme will be ignored if `net/https` cannot be loaded.
     #
-    # @option options [String] :host
+    # @param [String] host
     #   The host-name to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :hosts
+    # @param [Array<String, Regexp, Proc>] hosts
     #   The patterns which match the host-names to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
+    # @param [Array<String, Regexp, Proc>] ignore_hosts
     #   The patterns which match the host-names to not visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ports
+    # @param [Array<Integer, Regexp, Proc>] ports
     #   The patterns which match the ports to visit.
     #
-    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
+    # @param [Array<Integer, Regexp, Proc>] ignore_ports
     #   The patterns which match the ports to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :links
+    # @param [Array<String, Regexp, Proc>] links
     #   The patterns which match the links to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_links
+    # @param [Array<String, Regexp, Proc>] ignore_links
     #   The patterns which match the links to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :urls
+    # @param [Array<String, Regexp, Proc>] urls
     #   The patterns which match the URLs to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_urls
+    # @param [Array<String, Regexp, Proc>] ignore_urls
     #   The patterns which match the URLs to not visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :exts
+    # @param [Array<String, Regexp, Proc>] exts
     #   The patterns which match the URI path extensions to visit.
     #
-    # @option options [Array<String, Regexp, Proc>] :ignore_exts
+    # @param [Array<String, Regexp, Proc>] ignore_exts
     #   The patterns which match the URI path extensions to not visit.
     #
-    def initialize_filters(options={})
-      @schemes = []
+    def initialize_filters(schemes:      self.class.default_schemes,
+                           host:         nil,
+                           hosts:        nil,
+                           ignore_hosts: nil,
+                           ports:        nil,
+                           ignore_ports: nil,
+                           links:        nil,
+                           ignore_links: nil,
+                           urls:         nil,
+                           ignore_urls:  nil,
+                           exts:         nil,
+                           ignore_exts:  nil)
+      @schemes = schemes.map(&:to_s)
+      @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
+      @port_rules = Rules.new(accept: ports, reject: ignore_ports)
+      @link_rules = Rules.new(accept: links, reject: ignore_links)
+      @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
+      @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)
+      visit_hosts_like(host) if host
+    end
-      if options[:schemes]
-        self.schemes = options[:schemes]
-      else
-        @schemes << 'http'
+    #
+    # Determines the default URI schemes to follow.
+    #
+    # @return [Array<String>]
+    #   The default URI schemes to follow.
+    #
+    # @since 0.6.2
+    #
+    def self.default_schemes
+      schemes = ['http']
-        begin
-          require 'net/https'
+      begin
+        require 'net/https'
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          warn "Warning: cannot load 'net/https', https support disabled"
-        end
+        schemes << 'https'
+      rescue Gem::LoadError => e
+        raise(e)
+      rescue ::LoadError
+        warn "Warning: cannot load 'net/https', https support disabled"
       end
-      @host_rules = Rules.new(
-        accept: options[:hosts],
-        reject: options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        accept: options[:ports],
-        reject: options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        accept: options[:links],
-        reject: options[:ignore_links]
-      )
-      @url_rules = Rules.new(
-        accept: options[:urls],
-        reject: options[:ignore_urls]
-      )
-      @ext_rules = Rules.new(
-        accept: options[:exts],
-        reject: options[:ignore_exts]
-      )
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      return schemes
     end
     #
@@ -452,9 +451,9 @@ module Spidr
     #
     def visit_scheme?(scheme)
       if scheme
-        return @schemes.include?(scheme)
+        @schemes.include?(scheme)
       else
-        return true
+        true
       end
     end

data/lib/spidr/agent/sanitizers.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module Spidr
     # @since 0.2.2
     #
     def sanitize_url(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url)
       url.fragment = nil if @strip_fragments
       url.query    = nil if @strip_query
@@ -34,20 +34,17 @@ module Spidr
     #
     # Initializes the Sanitizer rules.
     #
-    # @param [Hash] options
-    #   Additional options.
-    #
-    # @option options [Boolean] :strip_fragments (true)
+    # @param [Boolean] strip_fragments
     #   Specifies whether or not to strip the fragment component from URLs.
     #
-    # @option options [Boolean] :strip_query (false)
+    # @param [Boolean] strip_query
     #   Specifies whether or not to strip the query component from URLs.
     #
     # @since 0.2.2
     #
-    def initialize_sanitizers(options={})
-      @strip_fragments = options.fetch(:strip_fragments,true)
-      @strip_query     = options.fetch(:strip_query,false)
+    def initialize_sanitizers(strip_fragments: true, strip_query: false)
+      @strip_fragments = strip_fragments
+      @strip_query     = strip_query
     end
   end