spider 0.5.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -0
- data/lib/spider/included_in_memcached.rb +6 -5
- data/lib/spider/spider_instance.rb +31 -10
- data/lib/spider.rb +3 -2
- metadata +8 -19
- data/AUTHORS +0 -17
- data/CHANGES +0 -68
- data/LICENSE +0 -21
- data/README.md +0 -175
- data/spec/spec_helper.rb +0 -90
- data/spec/spider/included_in_memcached_spec.rb +0 -43
- data/spec/spider/included_in_redis_spec.rb +0 -43
- data/spec/spider/spider_instance_spec.rb +0 -405
- data/spec/spider_spec.rb +0 -33
- data/spider.gemspec +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7286f875f41881c9c8c385987c322b9832e5b07c6147aa4910a900e59015927e
+  data.tar.gz: 145ecd718a34521c17f0f9e939cb66f964ca0a1f93533969caf7d54426b213a8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ba771c7dbbe3df286475a5586ba2d2b63affe762b7bf504694e6865e39c8e7f5047811ac97285a1edc5d99cd478993fd2be9ee3ae244cc1690975f7ab3f0e779
+  data.tar.gz: 2bd17f25db36a267b5534ff663b8394e7439a1701970e6d23ea295818732133b23c4bb35f82aa1f6f90022edcaceafc47b3fee3f7902fd9028a2ec8ad697a2a4
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.7.0
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,6 +1,6 @@
 # Use memcached to track cycles.
 
-require 'memcache'
+require 'dalli'
 
 # A specialized class using memcached to track items stored. It supports
 # three operations: new, <<, and include? . Together these can be used to
@@ -12,10 +12,11 @@ require 'memcache'
 #   s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
 # end
 class IncludedInMemcached
-  # Construct a new IncludedInMemcached instance.
-  #
-
-
+  # Construct a new IncludedInMemcached instance. The first argument should be
+  # the memcached server address (e.g., 'localhost:11211'). Additional options
+  # can be passed as a hash (see Dalli::Client documentation).
+  def initialize(server, options = {})
+    @c = Dalli::Client.new(server, options)
   end
 
   # Add an item to the memcache.
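The cycle-checker now talks to memcached through the dalli gem rather than the long-unmaintained memcache-client. A minimal usage sketch under the new constructor; the :namespace option here is illustrative, and any Dalli::Client option can be passed through the second argument:

```ruby
require 'spider'
require 'spider/included_in_memcached'

# Track already-seen URLs in memcached via Dalli.
Spider.start_at('http://example.com/') do |s|
  s.check_already_seen_with IncludedInMemcached.new('localhost:11211',
                                                    namespace: 'spider')
end
```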
data/lib/spider/spider_instance.rb
CHANGED
@@ -165,7 +165,6 @@ class SpiderInstance
     trap("SIGINT") { @interrupted = true }
     begin
       next_urls = @next_urls.pop
-      tmp_n_u = {}
       next_urls.each do |prior_url, urls|
         urls = [urls] unless urls.kind_of?(Array)
         urls.map do |a_url|
@@ -176,12 +175,9 @@ class SpiderInstance
           @setup.call(a_url) unless @setup.nil?
           get_page(parsed_url) do |response|
             do_callbacks(a_url, response, prior_url)
-            #tmp_n_u[a_url] = generate_next_urls(a_url, response)
-            #@next_urls.push tmp_n_u
             generate_next_urls(a_url, response).each do |a_next_url|
               @next_urls.push a_url => a_next_url
             end
-            #exit if interrupted
           end
           @teardown.call(a_url) unless @teardown.nil?
           break if @interrupted
@@ -256,7 +252,7 @@ class SpiderInstance
   def do_callbacks(a_url, resp, prior_url) #:nodoc:
     cbs = [@callbacks[:every],
            resp.success? ? @callbacks[:success] : @callbacks[:failure],
-           @callbacks[resp.code]]
+           @callbacks[resp.code.to_i]]
 
     cbs.each do |cb|
       cb.call(a_url, resp, prior_url) if cb
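This small change is what makes numeric status handlers fire: Net::HTTP responses report their code as a String, while `s.on(404)` registers the handler under an Integer key, so the hash lookup never matched before the `to_i`. A short sketch of the mismatch:

```ruby
require 'net/http'
require 'uri'

resp = Net::HTTP.get_response(URI('http://example.com/missing'))
resp.code                 # => "404" -- a String, not an Integer

callbacks = { 404 => proc { |url| puts "not found: #{url}" } }
callbacks[resp.code]      # => nil, the String "404" never matches the key 404
callbacks[resp.code.to_i] # => the registered handler
```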
@@ -264,11 +260,34 @@ class SpiderInstance
   end
 
   def generate_next_urls(a_url, resp) #:nodoc:
+    # Only scan for links if the content-type is HTML or the URL ends with .html
+    content_type = resp['Content-Type'] || resp['content-type'] || ''
+    url_ends_with_html = a_url.downcase.end_with?('.html')
+
+    unless content_type.downcase.include?('text/html') || url_ends_with_html
+      return []
+    end
+
     web_page = resp.body
     base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
                 [a_url[0,a_url.rindex('/')]])[0]
     base_url = remove_trailing_slash(base_url)
-
+
+    # Extract anchor tags with href attributes, respecting rel="nofollow"
+    web_page.scan(/<a\s[^>]*href="([^"]*)"[^>]*>/i).flatten.map do |link|
+      # Get the full anchor tag to check for rel attribute
+      anchor_match = web_page.match(/<a\s[^>]*href="#{Regexp.escape(link)}"[^>]*>/i)
+      next nil unless anchor_match
+
+      anchor_tag = anchor_match[0]
+
+      # Check if this link has rel="nofollow" or similar attributes that should be respected
+      if anchor_tag.match(/rel\s*=\s*["']([^"']*nofollow[^"']*)["']/i) ||
+         anchor_tag.match(/rel\s*=\s*["']([^"']*sponsored[^"']*)["']/i) ||
+         anchor_tag.match(/rel\s*=\s*["']([^"']*ugc[^"']*)["']/i)
+        next nil # Skip links with nofollow, sponsored, or ugc rel attributes
+      end
+
     begin
       parsed_link = URI.parse(link)
       if parsed_link.fragment == '#'
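Taken together, the new guard clauses mean non-HTML responses yield no outgoing links, and anchors marked rel="nofollow", rel="sponsored", or rel="ugc" are dropped. A rough illustration of the rel filter against a hypothetical page, using the same regexes as the diff:

```ruby
html = <<~HTML
  <a href="/about">About</a>
  <a href="/ad" rel="sponsored">Ad</a>
  <a href="/comment" rel="ugc nofollow">Comment</a>
HTML

# Collect hrefs, then drop any whose anchor tag carries a filtered rel value.
links = html.scan(/<a\s[^>]*href="([^"]*)"[^>]*>/i).flatten.reject do |link|
  tag = html.match(/<a\s[^>]*href="#{Regexp.escape(link)}"[^>]*>/i)[0]
  tag =~ /rel\s*=\s*["'][^"']*(nofollow|sponsored|ugc)[^"']*["']/i
end

links # => ["/about"]
```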
@@ -287,14 +306,16 @@ class SpiderInstance
       case parsed_additional_url.scheme
       when nil
         u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
+        # Include port if it's not the default port
+        port_part = (u.port && ((u.scheme == 'http' && u.port != 80) || (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""
         if additional_url[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{additional_url}"
         elsif u.path.nil? || u.path == ''
-          "#{u.scheme}://#{u.host}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{additional_url}"
         elsif u.path[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{u.path}/#{additional_url}"
         else
-          "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{u.path}/#{additional_url}"
         end
       else
         additional_url
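The port_part addition keeps non-default ports when resolving scheme-less links; previously a crawl of a site on, say, port 8888 generated next URLs pointing back at port 80. A quick before/after:

```ruby
require 'uri'

u = URI.parse('http://localhost:8888/foo')
port_part = (u.port && ((u.scheme == 'http' && u.port != 80) ||
             (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""

"#{u.scheme}://#{u.host}/bar"             # => "http://localhost/bar" (old: port dropped)
"#{u.scheme}://#{u.host}#{port_part}/bar" # => "http://localhost:8888/bar"
```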
data/lib/spider.rb
CHANGED
@@ -4,8 +4,9 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
 # links, and doing it all over again.
 class Spider
 
-
-
+  VERSION = File.read(
+    File.expand_path('../VERSION', __dir__)
+  ).strip.freeze
 
   def self.version
     VERSION
metadata
CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.5.4
+  version: 0.7.0
 platform: ruby
 authors:
 - John Nagro
-autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Web spidering library: handles robots.txt, scraping, finding more
@@ -18,10 +17,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- AUTHORS
-- CHANGES
-- LICENSE
-- README.md
+- VERSION
 - lib/spider.rb
 - lib/spider/included_in_file.rb
 - lib/spider/included_in_memcached.rb
@@ -29,17 +25,12 @@ files:
 - lib/spider/next_urls_in_sqs.rb
 - lib/spider/robot_rules.rb
 - lib/spider/spider_instance.rb
-- spec/spec_helper.rb
-- spec/spider/included_in_memcached_spec.rb
-- spec/spider/included_in_redis_spec.rb
-- spec/spider/spider_instance_spec.rb
-- spec/spider_spec.rb
-- spider.gemspec
 homepage: https://github.com/johnnagro/spider
 licenses:
 - MIT
-metadata:
-
+metadata:
+  source_code_uri: https://github.com/johnnagro/spider
+  changelog_uri: https://github.com/johnnagro/spider/blob/main/CHANGELOG.md
 rdoc_options: []
 require_paths:
 - lib
@@ -47,16 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
 requirements:
 - - ">="
   - !ruby/object:Gem::Version
-    version: '
+    version: '2.5'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-rubygems_version: 2.7.6
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: A Web spidering library
 test_files: []
data/AUTHORS
DELETED
@@ -1,17 +0,0 @@
-The Ruby Spider Gem would not be what it is today without the help of
-the following kind souls:
-
-Alexandre Rousseau
-Brian Campbell
-Henri Cook
-James Edward Gray II
-Jeremy Evans
-Joao Eriberto Mota Filho
-John Buckley
-John Nagro
-Matt Horan
-Marc (@brigriffin)
-Mike Burns (original author)
-Olle Jonsson
-Sander van der Vliet
-Stuart Yamartino
data/CHANGES
DELETED
@@ -1,68 +0,0 @@
-2018-04-23 v0.5.3
-  * release simply to add missing CHANGES notes
-
-2018-04-23 v0.5.2
-  * fixed #2 thanks to @jeremyevans
-  * added Redis as cache wrapper thanks to @brigriffin
-
-2016-09-04 v0.5.1
-  * added the ability to stop a crawl
-
-2016-05-13 v0.5.0
-  * fixed #1 thanks to @eribertomota
-  * got it running on more recent versions of ruby
-  * cleaned up the docs a bit
-  * cleaned up the licensing and attribution
-
-2009-05-21
-  * fixed an issue with robots.txt on ssl hosts
-  * fixed an issue with pulling robots.txt from disallowed hosts
-  * fixed a documentation error with ExpiredLinks
-  * Many thanks to Brian Campbell
-
-2008-10-09
-  * fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley
-
-2008-07-06
-  * Trap interrupts and shutdown gracefully
-  * Support for custom urls-to-crawl objects
-  * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
-
-2007-11-09:
-  * Handle redirects that assume a base URL.
-
-2007-11-08:
-  * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
-    spider subdirectory.
-
-2007-11-02:
-  * Memcached support.
-
-2007-10-31:
-  * Add `setup' and `teardown' handlers.
-  * Can set the headers for a HTTP request.
-  * Changed :any to :every .
-  * Changed the arguments to the :every, :success, :failure, and code handler.
-
-2007-10-23:
-  * URLs without a page component but with a query component.
-  * HTTP Redirect.
-  * HTTPS.
-  * Version 0.2.1 .
-
-2007-10-22:
-  * Use RSpec to ensure that it mostly works.
-  * Use WEBrick to create a small test server for additional testing.
-  * Completely re-do the API to prepare for future expansion.
-  * Add the ability to apply each URL to a series of custom allowed?-like
-    matchers.
-  * BSD license.
-  * Version 0.2.0 .
-
-2007-03-30:
-  * Clean up the documentation.
-
-2007-03-28:
-  * Change the tail recursion to a `while' loop, to please Ruby.
-  * Documentation.
-  * Initial release: version 0.1.0 .
data/LICENSE
DELETED
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2007-2016 Spider Team Authors
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
data/README.md
DELETED
@@ -1,175 +0,0 @@
-
-# Spider
-_a Web spidering library for Ruby. It handles the robots.txt,
-scraping, collecting, and looping so that you can just handle the data._
-
-## Examples
-
-### Crawl the Web, loading each page in turn, until you run out of memory
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') {}
-```
-
-### To handle erroneous responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :failure do |a_url, resp, prior_url|
-    puts "URL failed: #{a_url}"
-    puts " linked from #{prior_url}"
-  end
-end
-```
-
-### Or handle successful responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :success do |a_url, resp, prior_url|
-    puts "#{a_url}: #{resp.code}"
-    puts resp.body
-    puts
-  end
-end
-```
-
-### Limit to just one domain
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check do |a_url|
-    a_url =~ %r{^http://cashcats.biz.*}
-  end
-end
-```
-
-### Pass headers to some requests
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.setup do |a_url|
-    if a_url =~ %r{^http://.*wikipedia.*}
-      headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-    end
-  end
-end
-```
-
-### Use memcached to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_memcached'
-SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInMemcached.new(SERVERS)
-end
-```
-
-### Use Redis to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_redis'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
-end
-```
-
-### Use Plain text to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_redis'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
-end
-```
-
-### Track cycles with a custom object
-
-```ruby
-require 'spider'
-class ExpireLinks < Hash
-  def <<(v)
-    self[v] = Time.now
-  end
-  def include?(v)
-    self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
-  end
-end
-
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with ExpireLinks.new
-end
-```
-
-### Store nodes to visit with Amazon SQS
-
-```ruby
-require 'spider'
-require 'spider/next_urls_in_sqs'
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
-end
-```
-
-### Store nodes to visit with a custom object
-
-```ruby
-require 'spider'
-class MyArray < Array
-  def pop
-    super
-  end
-
-  def push(a_msg)
-    super(a_msg)
-  end
-end
-
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with MyArray.new
-end
-```
-
-### Create a URL graph
-
-```ruby
-require 'spider'
-nodes = {}
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
-
-  s.on(:every) do |a_url, resp, prior_url|
-    nodes[prior_url] ||= []
-    nodes[prior_url] << a_url
-  end
-end
-```
-
-### Use a proxy
-
-```ruby
-require 'net/http_configuration'
-require 'spider'
-http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
-                                         :proxy_port => 8881)
-http_conf.apply do
-  Spider.start_at('http://img.4chan.org/b/') do |s|
-    s.on(:success) do |a_url, resp, prior_url|
-      File.open(a_url.gsub('/',':'),'w') do |f|
-        f.write(resp.body)
-      end
-    end
-  end
-end
-```
-
-_Copyright (c) 2007-2016 Spider Team Authors_
data/spec/spec_helper.rb
DELETED
@@ -1,90 +0,0 @@
-require 'rubygems'
-require 'webrick'
-require 'spec'
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-def local_require(*files)
-  files.each do |file|
-    require File.dirname(__FILE__)+'/../lib/'+file
-  end
-end
-
-class BeStaticServerPages
-  def initialize
-    @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
-    @actual = nil
-  end
-
-  attr :actual, true
-
-  def matches?(actual)
-    @actual = actual
-    actual == @pages
-  end
-
-  def failure_message
-    "expected #{@pages.inspect}, got #{@actual.inspect}"
-  end
-
-  def description
-    "be the pages returned by the static server (#{@pages.inspect})"
-  end
-end
-
-def with_web_server(svlt)
-  server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
-                                   :AccessLog => [])
-  server.mount('/', svlt)
-  Thread.new {server.start}
-  begin
-    yield
-  ensure
-    server.shutdown
-  end
-end
-
-def with_memcached
-  system('memcached -d -P /tmp/spider-memcached.pid')
-  cacher = IncludedInMemcached.new('localhost:11211')
-  begin
-    yield
-  ensure
-    system('kill -KILL `cat /tmp/spider-memcached.pid`')
-  end
-end
-
-def be_static_server_pages
-  BeStaticServerPages.new
-end
-
-class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/plain'
-    res.body = "response\n"
-  end
-end
-
-class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/html'
-    if req.path == '/foo'
-      res.body = <<-END
-        <a href="/">a</a>
-      END
-    else
-      res.body = <<-END
-        <a href="/foo">b</a>
-      END
-    end
-  end
-end
-
-def null_logger
-  l = stub
-  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
-    l.stubs(k)
-    l.stubs("#{k}?".to_sym)
-  end
-  l
-end
data/spec/spider/included_in_memcached_spec.rb
DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_memcached
-  local_require 'spider/included_in_memcached'
-  system('memcached -d -P /tmp/spider-memcached.pid')
-end
-
-def after_specing_memcached
-  system('kill -KILL `cat /tmp/spider-memcached.pid`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_memcached
-  end
-
-  it 'should understand <<' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_memcached
-  end
-end
data/spec/spider/included_in_redis_spec.rb
DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_redis
-  local_require 'spider/included_in_redis'
-  system('redis-server 127.0.0.1:6379')
-end
-
-def after_specing_redis
-  system('kill -KILL `pidof redis-server`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_redis
-  end
-
-  it 'should understand <<' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_redis
-  end
-end
data/spec/spider/spider_instance_spec.rb
DELETED
@@ -1,405 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-require 'webrick'
-require 'webrick/https'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'SpiderInstance' do
-  # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
-  # URL. Bug reported by Henri Cook.
-  it 'should construct a complete redirect URL' do
-    @response_called = false
-    redirected_resp = stub(:redirect? => true,
-                           :[] => '/default.htm')
-    success_resp = stub(:redirect? => false)
-    http_req = stub(:request => true)
-    http_mock_redir = stub(:use_ssl= => true)
-    http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
-    http_mock_success = stub(:use_ssl= => true)
-    http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
-    Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
-      returns(http_mock_success)
-    si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
-    si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
-      @response_called = true
-    end
-    @response_called.should be_true
-  end
-
-  it 'should prevent cycles with an IncludedInMemcached' do
-    with_memcached do
-      cacher = IncludedInMemcached.new('localhost:11211')
-      it_should_prevent_cycles_with(cacher)
-    end
-  end
-
-  it 'should prevent cycles with an Array' do
-    cacher = Array.new
-    it_should_prevent_cycles_with(cacher)
-  end
-
-  it 'should call the "setup" callback before loading the Web page' do
-    mock_successful_http
-    @on_called = false
-    @before_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup { |*a| @before_called = Time.now }
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should < @on_called
-  end
-
-  it 'should call the "teardown" callback after running all other callbacks' do
-    mock_successful_http
-    @on_called = false
-    @after_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.teardown { |*a| @after_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should > @on_called
-  end
-
-  it 'should pass headers set by a setup handler to the HTTP request' do
-    mock_successful_http
-    Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
-    si = SpiderInstance.new(nil => ['http://example.com/foo'])
-    si.stubs(:allowable_url?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup do |a_url|
-      si.headers['X-Header-Set'] = 'True'
-    end
-    si.teardown do |a_url|
-      si.clear_headers
-    end
-    si.start!
-  end
-
-  it 'should call the :every callback with the current URL, the response, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:every)
-  end
-
-  it 'should call the :success callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:success)
-  end
-
-  it 'should call the :failure callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(:failure)
-  end
-
-  it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(404)
-  end
-
-  it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(200)
-  end
-
-  # Bug reported by John Nagro, using the example source http://eons.com/
-  # had to change line 192; uses request_uri now instead of path.
-  it 'should handle query URLs without a path' do
-    u = 'http://localhost:8888?s=1'
-    u_p = URI.parse(u)
-    @block_called = false
-    with_web_server(QueryServlet) do
-      si = SpiderInstance.new({nil => [u]})
-      si.get_page(u_p) do
-        @block_called = true
-      end
-    end
-    @block_called.should be_true
-  end
-
-  # This solves a problem reported by John Nagro.
-  it 'should handle redirects' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    @redirect_handled = false
-    mock_redirect_http
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) do
-      @redirect_handled = true
-    end
-    @redirect_handled.should be_true
-  end
-
-  it 'should handle HTTPS' do
-    u = 'https://localhost:10443/'
-    u_p = URI.parse(u)
-    @page_called = false
-    server = WEBrick::HTTPServer.new(:Port => 10443,
-                                     :Logger => null_logger,
-                                     :AccessLog => [],
-                                     :SSLEnable => true,
-                                     :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
-                                     :SSLComment => 'Comment of some sort')
-    server.mount('/', QueryServlet)
-    Thread.new {server.start}
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) { @page_called = true }
-    server.shutdown
-    @page_called.should be_true
-  end
-
-  it 'should skip URLs when allowable_url? is false' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(false)
-    si.expects(:get_page).times(0)
-    si.start!
-  end
-
-  it 'should not skip URLs when allowable_url? is true' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(true)
-    si.expects(:get_page).with(URI.parse(u))
-    si.start!
-  end
-
-  it 'should disallow URLs when the robots.txt says to' do
-    robot_rules = stub
-    SpiderInstance.any_instance.expects(:open).
-      with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
-           'Accept' => 'text/html,text/xml,application/xml,text/plain').
-      yields(stub(:read => 'robots.txt content'))
-    robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
-                                     'robots.txt content')
-    robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
-    si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should disallow URLs when they fail any url_check' do
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check { |a_url| false }
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should support multiple url_checks' do
-    @first_url_check = false
-    @second_url_check = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check do |a_url|
-      @first_url_check = true
-      true
-    end
-    si.add_url_check do |a_url|
-      @second_url_check = true
-      false
-    end
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-    @first_url_check.should be_true
-    @second_url_check.should be_true
-  end
-
-  it 'should avoid cycles' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    si = SpiderInstance.new({nil => [u]}, [u_p])
-    si.stubs(:allowed?).returns(true)
-    allowable = si.allowable_url?(u, u_p)
-    allowable.should be_false
-    u_p.should_not be_nil
-  end
-
-  it 'should call the 404 handler for 404s' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(404) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :success handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should not call the :success handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :success handler and the 200 handler on 200' do
-    @proc_200_called = false
-    @proc_success_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_success_called = true}
-    si.on(200) {|*a| @proc_200_called = true}
-    si.start!
-    @proc_200_called.should be_true
-    @proc_success_called.should be_true
-  end
-
-  it 'should not call the :failure handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :failure handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :failure handler and the 404 handler on 404' do
-    @proc_404_called = false
-    @proc_failure_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_failure_called = true}
-    si.on(404) {|*a| @proc_404_called = true}
-    si.start!
-    @proc_404_called.should be_true
-    @proc_failure_called.should be_true
-  end
-
-  it 'should call the :every handler even when a handler for the error code is defined' do
-    @any_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @any_called = true }
-    si.on(202) {|*a|}
-    si.start!
-    @any_called.should be_true
-  end
-
-  it 'should support a block as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @proc_called = true }
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should support a proc as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every, Proc.new { |*a| @proc_called = true })
-    si.start!
-    @proc_called.should be_true
-  end
-
-  def mock_http(http_req)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    Net::HTTP.expects(:new).returns(http_obj)
-  end
-
-  def mock_successful_http
-    http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
-    mock_http(http_req)
-  end
-
-  def mock_failed_http
-    http_req = stub(:redirect? => false, :success? => false, :code => 404)
-    mock_http(http_req)
-  end
-
-  def mock_redirect_http
-    http_req = stub(:redirect? => true, :success? => false, :code => 404)
-    http_req.expects(:[]).with('Location').returns('http://example.com/')
-    http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    http_obj2 = mock(:use_ssl= => true)
-    http_obj2.expects(:start).
-      yields(mock(:request => http_req2)).returns(http_req2)
-    Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
-  end
-
-  def callback_arguments_on(code)
-    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(code) do |a_url, resp, prior_url|
-      a_url.should == 'http://example.com/'
-      resp.should_not be_nil
-      prior_url.should == 'http://foo.com/'
-    end
-    si.start!
-  end
-
-  def it_should_prevent_cycles_with(cacher)
-    u = 'http://localhost:8888/'
-    u_p = URI.parse(u)
-    u2 = 'http://localhost:8888/foo'
-    u_p2 = URI.parse(u2)
-
-    with_web_server(LoopingServlet) do
-      si = SpiderInstance.new(nil => [u])
-      si.check_already_seen_with cacher
-      si.start!
-    end
-  end
-end
data/spec/spider_spec.rb
DELETED
@@ -1,33 +0,0 @@
-require File.dirname(__FILE__)+'/spec_helper'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'Spider' do
-  it 'should find two pages without cycles using defaults' do
-    u = []
-    with_web_server(LoopingServlet) do
-      u = find_pages_with_static_server
-    end
-    u.should be_static_server_pages
-  end
-
-  it 'should find two pages without cycles using memcached' do
-    u = []
-    with_web_server(LoopingServlet) do
-      with_memcached do
-        u = find_pages_with_static_server do |s|
-          s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
-        end
-      end
-    end
-    u.should be_static_server_pages
-  end
-
-  def find_pages_with_static_server(&block)
-    pages = []
-    Spider.start_at('http://localhost:8888/') do |s|
-      block.call(s) unless block.nil?
-      s.on(:every){ |u,r,p| pages << u }
-    end
-    pages
-  end
-end
data/spider.gemspec
DELETED
@@ -1,20 +0,0 @@
-require 'rubygems'
-
-require File.expand_path('../lib/spider', __FILE__)
-
-spec = Gem::Specification.new do |s|
-  s.author = 'John Nagro'
-  s.email = 'john.nagro@gmail.com'
-  s.license = 'MIT'
-  s.has_rdoc = true
-  s.homepage = 'https://github.com/johnnagro/spider'
-  s.name = 'spider'
-  s.summary = 'A Web spidering library'
-  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
-  s.require_path = 'lib'
-  s.description = <<-EOF
-A Web spidering library: handles robots.txt, scraping, finding more
-links, and doing it all over again.
-  EOF
-  s.version = Spider::VERSION
-end