spider 0.4.4 → 0.5.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +7 -0
- data/AUTHORS +12 -0
- data/CHANGES +6 -0
- data/LICENSE +21 -0
- data/{README → README.md} +50 -43
- data/lib/spider.rb +12 -29
- data/lib/spider/included_in_memcached.rb +1 -24
- data/lib/spider/next_urls_in_sqs.rb +6 -29
- data/lib/spider/robot_rules.rb +61 -57
- data/lib/spider/spider_instance.rb +8 -31
- data/spider.gemspec +4 -2
- metadata +33 -124
- data/doc/classes/BeStaticServerPages.html +0 -197
- data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
- data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
- data/doc/classes/IncludedInMemcached.html +0 -199
- data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
- data/doc/classes/LoopingServlet.html +0 -137
- data/doc/classes/LoopingServlet.src/M000037.html +0 -23
- data/doc/classes/NextUrlsInSQS.html +0 -204
- data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
- data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
- data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
- data/doc/classes/QueryServlet.html +0 -137
- data/doc/classes/QueryServlet.src/M000038.html +0 -19
- data/doc/classes/RobotRules.html +0 -175
- data/doc/classes/RobotRules.src/M000034.html +0 -19
- data/doc/classes/RobotRules.src/M000035.html +0 -67
- data/doc/classes/RobotRules.src/M000036.html +0 -24
- data/doc/classes/Spider.html +0 -170
- data/doc/classes/Spider.src/M000029.html +0 -21
- data/doc/classes/SpiderInstance.html +0 -345
- data/doc/classes/SpiderInstance.src/M000021.html +0 -18
- data/doc/classes/SpiderInstance.src/M000022.html +0 -22
- data/doc/classes/SpiderInstance.src/M000023.html +0 -22
- data/doc/classes/SpiderInstance.src/M000024.html +0 -24
- data/doc/classes/SpiderInstance.src/M000025.html +0 -18
- data/doc/classes/SpiderInstance.src/M000026.html +0 -18
- data/doc/classes/SpiderInstance.src/M000027.html +0 -18
- data/doc/classes/SpiderInstance.src/M000028.html +0 -18
- data/doc/created.rid +0 -1
- data/doc/files/README.html +0 -223
- data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
- data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
- data/doc/files/lib/spider/robot_rules_rb.html +0 -114
- data/doc/files/lib/spider/spider_instance_rb.html +0 -117
- data/doc/files/lib/spider_rb.html +0 -254
- data/doc/files/spec/spec_helper_rb.html +0 -196
- data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
- data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
- data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
- data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
- data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
- data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
- data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
- data/doc/files/spec/spider_spec_rb.html +0 -127
- data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
- data/doc/fr_class_index.html +0 -34
- data/doc/fr_file_index.html +0 -35
- data/doc/fr_method_index.html +0 -64
- data/doc/index.html +0 -24
- data/doc/rdoc-style.css +0 -208
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1d6465ee9f80195a1002053f826f1b80187020a3
+  data.tar.gz: 1218142b1d76482cf5baccd1f288934cd7a6b003
+SHA512:
+  metadata.gz: 2725ca0197ec2801836d94615e4ece0196c131a9ff500ed5837c22e320e06b33a8f609add7d41eabb8fa19114a60af71057b5bdebaf8f94e2be116148d6ad123
+  data.tar.gz: 5497c85e9759542ecb0cbb612484de0b185f7428c5a2c5222e1fbc7e1e3f69bac727bfddd883967c5eeb6c5bfaca0b9dfbe130eaaed35cc9e8cb96fb87abddc5
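The two archives named here, metadata.gz and data.tar.gz, are the members of the .gem tar package that RubyGems checksums. A minimal verification sketch, assuming a local copy named spider-0.5.0.gem (the filename is the assumption here):

```ruby
# Recompute the SHA512 digests above from a local copy of the package.
# A .gem file is a plain tar archive containing metadata.gz and data.tar.gz.
require 'digest'
require 'rubygems/package'

File.open('spider-0.5.0.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end
```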
data/AUTHORS
ADDED
@@ -0,0 +1,12 @@
+The Ruby Spider Gem would not be what it is today without the help of
+the following kind souls:
+
+Brian Campbell
+Henri Cook
+James Edward Gray II
+Joao Eriberto Mota Filho
+John Buckley
+John Nagro
+Mike Burns
+Matt Horan
+Sander van der Vliet
data/CHANGES
CHANGED
@@ -1,3 +1,9 @@
+2016-05-13
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
 2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2007-2016 Spider Team Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/{README → README.md}
RENAMED
@@ -1,66 +1,80 @@
 
-Spider
-
+# Spider
+_a Web spidering library for Ruby. It handles the robots.txt,
+scraping, collecting, and looping so that you can just handle the data._
 
-
+## Examples
 
-
+### Crawl the Web, loading each page in turn, until you run out of memory
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') {}
+```
 
-
+### To handle erroneous responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :failure do |a_url, resp, prior_url|
     puts "URL failed: #{a_url}"
     puts " linked from #{prior_url}"
   end
 end
+```
 
-
+### Or handle successful responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :success do |a_url, resp, prior_url|
     puts "#{a_url}: #{resp.code}"
     puts resp.body
     puts
   end
 end
+```
 
-
+### Limit to just one domain
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.add_url_check do |a_url|
-    a_url =~ %r{^http://
+    a_url =~ %r{^http://cashcats.biz.*}
   end
 end
+```
 
-
+### Pass headers to some requests
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.setup do |a_url|
     if a_url =~ %r{^http://.*wikipedia.*}
       headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
     end
   end
 end
+```
 
-
+### Use memcached to track cycles
 
+```ruby
 require 'spider'
 require 'spider/included_in_memcached'
 SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
 end
+```
 
-
+### Track cycles with a custom object
 
+```ruby
 require 'spider'
 class ExpireLinks < Hash
   def <<(v)
@@ -71,50 +85,58 @@ scraping, collecting, and looping so that you can just handle the data.
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with ExpireLinks.new
 end
+```
 
-
+### Store nodes to visit with Amazon SQS
 
+```ruby
 require 'spider'
 require 'spider/next_urls_in_sqs'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
 end
+```
 
-
+### Store nodes to visit with a custom object
 
+```ruby
 require 'spider'
 class MyArray < Array
   def pop
     super
   end
-
+
   def push(a_msg)
     super(a_msg)
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with MyArray.new
 end
+```
 
-
+### Create a URL graph
 
+```ruby
 require 'spider'
 nodes = {}
-Spider.start_at('http://
-  s.add_url_check {|a_url| a_url =~ %r{^http://
+Spider.start_at('http://cashcats.biz/') do |s|
+  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
 
   s.on(:every) do |a_url, resp, prior_url|
     nodes[prior_url] ||= []
     nodes[prior_url] << a_url
   end
 end
+```
 
-
+### Use a proxy
 
+```ruby
 require 'net/http_configuration'
 require 'spider'
 http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
@@ -128,19 +150,4 @@ scraping, collecting, and looping so that you can just handle the data.
     end
   end
 end
-
-== Author
-
-John Nagro john.nagro@gmail.com
-
-Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
-Many thanks to:
-Matt Horan
-Henri Cook
-Sander van der Vliet
-John Buckley
-Brian Campbell
-
-With `robot_rules' from James Edward Gray II via
-http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+```
data/lib/spider.rb
CHANGED
@@ -1,41 +1,24 @@
-# Copyright 2007-2008 Mike Burns & John Nagro
-# :include: README
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 require File.dirname(__FILE__)+'/spider/spider_instance'
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
 class Spider
+
+  VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
+  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+  def self.version
+    VERSION
+  end
+
   # Runs the spider starting at the given URL. Also takes a block that is given
   # the SpiderInstance. Use the block to define the rules and handlers for
   # the discovered Web pages. See SpiderInstance for the possible rules and
   # handlers.
   #
-  #  Spider.start_at('http://
+  #  Spider.start_at('http://cashcats.biz/') do |s|
   #    s.add_url_check do |a_url|
-  #      a_url =~ %r{^http://
+  #      a_url =~ %r{^http://cashcats.biz.*}
   #    end
   #
   #    s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@
   #  end
 
   def self.start_at(a_url, &block)
-    rules = RobotRules.new(
-    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+    rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+    a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
    block.call(a_spider)
     a_spider.start!
   end
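The additions above give the gem an introspectable version constant and seed the crawl with a one-element URL list ({nil => [a_url]}). A short usage sketch of the resulting API, with the URL and handler borrowed from the README examples:

```ruby
require 'spider'

puts Spider.version  # => "0.5.0"

# start_at builds the RobotRules and SpiderInstance shown above, hands
# the instance to the block, then crawls.
Spider.start_at('http://cashcats.biz/') do |s|
  s.on :success do |a_url, resp, prior_url|
    puts "#{a_url}: #{resp.code}"
  end
end
```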
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,32 +1,9 @@
 # Use memcached to track cycles.
-#
-# Copyright 2007 Mike Burns
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'memcache'
 
 # A specialized class using memcached to track items stored. It supports
-# three operations: new, <<, and include? . Together these can be used to
+# three operations: new, <<, and include? . Together these can be used to
 # add items to the memcache, then determine whether the item has been added.
 #
 # To use it with Spider use the check_already_seen_with method:
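As the surviving comment notes, the class only needs to support new, <<, and include?; any object with that interface can be passed to check_already_seen_with. A minimal sketch substituting Ruby's built-in Set (an assumption for illustration, not code from the gem):

```ruby
require 'set'
require 'spider'

# Set responds to << and include?, so it satisfies the same contract as
# IncludedInMemcached (though it will not survive a process restart).
Spider.start_at('http://cashcats.biz/') do |s|
  s.check_already_seen_with Set.new
end
```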
data/lib/spider/next_urls_in_sqs.rb
CHANGED
@@ -1,34 +1,11 @@
 # Use AmazonSQS to track nodes to visit.
-#
-# Copyright 2008 John Nagro
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'rubygems'
 require 'right_aws'
 require 'yaml'
 
 # A specialized class using AmazonSQS to track nodes to walk. It supports
-# two operations: push and pop . Together these can be used to
+# two operations: push and pop . Together these can be used to
 # add items to the queue, then pull items off the queue.
 #
 # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
     @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
     @queue = @sqs.queue(queue_name)
   end
-
-  # Pull an item off the queue, loop until data is found. Data is
+
+  # Pull an item off the queue, loop until data is found. Data is
   # encoded with YAML.
   def pop
     while true
@@ -57,10 +34,10 @@ class NextUrlsInSQS
       sleep 5
     end
   end
-
+
   # Put data on the queue. Data is encoded with YAML.
   def push(a_msg)
     encoded_message = YAML::dump(a_msg)
     @queue.push(a_msg)
-  end
-end
+  end
+end
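The comments above pin down the queue contract: push puts a YAML-encoded message on the queue, and pop loops until it can pull one off. A hypothetical in-memory stand-in with the same surface, for illustration only (NextUrlsInMemory is not part of the gem):

```ruby
require 'yaml'

# Hypothetical drop-in for NextUrlsInSQS: same push/pop surface, backed
# by an in-process Array instead of an SQS queue.
class NextUrlsInMemory
  def initialize
    @queue = []
  end

  # Put data on the queue, encoded with YAML.
  def push(a_msg)
    @queue.push(YAML.dump(a_msg))
  end

  # Pull the next item off the queue and decode it; nil when empty.
  def pop
    encoded_message = @queue.shift
    encoded_message && YAML.load(encoded_message)
  end
end
```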
data/lib/spider/robot_rules.rb
CHANGED
@@ -1,77 +1,81 @@
-
+#!/usr/local/bin/ruby -w
 
+# robot_rules.rb
+#
 # Created by James Edward Gray II on 2006-01-31.
 # Copyright 2006 Gray Productions. All rights reserved.
+# https://github.com/eribertomota/robot_rules.rb
+# https://github.com/johnnagro/spider/issues/1
 
 require "uri"
 
 # Based on Perl's WWW::RobotRules module, by Gisle Aas.
 class RobotRules
-
-
-
-
+  def initialize( user_agent )
+    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                   "").downcase
+    @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+  end
 
-
-
-
-
+  def parse( text_uri, robots_data )
+    uri = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    @rules.delete(location)
 
-
-
-
-
-
-
-
-
-
-break unless my_rules.empty?
+    rules = robots_data.split(/[\015\012]+/).
+            map { |rule| rule.sub(/\s*#.*$/, "") }
+    anon_rules = Array.new
+    my_rules = Array.new
+    current = anon_rules
+    rules.each do |rule|
+      case rule
+      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+        break unless my_rules.empty?
 
-
-
-
-
-
-
-
-
-
+        current = if $1 == "*"
+                    anon_rules
+                  elsif $1.downcase.index(@user_agent)
+                    my_rules
+                  else
+                    nil
+                  end
+      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+        next if current.nil?
 
-
-
-
-
+        if $1.empty?
+          current << nil
+        else
+          disallow = URI.parse($1)
 
-
-
-
-
-
+          next unless disallow.scheme.nil? or disallow.scheme ==
+                      uri.scheme
+          next unless disallow.port.nil? or disallow.port == uri.port
+          next unless disallow.host.nil? or
+                      disallow.host.downcase == uri.host.downcase
 
-
-
-
+          disallow = disallow.path
+          disallow = "/" if disallow.empty?
+          disallow = "/#{disallow}" unless disallow[0] == ?/
 
-
-
-
-
+          current << disallow
+        end
+      end
+    end
 
-
-
-
-
-
-
+    @rules[location] = if my_rules.empty?
+                         anon_rules.compact
+                       else
+                         my_rules.compact
+                       end
+  end
 
-
-
-
-
+  def allowed?( text_uri )
+    uri = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    path = uri.path
 
-
+    return true unless %w{http https}.include?(uri.scheme)
 
-
-
+    not @rules[location].any? { |rule| path.index(rule) == 0 }
+  end
 end
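A standalone sketch of the rewritten class, constructed the same way Spider.start_at does above; the robots.txt body and URLs are illustrative:

```ruby
require 'spider/robot_rules'

# Same construction Spider.start_at uses above.
rules = RobotRules.new("Ruby Spider 0.5.0")

# Feed it a robots.txt body fetched for the host (this body is made up).
rules.parse("http://cashcats.biz/robots.txt",
            "User-Agent: *\nDisallow: /admin\n")

rules.allowed?("http://cashcats.biz/")      # => true
rules.allowed?("http://cashcats.biz/admin") # => false
```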
|