webrobots 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots.rb +21 -9
- data/test/test_webrobots.rb +43 -2
- data/webrobots.gemspec +1 -1
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.4
+0.0.5
data/lib/webrobots.rb
CHANGED
@@ -13,8 +13,10 @@ class WebRobots
   #
   # * :http_get => a custom method, proc, or anything that responds to
   #   .call(uri), to be used for fetching robots.txt. It must return
-  #   the response body if successful
-  #
+  #   the response body if successful. If the resource is not found,
+  #   it must either return nil or emulate a Net::HTTPNotFound error
+  #   that the net/http library would raise, using
+  #   Net::HTTPServerException. Any other error raised is regarded as
   #   blanket ban.
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
@@ -26,6 +28,12 @@ class WebRobots
     @robotstxt = {}
   end
 
+  @@anon_parser = RobotsTxt::Parser.new('Anonymous')
+  @@disallower = @@anon_parser.parse(<<-TXT, nil)
+User-Agent: *
+Disallow: /
+  TXT
+
   # Returns the robot name initially given.
   attr_reader :user_agent
 
@@ -95,16 +103,20 @@ class WebRobots
   def robots_txt(site)
     cache_robots_txt(site) {
       fetch_robots_txt(site)
-    }
+    } or @@disallower
   end
 
   def fetch_robots_txt(site)
-
-
-
-
-
-
+    body =
+      begin
+        @http_get.call(site + 'robots.txt')
+      rescue => e
+        if e.is_a?(Net::HTTPExceptions) && e.response.is_a?(Net::HTTPNotFound)
+          ''
+        else
+          nil
+        end
+      end and @parser.parse(body, site)
   end
 
   def cache_robots_txt(site, &block)
data/test/test_webrobots.rb
CHANGED
@@ -25,7 +25,9 @@ class TestWebRobots < Test::Unit::TestCase
 
 TXT
           when 'http://site5.example.org/robots.txt'
-            raise Net::
+            raise Net::HTTPServerException.new(
+              'Not Found',
+              Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
           else
             raise "#{uri} is not supposed to be fetched"
           end
@@ -41,6 +43,42 @@ class TestWebRobots < Test::Unit::TestCase
       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
       assert @robots.allowed?('http://site4.example.org/index.html')
       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site5.example.org/index.html')
+      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt that cannot be fetched" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          case uri.to_s
+          when 'http://site1.example.org/robots.txt'
+            raise Net::HTTPFatalError.new(
+              'Internal Server Error',
+              Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
+          when 'http://site2.example.org/robots.txt'
+            raise Net::HTTPRetriableError.new(
+              'Found',
+              Net::HTTPFound.new('1.1', '302', 'Found'))
+          when 'http://site3.example.org/robots.txt'
+            raise Errno::ECONNREFUSED
+          when 'http://site4.example.org/robots.txt'
+            raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
+          else
+            raise "#{uri} is not supposed to be fetched"
+          end
+        })
+    end
+
+    should "disallow any robot" do
+      assert @robots.disallowed?('http://site1.example.org/index.html')
+      assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site2.example.org/index.html')
+      assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site3.example.org/index.html')
+      assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.disallowed?('http://site4.example.org/index.html')
+      assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
     end
   end
 
@@ -264,9 +302,12 @@ Crawl-Delay: 1.5
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
+# These are wrong but should be allowed
+Allow: /2heavy/%
+Crawl-Delay:
+#
 Option1: Bar
 Option3: Hi
-Crawl-Delay:
 TXT
           else
             raise "#{uri} is not supposed to be fetched"
data/webrobots.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 21
   prerelease:
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Akinori MUSHA
|