webrobots 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots.rb +47 -36
- data/lib/webrobots/robotstxt.rb +23 -4
- data/lib/webrobots/robotstxt.ry +23 -4
- data/test/test_webrobots.rb +96 -17
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
  1     | - 0.0.5
      1 | + 0.0.6
data/lib/webrobots.rb
CHANGED
@@ -13,11 +13,9 @@ class WebRobots
 13  13 |   #
 14  14 |   # * :http_get => a custom method, proc, or anything that responds to
 15  15 |   #   .call(uri), to be used for fetching robots.txt. It must return
 16     | -  #   the response body if successful
 17     | -  #
 18     | -  #
 19     | -  #   Net::HTTPServerException. Any other error raised is regarded as
 20     | -  #   blanket ban.
     16 | +  #   the response body if successful, return an empty string if the
     17 | +  #   resource is not found, and return nil or raise any error on
     18 | +  #   failure. Redirects should be handled within this proc.
 21  19 |   def initialize(user_agent, options = nil)
 22  20 |     @user_agent = user_agent
 23  21 |     @parser = RobotsTxt::Parser.new(user_agent)

@@ -25,14 +23,13 @@ class WebRobots
 25  23 |     options ||= {}
 26  24 |     @http_get = options[:http_get] || method(:http_get)
 27  25 |
 28     | -    @robotstxt =
     26 | +    @robotstxt = create_cache()
 29  27 |   end
 30  28 |
 31     | -
 32     | -
 33     | -
 34     | -
 35     | -  TXT
     29 | +  # :nodoc:
     30 | +  def create_cache
     31 | +    Hash.new # Must respond to [], []=, and delete.
     32 | +  end
 36  33 |
 37  34 |   # Returns the robot name initially given.
 38  35 |   attr_reader :user_agent

@@ -42,9 +39,9 @@ Disallow: /
 42  39 |   # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
 43  40 |   # raised.
 44  41 |   def allowed?(url)
 45     | -
     42 | +    robots_txt, request_uri = evaluate(url)
 46  43 |     return true if request_uri == '/robots.txt'
 47     | -    robots_txt
     44 | +    robots_txt.allow?(request_uri)
 48  45 |   end
 49  46 |
 50  47 |   # Equivalent to !allowed?(url).

@@ -56,8 +53,7 @@ Disallow: /
 56  53 |   # with each field name lower-cased. See allowed?() for a list of
 57  54 |   # errors that may be raised.
 58  55 |   def options(url)
 59     | -
 60     | -    robots_txt(site).options
     56 | +    robots_txt_for(url).options
 61  57 |   end
 62  58 |
 63  59 |   # Equivalent to option(url)[token.downcase].

@@ -68,8 +64,25 @@ Disallow: /
 68  64 |   # Returns an array of Sitemap URLs. See allowed?() for a list of
 69  65 |   # errors that may be raised.
 70  66 |   def sitemaps(url)
     67 | +    robots_txt_for(url).sitemaps
     68 | +  end
     69 | +
     70 | +  # Returns an error object if there is an error in fetching or
     71 | +  # parsing robots.txt of the site +url+.
     72 | +  def error(url)
     73 | +    robots_txt_for(url).error
     74 | +  end
     75 | +
     76 | +  # Raises the error if there was an error in fetching or parsing
     77 | +  # robots.txt of the site +url+.
     78 | +  def error!(url)
     79 | +    robots_txt_for(url).error!
     80 | +  end
     81 | +
     82 | +  # Removes robots.txt cache for the site +url+.
     83 | +  def reset(url)
 71  84 |     site, = split_uri(url)
 72     | -
     85 | +    @robotstxt.delete(site)
 73  86 |   end
 74  87 |
 75  88 |   private

@@ -100,31 +113,27 @@ Disallow: /
100 113 |     return site, request_uri
101 114 |   end
102 115 |
103     | -  def
104     | -
105     | -
106     | -    } or @@disallower
    116 | +  def evaluate(url)
    117 | +    site, request_uri = split_uri(url)
    118 | +    return get_robots_txt(site), request_uri
107 119 |   end
108 120 |
109     | -  def
110     | -
111     | -
112     | -
113     | -
114     | -
115     | -
116     | -    else
117     | -      nil
118     | -    end
119     | -  end and @parser.parse(body, site)
    121 | +  def robots_txt_for(url)
    122 | +    site, = split_uri(url)
    123 | +    get_robots_txt(site)
    124 | +  end
    125 | +
    126 | +  def get_robots_txt(site)
    127 | +    @robotstxt[site] ||= fetch_robots_txt(site)
120 128 |   end
121 129 |
122     | -  def
123     | -
124     | -    @
125     | -
126     | -
    130 | +  def fetch_robots_txt(site)
    131 | +    begin
    132 | +      body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
    133 | +    rescue => e
    134 | +      return RobotsTxt.unfetchable(site, e, @user_agent)
127 135 |     end
    136 | +    @parser.parse!(body, site)
128 137 |   end
129 138 |
130 139 |   def http_get(uri)

@@ -143,6 +152,8 @@ Disallow: /
143 152 |     when Net::HTTPRedirection
144 153 |       referer = uri.to_s
145 154 |       uri = URI(response['location'])
    155 | +    when Net::HTTPNotFound
    156 | +      return ''
146 157 |     else
147 158 |       response.value
148 159 |     end
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -27,8 +27,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
 27  27 |       @target = target
 28  28 |     end
 29  29 |
 30     | -    def
 31     | -
     30 | +    def parse!(input, site)
     31 | +      parse(input, site)
     32 | +    rescue Error => e
     33 | +      RobotsTxt.new(site, nil, :error => e, :target => @target)
 32  34 |     end
 33  35 |
 34  36 |     KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]

@@ -519,11 +521,12 @@ end
519 521 |   end # class Parser
520 522 |
521 523 |   def initialize(site, records, options = nil)
522     | -
    524 | +    @timestamp = Time.now
523 525 |     @site = site
524 526 |     @options = options || {}
525 527 |     @last_checked = nil
526 528 |
    529 | +    @error = @options[:error]
527 530 |     @target = @options[:target]
528 531 |     @sitemaps = @options[:sitemaps] || []
529 532 |

@@ -542,7 +545,12 @@ end # class Parser
542 545 |     end
543 546 |   end
544 547 |
545     | -  attr_reader :site, :sitemaps
    548 | +  attr_reader :timestamp, :site, :sitemaps
    549 | +  attr_accessor :error
    550 | +
    551 | +  def error!
    552 | +    raise @error if @error
    553 | +  end
546 554 |
547 555 |   def target(user_agent = nil)
548 556 |     if user_agent

@@ -579,6 +587,17 @@ end # class Parser
579 587 |     record.options
580 588 |   end
581 589 |
    590 | +  DISALLOW_ALL = <<-TXT
    591 | +User-Agent: *
    592 | +Disallow: /
    593 | +  TXT
    594 | +
    595 | +  def self.unfetchable(site, reason, target = nil)
    596 | +    Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
    597 | +      robots_txt.error = reason
    598 | +    }
    599 | +  end
    600 | +
582 601 |   class Record
583 602 |     def initialize(agentlines, rulelines)
584 603 |       @patterns = agentlines.map { |agentline| agentline.pattern }
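
These RobotsTxt changes are what back the new WebRobots#error, #error! and #reset methods: Parser#parse! catches parse errors and returns a RobotsTxt carrying the exception, while RobotsTxt.unfetchable builds a disallow-all ruleset for fetch failures. A hedged usage sketch follows; the bot name and URL are invented, and the comments describe the behaviour as it appears in this diff and its tests.

  require 'webrobots'

  robots = WebRobots.new('MyBot/1.0')
  url = 'http://www.example.org/somewhere/page.html'

  allowed = robots.allowed?(url)
  puts "#{url}: #{allowed ? 'allowed' : 'disallowed'}"

  if (e = robots.error(url))
    # A fetch failure is cached as a disallow-all robots.txt (via
    # RobotsTxt.unfetchable); a parse failure is cached with no rules.
    # In both cases the underlying exception is kept for inspection.
    warn "robots.txt problem for #{url}: #{e.class}: #{e.message}"
    robots.reset(url)  # drop the cached entry so the next call re-fetches
  end

  # error! raises the stored exception instead of returning it:
  #   robots.error!(url)  # e.g. raises WebRobots::ParseError
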
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -167,8 +167,10 @@ class WebRobots
167 167 |       @target = target
168 168 |     end
169 169 |
170     | -    def
171     | -
    170 | +    def parse!(input, site)
    171 | +      parse(input, site)
    172 | +    rescue Error => e
    173 | +      RobotsTxt.new(site, nil, :error => e, :target => @target)
172 174 |     end
173 175 |
174 176 |     KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]

@@ -249,11 +251,12 @@ class WebRobots
249 251 |
250 252 | ---- footer
251 253 |   def initialize(site, records, options = nil)
252     | -
    254 | +    @timestamp = Time.now
253 255 |     @site = site
254 256 |     @options = options || {}
255 257 |     @last_checked = nil
256 258 |
    259 | +    @error = @options[:error]
257 260 |     @target = @options[:target]
258 261 |     @sitemaps = @options[:sitemaps] || []
259 262 |

@@ -272,7 +275,12 @@ class WebRobots
272 275 |     end
273 276 |   end
274 277 |
275     | -  attr_reader :site, :sitemaps
    278 | +  attr_reader :timestamp, :site, :sitemaps
    279 | +  attr_accessor :error
    280 | +
    281 | +  def error!
    282 | +    raise @error if @error
    283 | +  end
276 284 |
277 285 |   def target(user_agent = nil)
278 286 |     if user_agent

@@ -309,6 +317,17 @@ class WebRobots
309 317 |     record.options
310 318 |   end
311 319 |
    320 | +  DISALLOW_ALL = <<-TXT
    321 | +User-Agent: *
    322 | +Disallow: /
    323 | +  TXT
    324 | +
    325 | +  def self.unfetchable(site, reason, target = nil)
    326 | +    Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
    327 | +      robots_txt.error = reason
    328 | +    }
    329 | +  end
    330 | +
312 331 |   class Record
313 332 |     def initialize(agentlines, rulelines)
314 333 |       @patterns = agentlines.map { |agentline| agentline.pattern }
data/test/test_webrobots.rb
CHANGED
@@ -24,10 +24,6 @@ class TestWebRobots < Test::Unit::TestCase
 24  24 | #comment
 25  25 |
 26  26 |           TXT
 27     | -        when 'http://site5.example.org/robots.txt'
 28     | -          raise Net::HTTPServerException.new(
 29     | -            'Not Found',
 30     | -            Net::HTTPNotFound.new('1.1', '404', 'Not Found'))
 31  27 |         else
 32  28 |           raise "#{uri} is not supposed to be fetched"
 33  29 |         end

@@ -43,8 +39,6 @@ class TestWebRobots < Test::Unit::TestCase
 43  39 |       assert @robots.allowed?('http://site3.example.org/private/secret.txt')
 44  40 |       assert @robots.allowed?('http://site4.example.org/index.html')
 45  41 |       assert @robots.allowed?('http://site4.example.org/private/secret.txt')
 46     | -      assert @robots.allowed?('http://site5.example.org/index.html')
 47     | -      assert @robots.allowed?('http://site5.example.org/private/secret.txt')
 48  42 |     end
 49  43 |   end
 50  44 |

@@ -64,6 +58,8 @@ class TestWebRobots < Test::Unit::TestCase
 64  58 |           raise Errno::ECONNREFUSED
 65  59 |         when 'http://site4.example.org/robots.txt'
 66  60 |           raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
     61 | +        when 'http://site5.example.org/robots.txt'
     62 | +          nil
 67  63 |         else
 68  64 |           raise "#{uri} is not supposed to be fetched"
 69  65 |         end

@@ -79,6 +75,8 @@ class TestWebRobots < Test::Unit::TestCase
 79  75 |       assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
 80  76 |       assert @robots.disallowed?('http://site4.example.org/index.html')
 81  77 |       assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
     78 | +      assert @robots.disallowed?('http://site5.example.org/index.html')
     79 | +      assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
 82  80 |     end
 83  81 |   end
 84  82 |

@@ -176,8 +174,12 @@ Disallow: /~joe/index.html
176 174 |     should "properly restrict access" do
177 175 |       assert @robots_good.allowed?('http://www.example.org/index.html')
178 176 |       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
    177 | +      assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
    178 | +      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
179 179 |       assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
    180 | +      assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
180 181 |       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
    182 | +      assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')
181 183 |
182 184 |       assert !@robots_evil.allowed?('http://www.example.org/index.html')
183 185 |       assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')

@@ -234,38 +236,73 @@ Disallow: /~joe/index.html
234 236 |
235 237 |   context "robots.txt with errors" do
236 238 |     setup do
    239 | +      @turn1 = @turn2 = 0
237 240 |       @http_get = lambda { |uri|
238 241 |         case uri.to_s
239 242 |         when 'http://www.example.org/robots.txt'
240     | -
    243 | +          if (@turn1 += 1) % 2 == 1
    244 | +            <<-'TXT'
241 245 | # some comment
242     | -User-Agent:
    246 | +User-Agent: thebot
    247 | +# Disallow: /
    248 | +Disallow: /2heavy/
    249 | +# Allow: /2heavy/notsoheavy
    250 | +Allow: /2heavy/*.html
    251 | +
    252 | +User-Agent: anotherbot
    253 | +# Disallow: /
    254 | +Disallow: /2heavy/
    255 | +# Allow: /2heavy/notsoheavy
    256 | +Allow: /2heavy/*.html
    257 | +            TXT
    258 | +          else
    259 | +            <<-'TXT'
    260 | +# some comment
    261 | +User-Agent: thebot
243 262 | # Disallow: /
244 263 | Disallow: /2heavy/
245 264 | # Allow: /2heavy/notsoheavy
246 265 | Allow: /2heavy/*.html
247 266 | #
248     | -User-Agent:
    267 | +User-Agent: anotherbot
249 268 | # Disallow: /
250 269 | Disallow: /2heavy/
251 270 | # Allow: /2heavy/notsoheavy
252 271 | Allow: /2heavy/*.html
253     | -
    272 | +            TXT
    273 | +          end
254 274 |         when 'http://www.example.com/robots.txt'
255     | -
    275 | +          if (@turn2 += 1) % 2 == 1
    276 | +            <<-'TXT'
256 277 | # some comment
257     | -#User-Agent:
    278 | +#User-Agent: thebot
258 279 | # Disallow: /
259 280 | Disallow: /2heavy/
260 281 | # Allow: /2heavy/notsoheavy
261 282 | Allow: /2heavy/*.html
262 283 |
263     | -User-Agent:
    284 | +User-Agent: anotherbot
264 285 | # Disallow: /
265 286 | Disallow: /2heavy/
266 287 | # Allow: /2heavy/notsoheavy
267 288 | Allow: /2heavy/*.html
268     | -
    289 | +            TXT
    290 | +          else
    291 | +            <<-'TXT'
    292 | +# some comment
    293 | +User-Agent: thebot
    294 | +# Disallow: /
    295 | +Disallow: /2heavy/
    296 | +# Allow: /2heavy/notsoheavy
    297 | +Allow: /2heavy/*.html
    298 | +
    299 | +User-Agent: anotherbot
    300 | +# Disallow: /
    301 | +Disallow: /2heavy/
    302 | +# Allow: /2heavy/notsoheavy
    303 | +Allow: /2heavy/*.html
    304 | +            TXT
    305 | +          end
269 306 |         else
270 307 |           raise "#{uri} is not supposed to be fetched"
271 308 |         end

@@ -273,12 +310,54 @@ Allow: /2heavy/*.html
273 310 |     end
274 311 |
275 312 |     should "raise ParseError" do
276     | -      robots = WebRobots.new('
    313 | +      robots = WebRobots.new('TheBot', :http_get => @http_get)
    314 | +
    315 | +      url = 'http://www.example.org/2heavy/index.php'
    316 | +
    317 | +      assert_nil robots.error(url)
    318 | +      assert !robots.allowed?(url)
    319 | +      assert_nothing_raised {
    320 | +        robots.error!(url)
    321 | +      }
    322 | +
    323 | +      robots.reset(url)
    324 | +
    325 | +      assert robots.allowed?(url)
    326 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
277 327 |       assert_raise(WebRobots::ParseError) {
278     | -        robots.
    328 | +        robots.error!(url)
279 329 |       }
    330 | +
    331 | +      robots.reset(url)
    332 | +
    333 | +      assert_nil robots.error(url)
    334 | +      assert !robots.allowed?(url)
    335 | +      assert_nothing_raised {
    336 | +        robots.error!(url)
    337 | +      }
    338 | +
    339 | +      url = 'http://www.example.com/2heavy/index.php'
    340 | +
    341 | +      assert robots.allowed?(url)
    342 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
    343 | +      assert_raise(WebRobots::ParseError) {
    344 | +        robots.error!(url)
    345 | +      }
    346 | +
    347 | +      robots.reset(url)
    348 | +
    349 | +      assert_nil robots.error(url)
    350 | +      assert !robots.allowed?(url)
    351 | +      assert_nothing_raised {
    352 | +        robots.error!(url)
    353 | +      }
    354 | +
    355 | +      robots.reset(url)
    356 | +
    357 | +      assert robots.allowed?(url)
    358 | +      assert_instance_of WebRobots::ParseError, robots.error(url)
280 359 |       assert_raise(WebRobots::ParseError) {
281     | -        robots.
    360 | +        robots.error!(url)
282 361 |       }
283 362 |     end
284 363 |   end
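
A smaller hook added in lib/webrobots.rb is create_cache, whose comment only requires an object responding to [], []= and delete; #reset, exercised in the tests above, deletes from it. It is marked :nodoc:, so treating it as a stable override point is an assumption. With that caveat, a subclass could plug in its own store, as in this hypothetical sketch (the class names and size limit are made up):

  require 'webrobots'

  class BoundedWebRobots < WebRobots
    MAX_SITES = 100  # arbitrary illustration value

    # Only [], []= and delete are needed by WebRobots, per the diff.
    class BoundedCache < Hash
      def []=(site, robots_txt)
        clear if size >= MAX_SITES  # crude eviction: start over when full
        super
      end
    end

    def create_cache
      BoundedCache.new
    end
  end

  robots = BoundedWebRobots.new('MyBot/1.0')
  robots.allowed?('http://www.example.org/')
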
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
  5   5 |
  6   6 | Gem::Specification.new do |s|
  7   7 |   s.name = %q{webrobots}
  8     | -  s.version = "0.0.5"
      8 | +  s.version = "0.0.6"
  9   9 |
 10  10 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
 11  11 |   s.authors = ["Akinori MUSHA"]
 12     | -  s.date = %q{2011-01-
     12 | +  s.date = %q{2011-01-09}
 13  13 |   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 14  14 | }
 15  15 |   s.email = %q{knu@idaemons.org}
metadata
CHANGED
@@ -1,13 +1,13 @@
  1   1 | --- !ruby/object:Gem::Specification
  2   2 | name: webrobots
  3   3 | version: !ruby/object:Gem::Version
  4     | -  hash:
      4 | +  hash: 19
  5   5 |   prerelease:
  6   6 |   segments:
  7   7 |   - 0
  8   8 |   - 0
  9     | -  - 5
 10     | -  version: 0.0.5
      9 | +  - 6
     10 | +  version: 0.0.6
 11  11 | platform: ruby
 12  12 | authors:
 13  13 | - Akinori MUSHA

@@ -15,7 +15,7 @@ autorequire:
 15  15 | bindir: bin
 16  16 | cert_chain: []
 17  17 |
 18     | -date: 2011-01-
     18 | +date: 2011-01-09 00:00:00 +09:00
 19  19 | default_executable:
 20  20 | dependencies:
 21  21 | - !ruby/object:Gem::Dependency