webrobots 0.0.1
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/lib/webrobots.rb +135 -0
- data/lib/webrobots/robotstxt.rb +714 -0
- data/lib/webrobots/robotstxt.ry +444 -0
- data/test/helper.rb +18 -0
- data/test/test_webrobots.rb +291 -0
- metadata +155 -0
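For orientation before the diff itself: the test suite below exercises the library's whole public surface. A minimal usage sketch follows; it assumes the gem is installed, and that without an :http_get override the target site's robots.txt is fetched live over HTTP, so treat it as illustrative rather than as documentation of the parser internals.

  require 'webrobots'

  # Name your crawler. The tests suggest robots.txt User-Agent tokens are
  # matched case-insensitively against the bot name ('GoodBot' matches a
  # "User-Agent: good" record, 'TestMSNBot' matches "msnbot").
  robots = WebRobots.new('MyBot')
  # A :http_get => lambda { |uri| ... } option can be passed to stub or
  # customize fetching, exactly as the tests do.

  url = 'http://www.example.org/index.html'
  robots.allowed?(url)            # => true/false; expects an absolute HTTP URL
                                  #    (ArgumentError otherwise) and may raise
                                  #    WebRobots::ParseError on a broken robots.txt
  robots.option(url, 'Option1')   # => value of a non-standard field for this bot, if any
  robots.options(url)             # => hash of those fields, keys downcased
  robots.sitemaps(url)            # => array of Sitemap URLs declared site-wide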
data/test/test_webrobots.rb
ADDED
@@ -0,0 +1,291 @@
+require 'helper'
+
+class TestWebRobots < Test::Unit::TestCase
+  context "robots.txt with no rules" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          case uri.to_s
+          when 'http://site1.example.org/robots.txt'
+            <<-'TXT'
+            TXT
+          when 'http://site2.example.org/robots.txt'
+            <<-'TXT'
+
+
+            TXT
+          when 'http://site3.example.org/robots.txt'
+            <<-'TXT'
+
+#comment
+            TXT
+          when 'http://site4.example.org/robots.txt'
+            <<-'TXT'
+
+#comment
+
+            TXT
+          when 'http://site5.example.org/robots.txt'
+            raise Net::HTTPNotFound
+          else
+            raise "#{uri} is not supposed to be fetched"
+          end
+        })
+    end
+
+    should "allow any robot" do
+      assert @robots.allowed?('http://site1.example.org/index.html')
+      assert @robots.allowed?('http://site1.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site2.example.org/index.html')
+      assert @robots.allowed?('http://site2.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site3.example.org/index.html')
+      assert @robots.allowed?('http://site3.example.org/private/secret.txt')
+      assert @robots.allowed?('http://site4.example.org/index.html')
+      assert @robots.allowed?('http://site4.example.org/private/secret.txt')
+    end
+  end
+
+  context "robots.txt with some rules" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# Default rule is evaluated last even if it is put first.
+User-Agent: *
+Disallow: /2heavy/
+Disallow: /index.html
+# Allow takes precedence over Disallow if the pattern lengths are the same.
+Allow: /index.html
+
+# Punish evil bots
+User-Agent: evil
+Disallow: /
+
+User-Agent: good
+# Be generous to good bots
+Disallow: /2heavy/
+Allow: /2heavy/*.htm
+Disallow: /2heavy/*.htm$
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+      @robots_good = WebRobots.new('GoodBot', :http_get => http_get)
+      @robots_evil = WebRobots.new('EvilBot', :http_get => http_get)
+    end
+
+    should "properly restrict access" do
+      assert @robots_good.allowed?('http://www.example.org/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.org/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.org/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.org/2heavy/index.htm')
+
+      assert @robots_good.allowed?('http://www.example.com/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php')
+      assert @robots_good.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert !@robots_evil.allowed?('http://www.example.com/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm')
+
+      assert @robots.allowed?('http://www.example.com/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.php')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
+      assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
+    end
+  end
+
+  context "robots.txt with errors" do
+    setup do
+      @http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+# some comment
+User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+#
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        when 'http://www.example.com/robots.txt'
+          <<-'TXT'
+# some comment
+#User-Agent: first
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+
+User-Agent: next
+# Disallow: /
+Disallow: /2heavy/
+# Allow: /2heavy/notsoheavy
+Allow: /2heavy/*.html
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+    end
+
+    should "raise ParseError" do
+      robots = WebRobots.new('RandomBot', :http_get => @http_get)
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.org/2heavy/index.html')
+      }
+      assert_raise(WebRobots::ParseError) {
+        robots.allowed?('http://www.example.com/2heavy/index.html')
+      }
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+Sitemap: http://www.example.org/sitemap-host1.xml
+Sitemap: http://www.example.org/sitemap-host2.xml
+
+User-Agent: MyBot
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Foo
+Option2: Hello
+
+User-Agent: *
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Bar
+Option3: Hi
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
+      @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
+    end
+
+    should "read options" do
+      options = @robots_mybot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Foo', options['option1']
+      assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
+      assert_equal 'Hello', options['option2']
+
+      options = @robots_hisbot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Bar', options['option1']
+      assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3')
+      assert_equal 'Hi', options['option3']
+
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_mybot.sitemaps('http://www.example.org/')
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_hisbot.sitemaps('http://www.example.org/')
+    end
+  end
+
+  context "robots.txt with options" do
+    setup do
+      http_get = lambda { |uri|
+        case uri.to_s
+        when 'http://www.example.org/robots.txt'
+          <<-'TXT'
+User-Agent: *
+Disallow: /
+          TXT
+        else
+          raise "#{uri} is not supposed to be fetched"
+        end
+      }
+
+      @robots = WebRobots.new('RandomBot', :http_get => http_get)
+    end
+
+    should "validate URI" do
+      assert_raise(ArgumentError) {
+        @robots.allowed?('www.example.org/')
+      }
+      assert_raise(ArgumentError) {
+        @robots.allowed?('::/home/knu')
+      }
+    end
+  end
+
+  context "robots.txt in the real world" do
+    setup do
+      @testbot = WebRobots.new('TestBot')
+      @msnbot = WebRobots.new('TestMSNBot') # matches msnbot
+    end
+
+    should "be parsed for major sites" do
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://www.google.com/search")
+        assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
+        assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
+      }
+      assert_nothing_raised {
+        assert @testbot.allowed?("http://www.yahoo.com/")
+        assert !@testbot.allowed?("http://www.yahoo.com/?")
+        assert !@testbot.allowed?("http://www.yahoo.com/p/foo")
+      }
+      assert_nothing_raised {
+        assert !@testbot.allowed?("http://store.apple.com/vieworder")
+        assert @msnbot.allowed?("http://store.apple.com/vieworder")
+      }
+      # assert_nothing_raised {
+        assert !@testbot.allowed?("http://github.com/login")
+      # }
+    end
+  end
+end
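Read together with its assertions, the www.example.org fixture in "robots.txt with some rules" spells out the precedence semantics the parser is expected to implement. The sketch below restates those assertions for convenience; the explanatory comments are an interpretation of the fixture, and http_get stands for the stub defined in that setup block.

  good   = WebRobots.new('GoodBot',   :http_get => http_get)  # matches the "good" record
  random = WebRobots.new('RandomBot', :http_get => http_get)  # falls through to "User-Agent: *"

  good.allowed?('http://www.example.org/2heavy/index.php')    # => false  Disallow: /2heavy/
  good.allowed?('http://www.example.org/2heavy/index.html')   # => true   the longer Allow: /2heavy/*.htm overrides it
  good.allowed?('http://www.example.org/2heavy/index.htm')    # => false  Disallow: /2heavy/*.htm$ is more specific still
  random.allowed?('http://www.example.org/index.html')        # => true   Allow and Disallow: /index.html tie in length; Allow wins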
metadata
ADDED
@@ -0,0 +1,155 @@
+--- !ruby/object:Gem::Specification
+name: webrobots
+version: !ruby/object:Gem::Version
+  hash: 29
+  prerelease:
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Akinori MUSHA
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-01-03 00:00:00 +09:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  type: :runtime
+  version_requirements: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id001
+  prerelease: false
+  name: racc
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id002
+  prerelease: false
+  name: shoulda
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  requirement: *id003
+  prerelease: false
+  name: bundler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 1
+        segments:
+        - 1
+        - 5
+        - 1
+        version: 1.5.1
+  requirement: *id004
+  prerelease: false
+  name: jeweler
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id005
+  prerelease: false
+  name: rcov
+description: |
+  This library helps write robots.txt compliant web robots.
+
+email: knu@idaemons.org
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/webrobots.rb
+- lib/webrobots/robotstxt.rb
+- lib/webrobots/robotstxt.ry
+- test/helper.rb
+- test/test_webrobots.rb
+has_rdoc: true
+homepage:
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.4.1
+signing_key:
+specification_version: 3
+summary: A library to help write robots.txt compliant web robots
+test_files:
+- test/helper.rb
+- test/test_webrobots.rb
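For completeness, consuming this release from a project would look roughly like the lines below. The source URL and exact version pin are illustrative assumptions; the gemspec above declares racc as the only runtime dependency.

  # Gemfile
  source 'http://rubygems.org'

  gem 'webrobots', '0.0.1'   # pulls in racc, the sole runtime dependency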