webrobots 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +10 -2
- data/lib/webrobots/robotstxt.ry +10 -2
- data/test/test_webrobots.rb +77 -0
- data/webrobots.gemspec +2 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
46
46
|
|
47
47
|
until s.eos?
|
48
48
|
if t = s.scan(/[ \t]*\r?\n/)
|
49
|
+
if value_expected
|
50
|
+
@q << [:VALUE, '']
|
51
|
+
end
|
49
52
|
@q << [:EOL, t]
|
50
53
|
value_expected = false
|
51
54
|
elsif t = s.scan(/[ \t]+/)
|
@@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
|
54
57
|
@q << [t, t]
|
55
58
|
value_expected = true
|
56
59
|
elsif t = s.scan(/#.*/)
|
60
|
+
if value_expected
|
61
|
+
@q << [:VALUE, '']
|
62
|
+
end
|
57
63
|
@q << [:COMMENT, t]
|
58
64
|
else
|
59
65
|
if value_expected
|
@@ -668,7 +674,7 @@ end # class Parser
|
|
668
674
|
re_src << '\z'
|
669
675
|
break
|
670
676
|
else
|
671
|
-
|
677
|
+
re_src << Regexp.quote(s.scan(/./))
|
672
678
|
end
|
673
679
|
end
|
674
680
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -676,7 +682,9 @@ end # class Parser
|
|
676
682
|
end
|
677
683
|
|
678
684
|
def match?(request_uri)
|
679
|
-
|
685
|
+
return false if @empty
|
686
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
687
|
+
!!@pattern.match(transformed)
|
680
688
|
end
|
681
689
|
end
|
682
690
|
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -186,6 +186,9 @@ class WebRobots
|
|
186
186
|
|
187
187
|
until s.eos?
|
188
188
|
if t = s.scan(/[ \t]*\r?\n/)
|
189
|
+
if value_expected
|
190
|
+
@q << [:VALUE, '']
|
191
|
+
end
|
189
192
|
@q << [:EOL, t]
|
190
193
|
value_expected = false
|
191
194
|
elsif t = s.scan(/[ \t]+/)
|
@@ -194,6 +197,9 @@ class WebRobots
|
|
194
197
|
@q << [t, t]
|
195
198
|
value_expected = true
|
196
199
|
elsif t = s.scan(/#.*/)
|
200
|
+
if value_expected
|
201
|
+
@q << [:VALUE, '']
|
202
|
+
end
|
197
203
|
@q << [:COMMENT, t]
|
198
204
|
else
|
199
205
|
if value_expected
|
@@ -398,7 +404,7 @@ class WebRobots
|
|
398
404
|
re_src << '\z'
|
399
405
|
break
|
400
406
|
else
|
401
|
-
|
407
|
+
re_src << Regexp.quote(s.scan(/./))
|
402
408
|
end
|
403
409
|
end
|
404
410
|
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
@@ -406,7 +412,9 @@ class WebRobots
|
|
406
412
|
end
|
407
413
|
|
408
414
|
def match?(request_uri)
|
409
|
-
|
415
|
+
return false if @empty
|
416
|
+
transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
|
417
|
+
!!@pattern.match(transformed)
|
410
418
|
end
|
411
419
|
end
|
412
420
|
|
data/test/test_webrobots.rb
CHANGED
@@ -85,6 +85,46 @@ Disallow: /2heavy/
|
|
85
85
|
Allow: /2heavy/*.htm
|
86
86
|
Disallow: /2heavy/*.htm$
|
87
87
|
TXT
|
88
|
+
when 'http://koster1.example.net/robots.txt'
|
89
|
+
<<-'TXT'
|
90
|
+
User-Agent: *
|
91
|
+
Disallow: /tmp
|
92
|
+
TXT
|
93
|
+
when 'http://koster2.example.net/robots.txt'
|
94
|
+
<<-'TXT'
|
95
|
+
User-Agent: *
|
96
|
+
Disallow: /tmp/
|
97
|
+
TXT
|
98
|
+
when 'http://koster3.example.net/robots.txt'
|
99
|
+
<<-'TXT'
|
100
|
+
User-Agent: *
|
101
|
+
Disallow: /a%3cd.html
|
102
|
+
TXT
|
103
|
+
when 'http://koster4.example.net/robots.txt'
|
104
|
+
<<-'TXT'
|
105
|
+
User-Agent: *
|
106
|
+
Disallow: /a%3Cd.html
|
107
|
+
TXT
|
108
|
+
when 'http://koster5.example.net/robots.txt'
|
109
|
+
<<-'TXT'
|
110
|
+
User-Agent: *
|
111
|
+
Disallow: /a%2fb.html
|
112
|
+
TXT
|
113
|
+
when 'http://koster6.example.net/robots.txt'
|
114
|
+
<<-'TXT'
|
115
|
+
User-Agent: *
|
116
|
+
Disallow: /a/b.html
|
117
|
+
TXT
|
118
|
+
when 'http://koster7.example.net/robots.txt'
|
119
|
+
<<-'TXT'
|
120
|
+
User-Agent: *
|
121
|
+
Disallow: /%7ejoe/index.html
|
122
|
+
TXT
|
123
|
+
when 'http://koster8.example.net/robots.txt'
|
124
|
+
<<-'TXT'
|
125
|
+
User-Agent: *
|
126
|
+
Disallow: /~joe/index.html
|
127
|
+
TXT
|
88
128
|
else
|
89
129
|
raise "#{uri} is not supposed to be fetched"
|
90
130
|
end
|
@@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$
|
|
126
166
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
|
127
167
|
assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
|
128
168
|
end
|
169
|
+
|
170
|
+
should "follow what is said in Koster's draft" do
|
171
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp')
|
172
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp.html')
|
173
|
+
assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')
|
174
|
+
|
175
|
+
assert !@robots.disallowed?('http://koster2.example.net/tmp')
|
176
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/')
|
177
|
+
assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')
|
178
|
+
|
179
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
|
180
|
+
assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
|
181
|
+
|
182
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
|
183
|
+
assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
|
184
|
+
|
185
|
+
assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
|
186
|
+
assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
|
187
|
+
|
188
|
+
assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
|
189
|
+
assert @robots.disallowed?('http://koster6.example.net/a/b.html')
|
190
|
+
|
191
|
+
assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')
|
192
|
+
|
193
|
+
assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
|
194
|
+
end
|
129
195
|
end
|
130
196
|
|
131
197
|
context "robots.txt with errors" do
|
@@ -193,12 +259,14 @@ Disallow: /2heavy/
|
|
193
259
|
Allow: /2heavy/*.html
|
194
260
|
Option1: Foo
|
195
261
|
Option2: Hello
|
262
|
+
Crawl-Delay: 1.5
|
196
263
|
|
197
264
|
User-Agent: *
|
198
265
|
Disallow: /2heavy/
|
199
266
|
Allow: /2heavy/*.html
|
200
267
|
Option1: Bar
|
201
268
|
Option3: Hi
|
269
|
+
Crawl-Delay:
|
202
270
|
TXT
|
203
271
|
else
|
204
272
|
raise "#{uri} is not supposed to be fetched"
|
@@ -232,6 +300,15 @@ Option3: Hi
|
|
232
300
|
http://www.example.org/sitemap-host1.xml
|
233
301
|
http://www.example.org/sitemap-host2.xml
|
234
302
|
], @robots_hisbot.sitemaps('http://www.example.org/')
|
303
|
+
|
304
|
+
t1 = Time.now
|
305
|
+
@robots_mybot.allowed?('http://www.example.org/')
|
306
|
+
@robots_mybot.allowed?('http://www.example.org/article1.html')
|
307
|
+
t2 = Time.now
|
308
|
+
assert_in_delta 1.5, t2 - t1, 0.1
|
309
|
+
@robots_mybot.allowed?('http://www.example.org/article2.html')
|
310
|
+
t3 = Time.now
|
311
|
+
assert_in_delta 1.5, t3 - t2, 0.1
|
235
312
|
end
|
236
313
|
end
|
237
314
|
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-08}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-08 00:00:00 +09:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|