webrobots 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +10 -2
- data/lib/webrobots/robotstxt.ry +10 -2
- data/test/test_webrobots.rb +77 -0
- data/webrobots.gemspec +2 -2
- metadata +4 -4
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            0.0.3 | 
| 1 | 
            +
            0.0.4
         | 
    
        data/lib/webrobots/robotstxt.rb
    CHANGED
    
    | @@ -46,6 +46,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164) | |
| 46 46 |  | 
| 47 47 | 
             
                    until s.eos?
         | 
| 48 48 | 
             
                      if t = s.scan(/[ \t]*\r?\n/)
         | 
| 49 | 
            +
                        if value_expected
         | 
| 50 | 
            +
                          @q << [:VALUE, '']
         | 
| 51 | 
            +
                        end
         | 
| 49 52 | 
             
                        @q << [:EOL, t]
         | 
| 50 53 | 
             
                        value_expected = false
         | 
| 51 54 | 
             
                      elsif t = s.scan(/[ \t]+/)
         | 
| @@ -54,6 +57,9 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164) | |
| 54 57 | 
             
                        @q << [t, t]
         | 
| 55 58 | 
             
                        value_expected = true
         | 
| 56 59 | 
             
                      elsif t = s.scan(/#.*/)
         | 
| 60 | 
            +
                        if value_expected
         | 
| 61 | 
            +
                          @q << [:VALUE, '']
         | 
| 62 | 
            +
                        end
         | 
| 57 63 | 
             
                        @q << [:COMMENT, t]
         | 
| 58 64 | 
             
                      else
         | 
| 59 65 | 
             
                        if value_expected
         | 
| @@ -668,7 +674,7 @@ end   # class Parser | |
| 668 674 | 
             
                        re_src << '\z'
         | 
| 669 675 | 
             
                        break
         | 
| 670 676 | 
             
                      else
         | 
| 671 | 
            -
                         | 
| 677 | 
            +
                        re_src << Regexp.quote(s.scan(/./))
         | 
| 672 678 | 
             
                      end
         | 
| 673 679 | 
             
                    end
         | 
| 674 680 | 
             
                    @pattern = Regexp.new(re_src, Regexp::MULTILINE)
         | 
| @@ -676,7 +682,9 @@ end   # class Parser | |
| 676 682 | 
             
                  end
         | 
| 677 683 |  | 
| 678 684 | 
             
                  def match?(request_uri)
         | 
| 679 | 
            -
                     | 
| 685 | 
            +
                    return false if @empty
         | 
| 686 | 
            +
                    transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
         | 
| 687 | 
            +
                    !!@pattern.match(transformed)
         | 
| 680 688 | 
             
                  end
         | 
| 681 689 | 
             
                end
         | 
| 682 690 |  | 
    
        data/lib/webrobots/robotstxt.ry
    CHANGED
    
    | @@ -186,6 +186,9 @@ class WebRobots | |
| 186 186 |  | 
| 187 187 | 
             
                    until s.eos?
         | 
| 188 188 | 
             
                      if t = s.scan(/[ \t]*\r?\n/)
         | 
| 189 | 
            +
                        if value_expected
         | 
| 190 | 
            +
                          @q << [:VALUE, '']
         | 
| 191 | 
            +
                        end
         | 
| 189 192 | 
             
                        @q << [:EOL, t]
         | 
| 190 193 | 
             
                        value_expected = false
         | 
| 191 194 | 
             
                      elsif t = s.scan(/[ \t]+/)
         | 
| @@ -194,6 +197,9 @@ class WebRobots | |
| 194 197 | 
             
                        @q << [t, t]
         | 
| 195 198 | 
             
                        value_expected = true
         | 
| 196 199 | 
             
                      elsif t = s.scan(/#.*/)
         | 
| 200 | 
            +
                        if value_expected
         | 
| 201 | 
            +
                          @q << [:VALUE, '']
         | 
| 202 | 
            +
                        end
         | 
| 197 203 | 
             
                        @q << [:COMMENT, t]
         | 
| 198 204 | 
             
                      else
         | 
| 199 205 | 
             
                        if value_expected
         | 
| @@ -398,7 +404,7 @@ class WebRobots | |
| 398 404 | 
             
                        re_src << '\z'
         | 
| 399 405 | 
             
                        break
         | 
| 400 406 | 
             
                      else
         | 
| 401 | 
            -
                         | 
| 407 | 
            +
                        re_src << Regexp.quote(s.scan(/./))
         | 
| 402 408 | 
             
                      end
         | 
| 403 409 | 
             
                    end
         | 
| 404 410 | 
             
                    @pattern = Regexp.new(re_src, Regexp::MULTILINE)
         | 
| @@ -406,7 +412,9 @@ class WebRobots | |
| 406 412 | 
             
                  end
         | 
| 407 413 |  | 
| 408 414 | 
             
                  def match?(request_uri)
         | 
| 409 | 
            -
                     | 
| 415 | 
            +
                    return false if @empty
         | 
| 416 | 
            +
                    transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
         | 
| 417 | 
            +
                    !!@pattern.match(transformed)
         | 
| 410 418 | 
             
                  end
         | 
| 411 419 | 
             
                end
         | 
| 412 420 |  | 
    
        data/test/test_webrobots.rb
    CHANGED
    
    | @@ -85,6 +85,46 @@ Disallow: /2heavy/ | |
| 85 85 | 
             
            Allow: /2heavy/*.htm
         | 
| 86 86 | 
             
            Disallow: /2heavy/*.htm$
         | 
| 87 87 | 
             
                      TXT
         | 
| 88 | 
            +
                    when 'http://koster1.example.net/robots.txt'
         | 
| 89 | 
            +
                      <<-'TXT'
         | 
| 90 | 
            +
            User-Agent: *
         | 
| 91 | 
            +
            Disallow: /tmp
         | 
| 92 | 
            +
                      TXT
         | 
| 93 | 
            +
                    when 'http://koster2.example.net/robots.txt'
         | 
| 94 | 
            +
                      <<-'TXT'
         | 
| 95 | 
            +
            User-Agent: *
         | 
| 96 | 
            +
            Disallow: /tmp/
         | 
| 97 | 
            +
                      TXT
         | 
| 98 | 
            +
                    when 'http://koster3.example.net/robots.txt'
         | 
| 99 | 
            +
                      <<-'TXT'
         | 
| 100 | 
            +
            User-Agent: *
         | 
| 101 | 
            +
            Disallow: /a%3cd.html
         | 
| 102 | 
            +
                      TXT
         | 
| 103 | 
            +
                    when 'http://koster4.example.net/robots.txt'
         | 
| 104 | 
            +
                      <<-'TXT'
         | 
| 105 | 
            +
            User-Agent: *
         | 
| 106 | 
            +
            Disallow: /a%3Cd.html
         | 
| 107 | 
            +
                      TXT
         | 
| 108 | 
            +
                    when 'http://koster5.example.net/robots.txt'
         | 
| 109 | 
            +
                      <<-'TXT'
         | 
| 110 | 
            +
            User-Agent: *
         | 
| 111 | 
            +
            Disallow: /a%2fb.html
         | 
| 112 | 
            +
                      TXT
         | 
| 113 | 
            +
                    when 'http://koster6.example.net/robots.txt'
         | 
| 114 | 
            +
                      <<-'TXT'
         | 
| 115 | 
            +
            User-Agent: *
         | 
| 116 | 
            +
            Disallow: /a/b.html
         | 
| 117 | 
            +
                      TXT
         | 
| 118 | 
            +
                    when 'http://koster7.example.net/robots.txt'
         | 
| 119 | 
            +
                      <<-'TXT'
         | 
| 120 | 
            +
            User-Agent: *
         | 
| 121 | 
            +
            Disallow: /%7ejoe/index.html
         | 
| 122 | 
            +
                      TXT
         | 
| 123 | 
            +
                    when 'http://koster8.example.net/robots.txt'
         | 
| 124 | 
            +
                      <<-'TXT'
         | 
| 125 | 
            +
            User-Agent: *
         | 
| 126 | 
            +
            Disallow: /~joe/index.html
         | 
| 127 | 
            +
                      TXT
         | 
| 88 128 | 
             
                    else
         | 
| 89 129 | 
             
                      raise "#{uri} is not supposed to be fetched"
         | 
| 90 130 | 
             
                    end
         | 
| @@ -126,6 +166,32 @@ Disallow: /2heavy/*.htm$ | |
| 126 166 | 
             
                  assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
         | 
| 127 167 | 
             
                  assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
         | 
| 128 168 | 
             
                end
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                should "follow what is said in Koster's draft" do
         | 
| 171 | 
            +
                  assert  @robots.disallowed?('http://koster1.example.net/tmp')
         | 
| 172 | 
            +
                  assert  @robots.disallowed?('http://koster1.example.net/tmp.html')
         | 
| 173 | 
            +
                  assert  @robots.disallowed?('http://koster1.example.net/tmp/a.html')
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                  assert !@robots.disallowed?('http://koster2.example.net/tmp')
         | 
| 176 | 
            +
                  assert  @robots.disallowed?('http://koster2.example.net/tmp/')
         | 
| 177 | 
            +
                  assert  @robots.disallowed?('http://koster2.example.net/tmp/a.html')
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                  assert  @robots.disallowed?('http://koster3.example.net/a%3cd.html')
         | 
| 180 | 
            +
                  assert  @robots.disallowed?('http://koster3.example.net/a%3Cd.html')
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                  assert  @robots.disallowed?('http://koster4.example.net/a%3cd.html')
         | 
| 183 | 
            +
                  assert  @robots.disallowed?('http://koster4.example.net/a%3Cd.html')
         | 
| 184 | 
            +
             | 
| 185 | 
            +
                  assert  @robots.disallowed?('http://koster5.example.net/a%2fb.html')
         | 
| 186 | 
            +
                  assert !@robots.disallowed?('http://koster5.example.net/a/b.html')
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                  assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
         | 
| 189 | 
            +
                  assert  @robots.disallowed?('http://koster6.example.net/a/b.html')
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                  assert  @robots.disallowed?('http://koster7.example.net/~joe/index.html')
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                  assert  @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
         | 
| 194 | 
            +
                end
         | 
| 129 195 | 
             
              end
         | 
| 130 196 |  | 
| 131 197 | 
             
              context "robots.txt with errors" do
         | 
| @@ -193,12 +259,14 @@ Disallow: /2heavy/ | |
| 193 259 | 
             
            Allow: /2heavy/*.html
         | 
| 194 260 | 
             
            Option1: Foo
         | 
| 195 261 | 
             
            Option2: Hello
         | 
| 262 | 
            +
            Crawl-Delay: 1.5
         | 
| 196 263 |  | 
| 197 264 | 
             
            User-Agent: *
         | 
| 198 265 | 
             
            Disallow: /2heavy/
         | 
| 199 266 | 
             
            Allow: /2heavy/*.html
         | 
| 200 267 | 
             
            Option1: Bar
         | 
| 201 268 | 
             
            Option3: Hi
         | 
| 269 | 
            +
            Crawl-Delay:
         | 
| 202 270 | 
             
                      TXT
         | 
| 203 271 | 
             
                    else
         | 
| 204 272 | 
             
                      raise "#{uri} is not supposed to be fetched"
         | 
| @@ -232,6 +300,15 @@ Option3: Hi | |
| 232 300 | 
             
                    http://www.example.org/sitemap-host1.xml
         | 
| 233 301 | 
             
                    http://www.example.org/sitemap-host2.xml
         | 
| 234 302 | 
             
                  ], @robots_hisbot.sitemaps('http://www.example.org/')
         | 
| 303 | 
            +
             | 
| 304 | 
            +
                  t1 = Time.now
         | 
| 305 | 
            +
                  @robots_mybot.allowed?('http://www.example.org/')
         | 
| 306 | 
            +
                  @robots_mybot.allowed?('http://www.example.org/article1.html')
         | 
| 307 | 
            +
                  t2 = Time.now
         | 
| 308 | 
            +
                  assert_in_delta 1.5, t2 - t1, 0.1
         | 
| 309 | 
            +
                  @robots_mybot.allowed?('http://www.example.org/article2.html')
         | 
| 310 | 
            +
                  t3 = Time.now
         | 
| 311 | 
            +
                  assert_in_delta 1.5, t3 - t2, 0.1
         | 
| 235 312 | 
             
                end
         | 
| 236 313 | 
             
              end
         | 
| 237 314 |  | 
    
        data/webrobots.gemspec
    CHANGED
    
    | @@ -5,11 +5,11 @@ | |
| 5 5 |  | 
| 6 6 | 
             
            Gem::Specification.new do |s|
         | 
| 7 7 | 
             
              s.name = %q{webrobots}
         | 
| 8 | 
            -
  s.version = "0.0.3" | 
| 8 | 
            +
              s.version = "0.0.4"
         | 
| 9 9 |  | 
| 10 10 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 11 11 | 
             
              s.authors = ["Akinori MUSHA"]
         | 
| 12 | 
            -
              s.date = %q{2011-01- | 
| 12 | 
            +
              s.date = %q{2011-01-08}
         | 
| 13 13 | 
             
              s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
         | 
| 14 14 | 
             
            }
         | 
| 15 15 | 
             
              s.email = %q{knu@idaemons.org}
         | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: webrobots
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 23
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 8 | 
             
              - 0
         | 
| 9 | 
            -
  - 3 | 
| 10 | 
            -
  version: 0.0.3 | 
| 9 | 
            +
              - 4
         | 
| 10 | 
            +
              version: 0.0.4
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Akinori MUSHA
         | 
| @@ -15,7 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date: 2011-01- | 
| 18 | 
            +
            date: 2011-01-08 00:00:00 +09:00
         | 
| 19 19 | 
             
            default_executable: 
         | 
| 20 20 | 
             
            dependencies: 
         | 
| 21 21 | 
             
            - !ruby/object:Gem::Dependency 
         |