fizx-robots 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +33 -0
 - data/lib/robots.rb +113 -0
 - metadata +55 -0
 
    
        data/README
    ADDED
    
    | 
         @@ -0,0 +1,33 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            A simple Ruby library to parse robots.txt.
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Usage:
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            	robots = Robots.new "Some User Agent"
         
     | 
| 
      
 6 
     | 
    
         
            +
            	assert robots.allowed?("http://www.yelp.com/foo")
         
     | 
| 
      
 7 
     | 
    
         
            +
            	assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
         
     | 
| 
      
 8 
     | 
    
         
            +
            	robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
         
     | 
| 
      
 9 
     | 
    
         
            +
            	
         
     | 
| 
      
 10 
     | 
    
         
            +
            If you want caching, you're on your own.  I suggest marshalling an instance of the parser.  
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            Copyright (c) 2008 Kyle Maxwell
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person
         
     | 
| 
      
 15 
     | 
    
         
            +
            obtaining a copy of this software and associated documentation
         
     | 
| 
      
 16 
     | 
    
         
            +
            files (the "Software"), to deal in the Software without
         
     | 
| 
      
 17 
     | 
    
         
            +
            restriction, including without limitation the rights to use,
         
     | 
| 
      
 18 
     | 
    
         
            +
            copy, modify, merge, publish, distribute, sublicense, and/or sell
         
     | 
| 
      
 19 
     | 
    
         
            +
            copies of the Software, and to permit persons to whom the
         
     | 
| 
      
 20 
     | 
    
         
            +
            Software is furnished to do so, subject to the following
         
     | 
| 
      
 21 
     | 
    
         
            +
            conditions:
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be
         
     | 
| 
      
 24 
     | 
    
         
            +
            included in all copies or substantial portions of the Software.
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         
     | 
| 
      
 27 
     | 
    
         
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
         
     | 
| 
      
 28 
     | 
    
         
            +
            OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         
     | 
| 
      
 29 
     | 
    
         
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
         
     | 
| 
      
 30 
     | 
    
         
            +
            HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
         
     | 
| 
      
 31 
     | 
    
         
            +
            WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
         
     | 
| 
      
 32 
     | 
    
         
            +
            FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
         
     | 
| 
      
 33 
     | 
    
         
            +
            OTHER DEALINGS IN THE SOFTWARE.
         
     | 
    
        data/lib/robots.rb
    ADDED
    
    | 
         @@ -0,0 +1,113 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "open-uri"
         
     | 
| 
      
 2 
     | 
    
         
            +
            require "uri"
         
     | 
| 
      
 3 
     | 
    
         
            +
            # Answers "may this user agent fetch this URL?" by downloading and
            # parsing each host's robots.txt. Parsed rules are kept per host in
            # @parsed for the lifetime of the instance.
            class Robots
              # The rules parsed from a single host's robots.txt.
              class ParsedRobots
                # Downloads "/robots.txt" relative to +uri+ and parses it.
                #
                # When the file cannot be used (HTTP error, non-200 status, or a
                # content type other than text/plain) the instance stays unparsed
                # and #allowed? answers true for everything.
                def initialize(uri)
                  @other = {}
                  @disallows = {}
                  @allows = {}
                  @parsed = false

                  io = fetch(uri)
                  return if io.nil?

                  begin
                    parse(io)
                    @parsed = true
                  ensure
                    io.close
                  end
                end

                # Returns true when +user_agent+ may request +uri+.
                #
                # uri        - a URI that responds to #request_uri.
                # user_agent - the user agent String checked against User-agent groups.
                #
                # A path is refused only when some Disallow rule of a matching
                # group matches it and no Allow rule of a matching group does.
                def allowed?(uri, user_agent)
                  return true unless @parsed

                  path = uri.request_uri
                  return true unless rule_matches?(@disallows, user_agent, path)

                  rule_matches?(@allows, user_agent, path)
                end

                # Returns a Hash of every directive other than User-agent, Allow
                # and Disallow, mapping the directive name (as written in the
                # file) to the Array of its values.
                def other_values
                  @other
                end

              protected

                # Converts a robots.txt pattern into a Regexp anchored at the start
                # of the string. Everything is matched literally except "*", which
                # matches any run of characters.
                def to_regex(pattern)
                  escaped = Regexp.escape(pattern).gsub(Regexp.escape("*"), ".*")
                  Regexp.new("\\A" + escaped)
                end

              private

                # Opens the robots.txt for +uri+. Returns the open IO, or nil when
                # the response is an HTTP error, is not "200 OK", or is not
                # text/plain.
                def fetch(uri)
                  # URI#open (provided by open-uri) is used instead of Kernel#open,
                  # which no longer accepts URIs on current Ruby versions.
                  io = URI.join(uri.to_s, "/robots.txt").open
                  return io if io.content_type == "text/plain" && io.status == ["200", "OK"]

                  io.close
                  nil
                rescue OpenURI::HTTPError
                  # open-uri raises for 4xx/5xx responses; treat them like any other
                  # unusable robots.txt.
                  nil
                end

                # Reads robots.txt lines from +io+ into @allows, @disallows and
                # @other.
                def parse(io)
                  agent = nil
                  io.each_line do |raw|
                    line = raw.sub(/#.*/, "").strip # drop full-line and trailing comments
                    next if line.empty?

                    # Split on the first colon only so values such as URLs stay intact.
                    key, value = line.split(":", 2)
                    next if value.nil? # not a "key: value" line

                    key = key.strip
                    value = value.strip
                    case key.downcase
                    when "user-agent"
                      agent = to_regex(value)
                    when "allow"
                      add_rule(@allows, agent, value)
                    when "disallow"
                      add_rule(@disallows, agent, value)
                    else
                      (@other[key] ||= []) << value
                    end
                  end
                end

                # Records +value+ as a path rule for +agent+ in +rules+.
                def add_rule(rules, agent, value)
                  # A rule outside any User-agent group has no agent to apply to, and
                  # an empty value would compile to a pattern that matches every path.
                  return if agent.nil? || value.empty?

                  (rules[agent] ||= []) << to_regex(value)
                end

                # True when any group in +rules+ whose agent pattern matches
                # +user_agent+ contains a pattern matching +path+.
                def rule_matches?(rules, user_agent, path)
                  rules.any? do |agent, patterns|
                    user_agent =~ agent && patterns.any? { |pattern| path =~ pattern }
                  end
                end
              end

              # user_agent - the user agent String that rules are evaluated for.
              def initialize(user_agent)
                @user_agent = user_agent
                @parsed = {}
              end

              # Returns true when the configured user agent may fetch +uri+
              # (a URI or anything whose #to_s parses as one).
              def allowed?(uri)
                uri = coerce_uri(uri)
                robots_for(uri).allowed?(uri, @user_agent)
              end

              # Returns the non-access directives (for example sitemaps) found in
              # the robots.txt of +uri+'s host, as a Hash of name => Array of values.
              def other_values(uri)
                robots_for(coerce_uri(uri)).other_values
              end

            private

              # Returns +uri+ unchanged when it already is a URI, otherwise parses
              # its string form.
              def coerce_uri(uri)
                uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
              end

              # Returns the ParsedRobots for +uri+'s host, fetching it on first use.
              def robots_for(uri)
                @parsed[uri.host] ||= ParsedRobots.new(uri)
              end
            end
         
     | 
| 
      
 103 
     | 
    
         
            +
             
     | 
| 
      
 104 
     | 
    
         
            +
            # Self-test, defined and run only when this file is executed directly.
            # It queries Robots for two www.yelp.com URLs, so it needs network
            # access and depends on that site's current robots.txt.
            if $0 == __FILE__
              require "test/unit"

              class RobotsTest < Test::Unit::TestCase
                def test_robots
                  checker = Robots.new("Ruby-Robot.txt Parser Test Script")

                  open_page = "http://www.yelp.com/foo"
                  blocked_page = "http://www.yelp.com/mail?foo=bar"

                  assert checker.allowed?(open_page)
                  assert !checker.allowed?(blocked_page)
                end
              end
            end
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,55 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification 
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: fizx-robots
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version 
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.1
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors: 
         
     | 
| 
      
 7 
     | 
    
         
            +
            - Kyle Maxwell
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2008-08-10 00:00:00 -07:00
         
     | 
| 
      
 13 
     | 
    
         
            +
            default_executable: 
         
     | 
| 
      
 14 
     | 
    
         
            +
            dependencies: []
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            description: It parses robots.txt files
         
     | 
| 
      
 17 
     | 
    
         
            +
            email: kyle@kylemaxwell.com
         
     | 
| 
      
 18 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            extra_rdoc_files: 
         
     | 
| 
      
 23 
     | 
    
         
            +
            - README
         
     | 
| 
      
 24 
     | 
    
         
            +
            files: 
         
     | 
| 
      
 25 
     | 
    
         
            +
            - README
         
     | 
| 
      
 26 
     | 
    
         
            +
            - lib/robots.rb
         
     | 
| 
      
 27 
     | 
    
         
            +
            has_rdoc: true
         
     | 
| 
      
 28 
     | 
    
         
            +
            homepage: http://github.com/fizx/robots
         
     | 
| 
      
 29 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 30 
     | 
    
         
            +
            rdoc_options: 
         
     | 
| 
      
 31 
     | 
    
         
            +
            - --main
         
     | 
| 
      
 32 
     | 
    
         
            +
            - README
         
     | 
| 
      
 33 
     | 
    
         
            +
            require_paths: 
         
     | 
| 
      
 34 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 35 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 36 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 37 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 38 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 39 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 40 
     | 
    
         
            +
              version: 
         
     | 
| 
      
 41 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 42 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 43 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 44 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 45 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 46 
     | 
    
         
            +
              version: 
         
     | 
| 
      
 47 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 50 
     | 
    
         
            +
            rubygems_version: 1.2.0
         
     | 
| 
      
 51 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 52 
     | 
    
         
            +
            specification_version: 2
         
     | 
| 
      
 53 
     | 
    
         
            +
            summary: Simple robots.txt parser
         
     | 
| 
      
 54 
     | 
    
         
            +
            test_files: []
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     |