fizx-robots 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README +33 -0
  2. data/lib/robots.rb +113 -0
  3. metadata +55 -0
data/README ADDED
@@ -0,0 +1,33 @@
1
+ A simple Ruby library to parse robots.txt.
2
+
3
+ Usage:
4
+
5
+ robots = Robots.new "Some User Agent"
6
+ assert robots.allowed?("http://www.yelp.com/foo")
7
+ assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
8
+ robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
9
+
10
+ If you want caching, you're on your own. I suggest marshalling an instance of the parser.
11
+
12
+ Copyright (c) 2008 Kyle Maxwell
13
+
14
+ Permission is hereby granted, free of charge, to any person
15
+ obtaining a copy of this software and associated documentation
16
+ files (the "Software"), to deal in the Software without
17
+ restriction, including without limitation the rights to use,
18
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the
20
+ Software is furnished to do so, subject to the following
21
+ conditions:
22
+
23
+ The above copyright notice and this permission notice shall be
24
+ included in all copies or substantial portions of the Software.
25
+
26
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
28
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
30
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
31
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
32
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
33
+ OTHER DEALINGS IN THE SOFTWARE.
data/lib/robots.rb ADDED
@@ -0,0 +1,113 @@
1
+ require "open-uri"
2
+ require "uri"
3
class Robots
  # Parses a site's /robots.txt and answers allow/deny queries for a
  # given user agent. One ParsedRobots is built (and cached) per host.
  class ParsedRobots
    # Downloads and parses /robots.txt for the host of +uri+.
    # If the response is not a plain-text 200, parsing is skipped and
    # every path is treated as allowed (@parsed stays falsy).
    def initialize(uri)
      io = open(URI.join(uri.to_s, "/robots.txt"))
      return if io.content_type != "text/plain"
      return if io.status != ["200", "OK"]

      @other = {}
      @disallows = {}
      @allows = {}
      agent = ""
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/
        # Split on the first ":" only, so values containing colons
        # (e.g. "Sitemap: http://...") survive intact.
        key, value = line.split(":", 2)
        next if value.nil? # malformed line without a colon
        value.strip!
        case key
        when "User-agent"
          agent = to_regex(value)
        when "Allow"
          (@allows[agent] ||= []) << to_regex(value)
        when "Disallow"
          (@disallows[agent] ||= []) << to_regex(value)
        else
          # Unknown directives (Sitemap, Crawl-delay, ...) are collected
          # for other_values instead of being misfiled as disallow rules.
          (@other[key] ||= []) << value
        end
      end

      @parsed = true
    end

    # Returns true when +user_agent+ may fetch the path of +uri+ (a URI).
    # A matching Allow rule overrides a matching Disallow rule.
    def allowed?(uri, user_agent)
      return true unless @parsed
      allowed = true
      path = uri.request_uri

      @disallows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            allowed = false if path =~ rule
          end
        end
      end

      return true if allowed

      @allows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            return true if path =~ rule
          end
        end
      end

      false
    end

    # Hash of directives other than User-agent/Allow/Disallow
    # (e.g. "Sitemap"), keyed by directive name, values as arrays.
    def other_values
      @other
    end

    protected

    # Converts a robots.txt path pattern into an anchored Regexp,
    # treating "*" as a wildcard and escaping everything else.
    def to_regex(pattern)
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end

  # +user_agent+ is the string matched against User-agent patterns.
  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {} # host => ParsedRobots cache
  end

  # True if this client's user agent may fetch +uri+ (String or URI).
  def allowed?(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri)
    @parsed[host].allowed?(uri, @user_agent)
  end

  # Misc. robots.txt directives (e.g. sitemaps) for the host of +uri+.
  def other_values(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri)
    @parsed[host].other_values
  end
end
103
+
104
# Self-test harness: run this file directly to exercise the parser
# against live robots.txt data (requires network access).
if __FILE__ == $0
  require "test/unit"

  class RobotsTest < Test::Unit::TestCase
    # Yelp's robots.txt disallows /mail but not /foo for generic agents.
    def test_robots
      bot = Robots.new("Ruby-Robot.txt Parser Test Script")
      assert bot.allowed?("http://www.yelp.com/foo")
      assert !bot.allowed?("http://www.yelp.com/mail?foo=bar")
    end
  end
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fizx-robots
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Kyle Maxwell
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-08-10 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: It parses robots.txt files
17
+ email: kyle@kylemaxwell.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - README
26
+ - lib/robots.rb
27
+ has_rdoc: true
28
+ homepage: http://github.com/fizx/robots
29
+ post_install_message:
30
+ rdoc_options:
31
+ - --main
32
+ - README
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project:
50
+ rubygems_version: 1.2.0
51
+ signing_key:
52
+ specification_version: 2
53
+ summary: Simple robots.txt parser
54
+ test_files: []
55
+