fizx-robots 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +33 -0
- data/lib/robots.rb +113 -0
- metadata +55 -0
data/README
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
A simple Ruby library to parse robots.txt.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
|
5
|
+
robots = Robots.new "Some User Agent"
|
6
|
+
assert robots.allowed?("http://www.yelp.com/foo")
|
7
|
+
assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
|
8
|
+
robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
|
9
|
+
|
10
|
+
If you want caching, you're on your own. I suggest marshalling an instance of the parser.
|
11
|
+
|
12
|
+
Copyright (c) 2008 Kyle Maxwell
|
13
|
+
|
14
|
+
Permission is hereby granted, free of charge, to any person
|
15
|
+
obtaining a copy of this software and associated documentation
|
16
|
+
files (the "Software"), to deal in the Software without
|
17
|
+
restriction, including without limitation the rights to use,
|
18
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
19
|
+
copies of the Software, and to permit persons to whom the
|
20
|
+
Software is furnished to do so, subject to the following
|
21
|
+
conditions:
|
22
|
+
|
23
|
+
The above copyright notice and this permission notice shall be
|
24
|
+
included in all copies or substantial portions of the Software.
|
25
|
+
|
26
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
27
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
28
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
29
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
30
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
31
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
32
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
33
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/robots.rb
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
require "uri"
|
3
|
+
# Answers "may this user agent fetch this URL?" by downloading and parsing the
# target site's /robots.txt. Parsed rules are memoized per host for the
# lifetime of the Robots instance.
class Robots
  # The rules parsed from a single site's /robots.txt.
  class ParsedRobots
    # Downloads and parses /robots.txt for the site that +uri+ belongs to.
    #
    # uri - a URI (or anything whose #to_s is a URL) on the target site.
    #
    # When the file is missing (HTTP error), is not served as text/plain, or
    # is not answered with a 200 status, the instance stays "unparsed":
    # #allowed? then returns true for everything and #other_values is empty.
    def initialize(uri)
      # Initialise state up front so every accessor is safe even when the
      # download is skipped or fails.
      @other = {}
      @disallows = {}
      @allows = {}
      @parsed = false

      # URI#open is mixed in by open-uri. Kernel#open no longer accepts URIs
      # on Ruby 3.0+. The block form closes the handle when the block exits.
      URI.join(uri.to_s, "/robots.txt").open do |io|
        # Only the status code is compared; the reason phrase is free-form.
        if io.content_type == "text/plain" && io.status.first == "200"
          parse(io)
          @parsed = true
        end
      end
    rescue OpenURI::HTTPError
      # open-uri raises for 4xx/5xx responses (e.g. no robots.txt at all).
      # Treat that as "no rules" instead of crashing the caller.
      @parsed = false
    end

    # Returns true when +user_agent+ may fetch +uri+.
    #
    # uri        - a URI responding to #request_uri.
    # user_agent - the user agent String to match against User-agent groups.
    #
    # A path is refused only when a Disallow rule of a matching agent group
    # matches it and no Allow rule of a matching agent group does.
    def allowed?(uri, user_agent)
      return true unless @parsed

      path = uri.request_uri
      return true unless rule_matches?(@disallows, user_agent, path)

      rule_matches?(@allows, user_agent, path)
    end

    # Returns a Hash of directive name => Array of values for every directive
    # other than User-agent, Allow and Disallow (e.g. "Sitemap").
    def other_values
      @other
    end

    protected

    # Converts a robots.txt pattern into a Regexp anchored at the start of the
    # string, where "*" matches any run of characters and everything else is
    # matched literally.
    def to_regex(pattern)
      escaped = Regexp.escape(pattern).gsub(Regexp.escape("*"), ".*")
      Regexp.compile("^#{escaped}")
    end

    # Reads robots.txt lines from +io+ into @allows, @disallows and @other.
    def parse(io)
      agent = nil
      io.each_line do |line|
        # Skip blank lines and comment-only lines.
        next if line =~ /\A\s*(#|\z)/

        # Limit of 2 keeps colons inside the value (e.g. "http://...") intact.
        key, value = line.split(":", 2)
        next if value.nil? # not a "key: value" line

        key = key.strip
        value = value.strip
        # Directive names are compared case-insensitively.
        case key.downcase
        when "user-agent"
          agent = to_regex(value)
        when "allow"
          add_rule(@allows, agent, value)
        when "disallow"
          add_rule(@disallows, agent, value)
        else
          (@other[key] ||= []) << value
        end
      end
    end

    # Appends the compiled +value+ pattern to +rules+ under +agent+.
    # Rules that appear before any User-agent line are ignored, as are empty
    # values: an empty "Disallow:" would otherwise compile to a pattern that
    # matches every path.
    def add_rule(rules, agent, value)
      return if agent.nil? || value.empty?

      (rules[agent] ||= []) << to_regex(value)
    end

    # True when any group in +rules_by_agent+ whose agent pattern matches
    # +user_agent+ has a rule matching +path+.
    def rule_matches?(rules_by_agent, user_agent, path)
      rules_by_agent.any? do |agent_pattern, rules|
        user_agent =~ agent_pattern && rules.any? { |rule| path =~ rule }
      end
    end
  end

  # user_agent - the user agent String used when evaluating rules.
  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {}
  end

  # Returns true when the configured user agent may fetch +uri+
  # (a URI or a URL String).
  def allowed?(uri)
    uri = to_uri(uri)
    parsed_for(uri).allowed?(uri, @user_agent)
  end

  # Returns the non-access directives (e.g. sitemaps) from the robots.txt of
  # the site +uri+ (a URI or a URL String) belongs to.
  def other_values(uri)
    parsed_for(to_uri(uri)).other_values
  end

  private

  # Coerces +uri+ to a URI object, leaving URI instances untouched.
  def to_uri(uri)
    uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
  end

  # Returns the ParsedRobots for +uri+'s host, fetching it on first use.
  def parsed_for(uri)
    @parsed[uri.host] ||= ParsedRobots.new(uri)
  end
end
|
103
|
+
|
104
|
+
# Self-test, executed only when this file is run directly as a script.
if __FILE__ == $0
  require "test/unit"

  # Smoke test that queries www.yelp.com through Robots, so it depends on
  # network access and on that site's current robots.txt.
  class RobotsTest < Test::Unit::TestCase
    def test_robots
      checker = Robots.new("Ruby-Robot.txt Parser Test Script")

      assert(checker.allowed?("http://www.yelp.com/foo"))
      assert(!checker.allowed?("http://www.yelp.com/mail?foo=bar"))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fizx-robots
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kyle Maxwell
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-08-10 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: It parses robots.txt files
|
17
|
+
email: kyle@kylemaxwell.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- README
|
26
|
+
- lib/robots.rb
|
27
|
+
has_rdoc: true
|
28
|
+
homepage: http://github.com/fizx/robots
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options:
|
31
|
+
- --main
|
32
|
+
- README
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: "0"
|
40
|
+
version:
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: "0"
|
46
|
+
version:
|
47
|
+
requirements: []
|
48
|
+
|
49
|
+
rubyforge_project:
|
50
|
+
rubygems_version: 1.2.0
|
51
|
+
signing_key:
|
52
|
+
specification_version: 2
|
53
|
+
summary: Simple robots.txt parser
|
54
|
+
test_files: []
|
55
|
+
|