fizx-fizx-robots 0.4.0

Files changed (3)
  1. data/README +33 -0
  2. data/lib/robots.rb +118 -0
  3. metadata +63 -0
data/README ADDED
@@ -0,0 +1,33 @@
+ A simple Ruby library to parse robots.txt.
+
+ Usage:
+
+   robots = Robots.new "Some User Agent"
+   assert robots.allowed?("http://www.yelp.com/foo")
+   assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
+   robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
+
+ If you want caching, you're on your own. I suggest marshalling an instance of the parser.
+
+ Copyright (c) 2008 Kyle Maxwell
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
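
The caching suggestion above is terse; here is a minimal sketch of the Marshal approach it hints at. The cache file name is hypothetical, and it assumes the parser's state (hashes, regexes, timestamps) marshals cleanly:

  require "robots"

  CACHE = "robots.cache" # hypothetical cache path

  # Reload previously parsed rules if a cache exists; otherwise start fresh.
  robots = File.exist?(CACHE) ? Marshal.load(File.read(CACHE)) : Robots.new("Some User Agent")

  robots.allowed?("http://www.yelp.com/foo") # fetches and parses yelp.com's robots.txt

  # Persist the parser, including every per-host ParsedRobots built so far.
  File.open(CACHE, "wb") { |f| f.write(Marshal.dump(robots)) }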
data/lib/robots.rb ADDED
@@ -0,0 +1,130 @@
+ require "open-uri"
+ require "stringio"
+ require "uri"
+ require "rubygems"
+ require "loggable"
+
+ class Robots
+   include Loggable
+
+   # Fetches and parses one host's robots.txt; instances are cached per host.
+   class ParsedRobots
+     include Loggable
+
+     def initialize(uri)
+       @last_accessed = Time.at(1)
+       io = open(URI.join(uri.to_s, "/robots.txt")) rescue nil
+       # Fall back to an allow-everything policy if robots.txt is missing,
+       # unreachable, or not served as plain text with a 200 status.
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @other     = {}
+       @disallows = {}
+       @allows    = {}
+       @delays    = {}
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
+         arr   = line.split(":")
+         key   = arr.shift
+         value = arr.join(":").strip
+         case key
+         when "User-agent"
+           agent = to_regex(value)
+         when "Allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "Disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "Crawl-delay"
+           @delays[agent] = value.to_i
+         else
+           @other[key] = value
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       path = uri.request_uri
+       debug "path: #{path}"
+
+       # Disallow rules are checked first...
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           debug "matched #{key.inspect}"
+           value.each do |rule|
+             if path =~ rule
+               debug "matched Disallow: #{rule.inspect}"
+               allowed = false
+             end
+           end
+         end
+       end
+
+       # ...then an Allow rule may re-permit a disallowed path.
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             debug "matched #{key.inspect}"
+             value.each do |rule|
+               if path =~ rule
+                 debug "matched Allow: #{rule.inspect}"
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       # Honor Crawl-delay by sleeping off whatever remains of the delay
+       # since the last access; delays are keyed by user-agent pattern.
+       if allowed && (delay = @delays.find { |key, _| user_agent =~ key })
+         pause = delay.last - (Time.now - @last_accessed)
+         sleep(pause) if pause > 0
+         @last_accessed = Time.now
+       end
+
+       return allowed
+     end
+
+     def other_values
+       @other
+     end
+
+     protected
+
+     # Converts a robots.txt path pattern into a regex anchored at the start
+     # of the path; "*" is the format's only wildcard.
+     def to_regex(pattern)
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def initialize(user_agent)
+     @user_agent = user_agent
+     @parsed = {} # one ParsedRobots per host
+   end
+
+   def allowed?(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri)
+     @parsed[host].allowed?(uri, @user_agent)
+   end
+
+   def other_values(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri)
+     @parsed[host].other_values
+   end
+ end
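
A note on rule matching: to_regex escapes the raw rule, rewrites the escaped "*" into ".*", and anchors the result at the start of the request path only. Worked through by hand (values illustrative):

  pattern = Regexp.escape("/mail*")       # => "/mail\\*"
  pattern.gsub!(Regexp.escape("*"), ".*") # => "/mail.*"
  Regexp.compile("^#{pattern}")           # => /^\/mail.*/

Because there is no trailing anchor, a plain "Disallow: /mail" already blocks "/mail?foo=bar" and "/mailbox" as well, which is what the README's second assertion relies on.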
metadata ADDED
@@ -0,0 +1,63 @@
+ --- !ruby/object:Gem::Specification
+ name: fizx-fizx-robots
+ version: !ruby/object:Gem::Version
+   version: 0.4.0
+ platform: ruby
+ authors:
+ - Kyle Maxwell
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-12-10 00:00:00 -08:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fizx-loggable
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">"
+       - !ruby/object:Gem::Version
+         version: 0.0.0
+     version:
+ description: It parses robots.txt files
+ email: kyle@kylemaxwell.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ files:
+ - README
+ - lib/robots.rb
+ has_rdoc: true
+ homepage: http://github.com/fizx/robots
+ post_install_message:
+ rdoc_options:
+ - --main
+ - README
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Simple robots.txt parser
+ test_files: []
+
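
For context: the fizx- prefix marks this as a GitHub-built gem of the 2008 era, so installing it required GitHub's since-retired gem source. As it worked at the time (commands shown for historical reference):

  gem sources -a http://gems.github.com
  gem install fizx-fizx-robots

After which require "robots" loads the library, per the require_paths entry above.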