fizx-robots 0.6.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/robots.rb +25 -4
- metadata +1 -1
data/lib/robots.rb
CHANGED
@@ -1,14 +1,27 @@
|
|
1
1
|
require "open-uri"
|
2
2
|
require "uri"
|
3
3
|
require "rubygems"
|
4
|
+
require "timeout"
|
4
5
|
|
5
6
|
class Robots
|
6
7
|
|
8
|
+
DEFAULT_TIMEOUT = 3
|
9
|
+
|
7
10
|
class ParsedRobots
|
8
11
|
|
9
|
-
def initialize(uri)
|
12
|
+
def initialize(uri, user_agent)
|
10
13
|
@last_accessed = Time.at(1)
|
11
|
-
|
14
|
+
|
15
|
+
io = nil
|
16
|
+
begin
|
17
|
+
Timeout::timeout(Robots.timeout) do
|
18
|
+
io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
19
|
+
end
|
20
|
+
rescue Timeout::Error
|
21
|
+
STDERR.puts "robots.txt request timed out"
|
22
|
+
end
|
23
|
+
|
24
|
+
|
12
25
|
if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
|
13
26
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
14
27
|
end
|
@@ -91,6 +104,14 @@ class Robots
|
|
91
104
|
end
|
92
105
|
end
|
93
106
|
|
107
|
+
def self.timeout=(t)
|
108
|
+
@timeout = t
|
109
|
+
end
|
110
|
+
|
111
|
+
def self.timeout
|
112
|
+
@timeout || DEFAULT_TIMEOUT
|
113
|
+
end
|
114
|
+
|
94
115
|
def initialize(user_agent)
|
95
116
|
@user_agent = user_agent
|
96
117
|
@parsed = {}
|
@@ -99,14 +120,14 @@ class Robots
|
|
99
120
|
def allowed?(uri)
|
100
121
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
101
122
|
host = uri.host
|
102
|
-
@parsed[host] ||= ParsedRobots.new(uri)
|
123
|
+
@parsed[host] ||= ParsedRobots.new(uri, @user_agent)
|
103
124
|
@parsed[host].allowed?(uri, @user_agent)
|
104
125
|
end
|
105
126
|
|
106
127
|
def other_values(uri)
|
107
128
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
108
129
|
host = uri.host
|
109
|
-
@parsed[host] ||= ParsedRobots.new(uri)
|
130
|
+
@parsed[host] ||= ParsedRobots.new(uri, @user_agent)
|
110
131
|
@parsed[host].other_values
|
111
132
|
end
|
112
133
|
end
|