fizx-robots 0.6.0 → 0.7.1
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/lib/robots.rb +25 -4
- metadata +1 -1
data/lib/robots.rb
CHANGED
@@ -1,14 +1,27 @@
|
|
1
1
|
require "open-uri"
|
2
2
|
require "uri"
|
3
3
|
require "rubygems"
|
4
|
+
require "timeout"
|
4
5
|
|
5
6
|
class Robots
|
6
7
|
|
8
|
+
DEFAULT_TIMEOUT = 3
|
9
|
+
|
7
10
|
class ParsedRobots
|
8
11
|
|
9
|
-
def initialize(uri)
|
12
|
+
def initialize(uri, user_agent)
|
10
13
|
@last_accessed = Time.at(1)
|
11
|
-
|
14
|
+
|
15
|
+
io = nil
|
16
|
+
begin
|
17
|
+
Timeout::timeout(Robots.timeout) do
|
18
|
+
io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
19
|
+
end
|
20
|
+
rescue Timeout::Error
|
21
|
+
STDERR.puts "robots.txt request timed out"
|
22
|
+
end
|
23
|
+
|
24
|
+
|
12
25
|
if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
|
13
26
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
14
27
|
end
|
@@ -91,6 +104,14 @@ class Robots
|
|
91
104
|
end
|
92
105
|
end
|
93
106
|
|
107
|
+
def self.timeout=(t)
|
108
|
+
@timeout = t
|
109
|
+
end
|
110
|
+
|
111
|
+
def self.timeout
|
112
|
+
@timeout || DEFAULT_TIMEOUT
|
113
|
+
end
|
114
|
+
|
94
115
|
def initialize(user_agent)
|
95
116
|
@user_agent = user_agent
|
96
117
|
@parsed = {}
|
@@ -99,14 +120,14 @@ class Robots
|
|
99
120
|
def allowed?(uri)
|
100
121
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
101
122
|
host = uri.host
|
102
|
-
@parsed[host] ||= ParsedRobots.new(uri)
|
123
|
+
@parsed[host] ||= ParsedRobots.new(uri, @user_agent)
|
103
124
|
@parsed[host].allowed?(uri, @user_agent)
|
104
125
|
end
|
105
126
|
|
106
127
|
def other_values(uri)
|
107
128
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
108
129
|
host = uri.host
|
109
|
-
@parsed[host] ||= ParsedRobots.new(uri)
|
130
|
+
@parsed[host] ||= ParsedRobots.new(uri, @user_agent)
|
110
131
|
@parsed[host].other_values
|
111
132
|
end
|
112
133
|
end
|