fizx-robots 0.3.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/robots.rb +23 -23
- metadata +4 -2
data/lib/robots.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
require "open-uri"
|
2
2
|
require "uri"
|
3
3
|
require "rubygems"
|
4
|
-
|
4
|
+
|
5
5
|
class Robots
|
6
|
-
include Loggable
|
7
6
|
|
8
7
|
class ParsedRobots
|
9
|
-
include Loggable
|
10
8
|
|
11
9
|
def initialize(uri)
|
10
|
+
@last_accessed = Time.at(1)
|
12
11
|
io = open(URI.join(uri.to_s, "/robots.txt")) rescue nil
|
13
12
|
if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
|
14
13
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
@@ -17,10 +16,13 @@ class Robots
|
|
17
16
|
@other = {}
|
18
17
|
@disallows = {}
|
19
18
|
@allows = {}
|
19
|
+
@delays = {} # added delays to make it work
|
20
20
|
agent = /.*/
|
21
21
|
io.each do |line|
|
22
22
|
next if line =~ /^\s*(#.*|$)/
|
23
|
-
|
23
|
+
arr = line.split(":")
|
24
|
+
key = arr.shift
|
25
|
+
value = arr.join(":").strip
|
24
26
|
value.strip!
|
25
27
|
case key
|
26
28
|
when "User-agent":
|
@@ -31,9 +33,10 @@ class Robots
|
|
31
33
|
when "Disallow":
|
32
34
|
@disallows[agent] ||= []
|
33
35
|
@disallows[agent] << to_regex(value)
|
36
|
+
when "Crawl-delay"
|
37
|
+
@delays[agent] = value.to_i
|
34
38
|
else
|
35
|
-
@
|
36
|
-
@disallows[agent] << to_regex(value)
|
39
|
+
@other[key] = value
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
@@ -44,38 +47,35 @@ class Robots
|
|
44
47
|
return true unless @parsed
|
45
48
|
allowed = true
|
46
49
|
path = uri.request_uri
|
47
|
-
|
48
|
-
puts "--------"
|
49
|
-
puts @disallows.inspect
|
50
|
-
puts "--------"
|
50
|
+
|
51
51
|
@disallows.each do |key, value|
|
52
|
-
puts ">>>>>>>#{key.inspect}<<<<<<<<"
|
53
52
|
if user_agent =~ key
|
54
|
-
debug "matched #{key.inspect}"
|
55
53
|
value.each do |rule|
|
56
54
|
if path =~ rule
|
57
|
-
debug "matched Disallow: #{rule.inspect}"
|
58
55
|
allowed = false
|
59
56
|
end
|
60
57
|
end
|
61
58
|
end
|
62
59
|
end
|
63
60
|
|
64
|
-
return true if allowed
|
65
|
-
|
66
61
|
@allows.each do |key, value|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
62
|
+
unless allowed
|
63
|
+
if user_agent =~ key
|
64
|
+
value.each do |rule|
|
65
|
+
if path =~ rule
|
66
|
+
allowed = true
|
67
|
+
end
|
73
68
|
end
|
74
69
|
end
|
75
70
|
end
|
76
71
|
end
|
77
72
|
|
78
|
-
|
73
|
+
if allowed && @delays[user_agent]
|
74
|
+
sleep @delays[user_agent] - (Time.now - @last_accessed)
|
75
|
+
@last_accessed = Time.now
|
76
|
+
end
|
77
|
+
|
78
|
+
return allowed
|
79
79
|
end
|
80
80
|
|
81
81
|
def other_values
|
@@ -109,4 +109,4 @@ class Robots
|
|
109
109
|
@parsed[host] ||= ParsedRobots.new(uri)
|
110
110
|
@parsed[host].other_values
|
111
111
|
end
|
112
|
-
end
|
112
|
+
end
|
metadata
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fizx-robots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kyle Maxwell
|
8
|
+
- Sausheong Chang
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
12
|
|
12
|
-
date: 2008-
|
13
|
+
date: 2008-12-10 00:00:00 -08:00
|
13
14
|
default_executable:
|
14
15
|
dependencies:
|
15
16
|
- !ruby/object:Gem::Dependency
|
16
17
|
name: fizx-loggable
|
18
|
+
type: :runtime
|
17
19
|
version_requirement:
|
18
20
|
version_requirements: !ruby/object:Gem::Requirement
|
19
21
|
requirements:
|