fizx-robots 0.3.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/robots.rb +23 -23
  2. metadata +4 -2
@@ -1,14 +1,13 @@
1
1
  require "open-uri"
2
2
  require "uri"
3
3
  require "rubygems"
4
- require "loggable"
4
+
5
5
  class Robots
6
- include Loggable
7
6
 
8
7
  class ParsedRobots
9
- include Loggable
10
8
 
11
9
  def initialize(uri)
10
+ @last_accessed = Time.at(1)
12
11
  io = open(URI.join(uri.to_s, "/robots.txt")) rescue nil
13
12
  if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
14
13
  io = StringIO.new("User-agent: *\nAllow: /\n")
@@ -17,10 +16,13 @@ class Robots
17
16
  @other = {}
18
17
  @disallows = {}
19
18
  @allows = {}
19
+ @delays = {} # added delays to make it work
20
20
  agent = /.*/
21
21
  io.each do |line|
22
22
  next if line =~ /^\s*(#.*|$)/
23
- key, value = line.split(":")
23
+ arr = line.split(":")
24
+ key = arr.shift
25
+ value = arr.join(":").strip
24
26
  value.strip!
25
27
  case key
26
28
  when "User-agent":
@@ -31,9 +33,10 @@ class Robots
31
33
  when "Disallow":
32
34
  @disallows[agent] ||= []
33
35
  @disallows[agent] << to_regex(value)
36
+ when "Crawl-delay"
37
+ @delays[agent] = value.to_i
34
38
  else
35
- @disallows[agent] ||= []
36
- @disallows[agent] << to_regex(value)
39
+ @other[key] = value
37
40
  end
38
41
  end
39
42
 
@@ -44,38 +47,35 @@ class Robots
44
47
  return true unless @parsed
45
48
  allowed = true
46
49
  path = uri.request_uri
47
- debug "path: #{path}"
48
- puts "--------"
49
- puts @disallows.inspect
50
- puts "--------"
50
+
51
51
  @disallows.each do |key, value|
52
- puts ">>>>>>>#{key.inspect}<<<<<<<<"
53
52
  if user_agent =~ key
54
- debug "matched #{key.inspect}"
55
53
  value.each do |rule|
56
54
  if path =~ rule
57
- debug "matched Disallow: #{rule.inspect}"
58
55
  allowed = false
59
56
  end
60
57
  end
61
58
  end
62
59
  end
63
60
 
64
- return true if allowed
65
-
66
61
  @allows.each do |key, value|
67
- if user_agent =~ key
68
- debug "matched #{key.inspect}"
69
- value.each do |rule|
70
- if path =~ rule
71
- debug "matched Allow: #{rule.inspect}"
72
- return true
62
+ unless allowed
63
+ if user_agent =~ key
64
+ value.each do |rule|
65
+ if path =~ rule
66
+ allowed = true
67
+ end
73
68
  end
74
69
  end
75
70
  end
76
71
  end
77
72
 
78
- return false
73
+ if allowed && @delays[user_agent]
74
+ sleep @delays[user_agent] - (Time.now - @last_accessed)
75
+ @last_accessed = Time.now
76
+ end
77
+
78
+ return allowed
79
79
  end
80
80
 
81
81
  def other_values
@@ -109,4 +109,4 @@ class Robots
109
109
  @parsed[host] ||= ParsedRobots.new(uri)
110
110
  @parsed[host].other_values
111
111
  end
112
- end
112
+ end
metadata CHANGED
@@ -1,19 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fizx-robots
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Maxwell
8
+ - Sausheong Chang
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
12
 
12
- date: 2008-08-10 00:00:00 -07:00
13
+ date: 2008-12-10 00:00:00 -08:00
13
14
  default_executable:
14
15
  dependencies:
15
16
  - !ruby/object:Gem::Dependency
16
17
  name: fizx-loggable
18
+ type: :runtime
17
19
  version_requirement:
18
20
  version_requirements: !ruby/object:Gem::Requirement
19
21
  requirements: