sausheong-robots 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/README +33 -0
  2. data/lib/robots.rb +112 -0
  3. metadata +65 -0
data/README ADDED
@@ -0,0 +1,33 @@
+ A simple Ruby library to parse robots.txt.
+
+ Usage:
+
+     robots = Robots.new "Some User Agent"
+     assert robots.allowed?("http://www.yelp.com/foo")
+     assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
+     robots.other_values("http://foo.com") # gets misc. key/values (i.e. sitemaps)
+
+ If you want caching, you're on your own. I suggest marshalling an instance of the parser.
+
+ Copyright (c) 2008 Kyle Maxwell
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
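The caching remark in the README suggests marshalling a parser instance instead of re-fetching robots.txt on every run. Below is a minimal sketch of that idea, assuming the Robots class shipped in lib/robots.rb (shown next); the cache file name is hypothetical.

    require "rubygems"
    require "robots"

    CACHE_FILE = "robots.cache"  # hypothetical location for the marshalled parser

    # Reuse a previously marshalled Robots instance if one exists; the per-host
    # robots.txt rules it has already fetched come back with it.
    robots = File.exist?(CACHE_FILE) ? Marshal.load(File.read(CACHE_FILE)) : Robots.new("Some User Agent")

    robots.allowed?("http://www.yelp.com/foo")

    # Persist the instance (and its cached per-host rules) for the next run.
    File.open(CACHE_FILE, "wb") { |f| f.write(Marshal.dump(robots)) }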
data/lib/robots.rb ADDED
@@ -0,0 +1,112 @@
+ require "open-uri"
+ require "uri"
+ require "rubygems"
+
+ class Robots
+
+   class ParsedRobots
+
+     def initialize(uri)
+       @last_accessed = Time.at(1)
+       io = open(URI.join(uri.to_s, "/robots.txt")) rescue nil
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @other = {}
+       @disallows = {}
+       @allows = {}
+       @delays = {} # added delays to make it work
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":").strip
+         value.strip!
+         case key
+         when "User-agent":
+           agent = to_regex(value)
+         when "Allow":
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "Disallow":
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "Crawl-delay"
+           @delays[agent] = value.to_i
+         else
+           @other[key] = value
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       path = uri.request_uri
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       if allowed && @delays[user_agent]
+         sleep @delays[user_agent] - (Time.now - @last_accessed)
+         @last_accessed = Time.now
+       end
+
+       return allowed
+     end
+
+     def other_values
+       @other
+     end
+
+     protected
+
+     def to_regex(pattern)
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def initialize(user_agent)
+     @user_agent = user_agent
+     @parsed = {}
+   end
+
+   def allowed?(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri)
+     @parsed[host].allowed?(uri, @user_agent)
+   end
+
+   def other_values(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri)
+     @parsed[host].other_values
+   end
+ end
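For orientation, the public surface of the class above is small: Robots#allowed? lazily builds and caches one ParsedRobots per host, and Robots#other_values returns any directives the parser does not recognise (such as Sitemap lines) as raw key/value pairs. A short usage sketch against a placeholder host and a made-up user agent string:

    require "rubygems"
    require "robots"

    robots = Robots.new("MyCrawler/1.0")  # matched against the User-agent rules

    # First call for a host fetches http://example.com/robots.txt and caches the parsed rules.
    puts robots.allowed?("http://example.com/")

    # Subsequent calls for the same host reuse the cached ParsedRobots instance.
    puts robots.allowed?("http://example.com/some/path?with=query")

    # Directives the case statement doesn't handle (e.g. Sitemap) end up here.
    p robots.other_values("http://example.com")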
metadata ADDED
@@ -0,0 +1,65 @@
+ --- !ruby/object:Gem::Specification
+ name: sausheong-robots
+ version: !ruby/object:Gem::Version
+   version: 0.6.0
+ platform: ruby
+ authors:
+ - Kyle Maxwell
+ - Sausheong Chang
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-12-10 00:00:00 -08:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fizx-loggable
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">"
+       - !ruby/object:Gem::Version
+         version: 0.0.0
+     version:
+ description: It parses robots.txt files
+ email: kyle@kylemaxwell.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ files:
+ - README
+ - lib/robots.rb
+ has_rdoc: true
+ homepage: http://github.com/fizx/robots
+ post_install_message:
+ rdoc_options:
+ - --main
+ - README
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Simple robots.txt parser
+ test_files: []
+