robotex 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +12 -0
- data/LICENSE +24 -0
- data/README.rdoc +14 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/lib/robotex.rb +152 -0
- data/spec/robotex_spec.rb +87 -0
- data/spec/spec_helper.rb +8 -0
- metadata +103 -0
data/CHANGELOG.rdoc
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Copyright (c) 2012 Chris Kite
|
2
|
+
Copyright (c) 2008-2011 Kyle Maxwell, contributors
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person
|
5
|
+
obtaining a copy of this software and associated documentation
|
6
|
+
files (the "Software"), to deal in the Software without
|
7
|
+
restriction, including without limitation the rights to use,
|
8
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the
|
10
|
+
Software is furnished to do so, subject to the following
|
11
|
+
conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
18
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
20
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
21
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
22
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
23
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
24
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
= Robotex
|
2
|
+
== Obey Robots.txt
|
3
|
+
|
4
|
+
With one line of code, Robotex (pronounced like "robotics") will download and parse the robots.txt file and let you know if your program is allowed to visit a given link.
|
5
|
+
|
6
|
+
Usage:
|
7
|
+
|
8
|
+
robotex = Robotex.new "My User Agent"
|
9
|
+
robotex.allowed?("http://www.example.com/foo")
|
10
|
+
robotex.delay! # wait until any specified Crawl-Delay has passed
|
11
|
+
|
12
|
+
== Acknowledgements
|
13
|
+
|
14
|
+
Robotex is a modified version of Kyle Maxwell's excellent Robots library. Some folks were unable to use that gem due to packaging issues, so I used his code to create Robotex.
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rspec/core/rake_task'
# rake/rdoctask was deprecated and later removed from rake; the task now
# lives in the rdoc gem (>= 3.12, which this project already depends on).
require 'rdoc/task'

desc "Run all specs"
RSpec::Core::RakeTask.new(:rspec) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
end

desc "Run all specs with rcov coverage"
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :rspec

RDoc::Task.new(:rdoc) do |rdoc|
  # Strip the trailing newline File.read leaves on the VERSION file's
  # contents so it does not end up inside the rdoc title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "Robotex #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
data/lib/robotex.rb
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
# Author (2012): Chris Kite
|
2
|
+
# Original Author (2008-2011): Kyle Maxwell
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'uri'
|
7
|
+
require 'timeout'
|
8
|
+
|
9
|
+
class Robotex

  VERSION = '1.0.0'
  # Seconds to wait for the robots.txt request before giving up.
  DEFAULT_TIMEOUT = 3

  attr_reader :user_agent

  # Fetches and parses a single host's robots.txt, answering allow/disallow
  # and crawl-delay queries for that host.
  class ParsedRobots

    # uri        - URI (or string) on the host whose robots.txt to fetch
    # user_agent - User-Agent header sent with the robots.txt request
    def initialize(uri, user_agent)
      io = Robotex.get_robots_txt(uri, user_agent)

      # A missing, non-plaintext, or non-200 robots.txt means "allow everything".
      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
        io = StringIO.new("User-agent: *\nAllow: /\n")
      end

      @disallows = {}
      @allows = {}
      @delays = {}
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
        arr = line.split(":")
        key = arr.shift
        next if key.nil? # a bare ":" line used to crash on key.downcase
        value = arr.join(":").strip # re-join so values such as URLs keep their colons
        case key.downcase
        when "user-agent"
          agent = to_regex(value)
        when "allow"
          @allows[agent] ||= []
          @allows[agent] << to_regex(value)
        when "disallow"
          @disallows[agent] ||= []
          @disallows[agent] << to_regex(value)
        when "crawl-delay"
          @delays[agent] = value.to_i
        end
      end

      @parsed = true
    end

    # Returns true if user_agent may fetch the path of uri.
    # Disallow rules are evaluated after Allow rules, so a matching Disallow
    # wins even when an Allow also matches (the specs pin this: "disallows
    # and then allows the url" expects false).
    def allowed?(uri, user_agent)
      return true unless @parsed
      allowed = true
      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
      path = uri.request_uri

      # NOTE(review): `allowed` is always true when this loop runs, so the
      # Allow rules can never flip the result here; kept as-is for parity
      # with the original library's behavior.
      @allows.each do |key, value|
        unless allowed
          if user_agent =~ key
            value.each do |rule|
              if path =~ rule
                allowed = true
              end
            end
          end
        end
      end

      @disallows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            if path =~ rule
              allowed = false
            end
          end
        end
      end

      return allowed
    end

    # Returns the Crawl-Delay (whole seconds, Integer) for the first
    # matching user-agent rule, or nil when none was specified.
    def delay(user_agent)
      @delays.each do |agent, delay|
        return delay if agent =~ user_agent
      end
      nil
    end

    protected

    # Converts a robots.txt path pattern into an anchored Regexp, treating
    # '*' as a wildcard. An empty pattern must match nothing at all.
    def to_regex(pattern)
      return /should-not-match-anything-123456789/ if pattern.strip.empty?
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end

  # Downloads /robots.txt from uri's host. Returns the open-uri IO on
  # success, or nil on any fetch error or timeout.
  def self.get_robots_txt(uri, user_agent)
    Timeout.timeout(Robotex.timeout) do
      begin
        URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
      rescue StandardError
        # Any fetch failure is treated by the caller as "no robots.txt".
        nil
      end
    end
  rescue Timeout::Error
    STDERR.puts "robots.txt request timed out"
    nil
  end

  # Overrides the robots.txt fetch timeout (seconds).
  def self.timeout=(t)
    @timeout = t
  end

  # Current fetch timeout in seconds, falling back to DEFAULT_TIMEOUT.
  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end

  # user_agent - User-Agent string to identify as; defaults to the gem's own.
  def initialize(user_agent = nil)
    user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
    @user_agent = user_agent
    @last_accessed = Time.at(1)
    @parsed = {} # host => ParsedRobots cache
  end

  # Returns (fetching and caching on first use) the ParsedRobots for uri's host.
  def parse_host(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
  end

  #
  # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
  #
  def allowed?(uri)
    parse_host(uri).allowed?(uri, @user_agent)
  end

  #
  # Return the value of the Crawl-Delay directive, or nil if none
  #
  def delay(uri)
    parse_host(uri).delay(@user_agent)
  end

  #
  # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
  #
  def delay!(uri)
    delay = delay(uri)
    if delay
      remaining = delay - (Time.now - @last_accessed)
      # Kernel#sleep raises ArgumentError on a negative duration, so only
      # sleep when some of the crawl delay is still outstanding.
      sleep remaining if remaining > 0
    end
    @last_accessed = Time.now
  end

end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'spec_helper'

describe Robotex do

  before(:all) do
    FakeWeb.allow_net_connect = false
    robots = <<-END
User-Agent: msnbot
Crawl-Delay: 20

User-Agent: bender
Disallow: /my_shiny_metal_ass

User-Agent: *
Disallow: /login
Allow: /

Disallow: /locked
Allow: /locked
    END
    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
  end

  describe '#initialize' do
    context 'when no arguments are supplied' do
      it 'returns a Robotex with the default user-agent' do
        Robotex.new.user_agent.should == "Robotex/#{Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
      end
    end

    context 'when a user-agent is specified' do
      it 'returns a Robotex with the specified user-agent' do
        ua = 'My User Agent'
        Robotex.new(ua).user_agent.should == ua
      end
    end
  end

  describe '#allowed?' do
    context 'when the robots.txt disallows the user-agent to the url' do
      it 'returns false' do
        robotex = Robotex.new('bender')
        robotex.allowed?(SPEC_DOMAIN + 'my_shiny_metal_ass').should be_false
      end
    end

    context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
      it 'returns true' do
        robotex = Robotex.new('bender')
        robotex.allowed?(SPEC_DOMAIN + 'cigars').should be_true
      end
    end

    context 'when the robots.txt disallows any user-agent to the url' do
      it 'returns false' do
        robotex = Robotex.new
        robotex.allowed?(SPEC_DOMAIN + 'login').should be_false
      end
    end

    context 'when the robots.txt disallows and then allows the url' do
      it 'returns false' do
        robotex = Robotex.new
        robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
      end
    end
  end

  describe '#delay' do
    context 'when no Crawl-Delay is specified for the user-agent' do
      it 'returns nil' do
        robotex = Robotex.new
        robotex.delay(SPEC_DOMAIN).should be_nil
      end
    end

    # Previously this context was accidentally nested inside the one above;
    # it is now a sibling so each context carries one accurate description.
    context 'when Crawl-Delay is specified for the user-agent' do
      it 'returns the delay as a Fixnum' do
        robotex = Robotex.new('msnbot')
        robotex.delay(SPEC_DOMAIN).should == 20
      end
    end
  end

end
|
87
|
+
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: robotex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Chris Kite
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-01-20 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: &21394420 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.9.2
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *21394420
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rdoc
|
27
|
+
requirement: &21393840 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '3.12'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *21393840
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &21393260 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 2.8.0
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *21393260
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: fakeweb
|
49
|
+
requirement: &21392680 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.3.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *21392680
|
58
|
+
description:
|
59
|
+
email:
|
60
|
+
executables: []
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files:
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- VERSION
|
66
|
+
- LICENSE
|
67
|
+
- CHANGELOG.rdoc
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- lib/robotex.rb
|
71
|
+
- spec/spec_helper.rb
|
72
|
+
- spec/robotex_spec.rb
|
73
|
+
homepage: http://www.github.com/chriskite/robotex
|
74
|
+
licenses: []
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- -m
|
78
|
+
- README.rdoc
|
79
|
+
- -t
|
80
|
+
- Robotex
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ! '>='
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project:
|
97
|
+
rubygems_version: 1.8.15
|
98
|
+
signing_key:
|
99
|
+
specification_version: 3
|
100
|
+
summary: Obey Robots.txt
|
101
|
+
test_files:
|
102
|
+
- spec/spec_helper.rb
|
103
|
+
- spec/robotex_spec.rb
|