robotex 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +12 -0
- data/LICENSE +24 -0
- data/README.rdoc +14 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/lib/robotex.rb +152 -0
- data/spec/robotex_spec.rb +87 -0
- data/spec/spec_helper.rb +8 -0
- metadata +103 -0
data/CHANGELOG.rdoc
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,24 @@
+Copyright (c) 2012 Chris Kite
+Copyright (c) 2008-2011 Kyle Maxwell, contributors
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
data/README.rdoc
ADDED
@@ -0,0 +1,14 @@
+= Robotex
+== Obey Robots.txt
+
+With one line of code, Robotex (pronounced like "robotics") will download and parse the robots.txt file and let you know if your program is allowed to visit a given link.
+
+Usage:
+
+  robotex = Robotex.new "My User Agent"
+  robotex.allowed?("http://www.example.com/foo")
+  robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
+
+== Acknowledgements
+
+Robotex is a modified version of Kyle Maxwell's excellent Robots library. Some folks were unable to use that gem due to packaging issues, so I used his code to create Robotex.
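The two README calls above are the gem's whole public surface: allowed? answers the robots.txt question for one URL, and delay! blocks until the host's Crawl-Delay (if any) has elapsed. A minimal polite-fetch loop built on them might look like this (the user-agent string and URL list are illustrative, not part of the gem):

  require 'robotex'

  robotex = Robotex.new("My Crawler/1.0")
  urls = ["http://www.example.com/", "http://www.example.com/foo"]

  urls.each do |url|
    next unless robotex.allowed?(url) # skip anything robots.txt disallows for our user-agent
    robotex.delay!(url)               # sleep out any remaining Crawl-Delay first
    # ... fetch and process url here ...
  end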
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
+require 'rspec/core/rake_task'
+require 'rake/rdoctask'
+
+desc "Run all specs"
+RSpec::Core::RakeTask.new(:rspec) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+end
+
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+
+task :default => :rspec
+
+Rake::RDocTask.new(:rdoc) do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "Robotex #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
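Given those task definitions, the usual invocations would be (assuming the development dependencies listed in the gem metadata are installed):

  rake        # :default is aliased to :rspec, so this runs the specs
  rake rspec  # run all specs matching spec/**/*_spec.rb
  rake rcov   # run the same specs under rcov for coverage
  rake rdoc   # generate API docs into rdoc/, titled from the VERSION file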
data/VERSION
ADDED
@@ -0,0 +1 @@
+1.0.0
data/lib/robotex.rb
ADDED
@@ -0,0 +1,152 @@
+# Author (2012): Chris Kite
+# Original Author (2008-2011): Kyle Maxwell
+
+require 'rubygems'
+require 'open-uri'
+require 'uri'
+require 'timeout'
+
+class Robotex
+
+  VERSION = '1.0.0'
+  DEFAULT_TIMEOUT = 3
+
+  attr_reader :user_agent
+
+  class ParsedRobots
+
+    def initialize(uri, user_agent)
+      io = Robotex.get_robots_txt(uri, user_agent)
+
+      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+        io = StringIO.new("User-agent: *\nAllow: /\n")
+      end
+
+      @disallows = {}
+      @allows = {}
+      @delays = {}
+      agent = /.*/
+      io.each do |line|
+        next if line =~ /^\s*(#.*|$)/
+        arr = line.split(":")
+        key = arr.shift
+        value = arr.join(":").strip
+        value.strip!
+        case key.downcase
+        when "user-agent"
+          agent = to_regex(value)
+        when "allow"
+          @allows[agent] ||= []
+          @allows[agent] << to_regex(value)
+        when "disallow"
+          @disallows[agent] ||= []
+          @disallows[agent] << to_regex(value)
+        when "crawl-delay"
+          @delays[agent] = value.to_i
+        end
+      end
+
+      @parsed = true
+    end
+
+    def allowed?(uri, user_agent)
+      return true unless @parsed
+      allowed = true
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      path = uri.request_uri
+
+      @allows.each do |key, value|
+        unless allowed
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = true
+              end
+            end
+          end
+        end
+      end
+
+      @disallows.each do |key, value|
+        if user_agent =~ key
+          value.each do |rule|
+            if path =~ rule
+              allowed = false
+            end
+          end
+        end
+      end
+
+      return allowed
+    end
+
+    def delay(user_agent)
+      @delays.each do |agent, delay|
+        return delay if agent =~ user_agent
+      end
+      nil
+    end
+
+    protected
+
+    def to_regex(pattern)
+      return /should-not-match-anything-123456789/ if pattern.strip.empty?
+      pattern = Regexp.escape(pattern)
+      pattern.gsub!(Regexp.escape("*"), ".*")
+      Regexp.compile("^#{pattern}")
+    end
+  end
+
+  def self.get_robots_txt(uri, user_agent)
+    begin
+      Timeout::timeout(Robotex.timeout) do
+        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+      end
+    rescue Timeout::Error
+      STDERR.puts "robots.txt request timed out"
+    end
+  end
+
+  def self.timeout=(t)
+    @timeout = t
+  end
+
+  def self.timeout
+    @timeout || DEFAULT_TIMEOUT
+  end
+
+  def initialize(user_agent = nil)
+    user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+    @user_agent = user_agent
+    @last_accessed = Time.at(1)
+    @parsed = {}
+  end
+
+  def parse_host(uri)
+    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+    @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+  end
+
+  #
+  # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
+  #
+  def allowed?(uri)
+    parse_host(uri).allowed?(uri, @user_agent)
+  end
+
+  #
+  # Return the value of the Crawl-Delay directive, or nil if none
+  def delay(uri)
+    parse_host(uri).delay(@user_agent)
+  end
+
+  #
+  # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+  #
+  def delay!(uri)
+    delay = delay(uri)
+    sleep delay - (Time.now - @last_accessed) if !!delay
+    @last_accessed = Time.now
+  end
+
+end
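The matching core above is ParsedRobots#to_regex: each Allow/Disallow value is Regexp-escaped, any * wildcard is rewritten to .*, and the result is anchored to the start of the request path. A standalone sketch of that translation (the method body is copied from the file above; the sample rule and paths are illustrative):

  def to_regex(pattern)
    return /should-not-match-anything-123456789/ if pattern.strip.empty?
    pattern = Regexp.escape(pattern)
    pattern.gsub!(Regexp.escape("*"), ".*")
    Regexp.compile("^#{pattern}")
  end

  to_regex("/private*")                     # => /^\/private.*/
  "/private/data" =~ to_regex("/private*")  # => 0   (rule matches this path)
  "/public/page"  =~ to_regex("/private*")  # => nil (rule does not apply)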
data/spec/robotex_spec.rb
ADDED
@@ -0,0 +1,87 @@
+require 'spec_helper'
+
+describe Robotex do
+
+  before(:all) do
+    FakeWeb.allow_net_connect = false
+    robots = <<-END
+User-Agent: msnbot
+Crawl-Delay: 20
+
+User-Agent: bender
+Disallow: /my_shiny_metal_ass
+
+User-Agent: *
+Disallow: /login
+Allow: /
+
+Disallow: /locked
+Allow: /locked
+    END
+    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+  end
+
+  describe '#initialize' do
+    context 'when no arguments are supplied' do
+      it 'returns a Robotex with the default user-agent' do
+        Robotex.new.user_agent.should == "Robotex/#{Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
+      end
+    end
+
+    context 'when a user-agent is specified' do
+      it 'returns a Robotex with the specified user-agent' do
+        ua = 'My User Agent'
+        Robotex.new(ua).user_agent.should == ua
+      end
+    end
+  end
+
+  describe '#allowed?' do
+    context 'when the robots.txt disallows the user-agent to the url' do
+      it 'returns false' do
+        robotex = Robotex.new('bender')
+        robotex.allowed?(SPEC_DOMAIN + 'my_shiny_metal_ass').should be_false
+      end
+    end
+
+    context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
+      it 'returns true' do
+        robotex = Robotex.new('bender')
+        robotex.allowed?(SPEC_DOMAIN + 'cigars').should be_true
+      end
+    end
+
+    context 'when the robots.txt disallows any user-agent to the url' do
+      it 'returns false' do
+        robotex = Robotex.new
+        robotex.allowed?(SPEC_DOMAIN + 'login').should be_false
+      end
+    end
+
+    context 'when the robots.txt disallows and then allows the url' do
+      it 'returns false' do
+        robotex = Robotex.new
+        robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
+      end
+    end
+  end
+
+  describe '#delay' do
+    context 'when no Crawl-Delay is specified for the user-agent' do
+      it 'returns nil' do
+        robotex = Robotex.new
+        robotex.delay(SPEC_DOMAIN).should be_nil
+      end
+    end
+
+    context 'when Crawl-Delay is specified for the user-agent' do
+      it 'returns the delay as a Fixnum' do
+        robotex = Robotex.new('msnbot')
+        robotex.delay(SPEC_DOMAIN).should == 20
+      end
+    end
+  end
+
+end
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,103 @@
+--- !ruby/object:Gem::Specification
+name: robotex
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+prerelease:
+platform: ruby
+authors:
+- Chris Kite
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-01-20 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: &21394420 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.9.2
+  type: :development
+  prerelease: false
+  version_requirements: *21394420
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: &21393840 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '3.12'
+  type: :development
+  prerelease: false
+  version_requirements: *21393840
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &21393260 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :development
+  prerelease: false
+  version_requirements: *21393260
+- !ruby/object:Gem::Dependency
+  name: fakeweb
+  requirement: &21392680 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+  type: :development
+  prerelease: false
+  version_requirements: *21392680
+description:
+email:
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- VERSION
+- LICENSE
+- CHANGELOG.rdoc
+- README.rdoc
+- Rakefile
+- lib/robotex.rb
+- spec/spec_helper.rb
+- spec/robotex_spec.rb
+homepage: http://www.github.com/chriskite/robotex
+licenses: []
+post_install_message:
+rdoc_options:
+- -m
+- README.rdoc
+- -t
+- Robotex
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.15
+signing_key:
+specification_version: 3
+summary: Obey Robots.txt
+test_files:
+- spec/spec_helper.rb
+- spec/robotex_spec.rb