robotex 1.0.0

data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,12 @@
+ == 1.0.0 / 2012-01-20
+
+ * Minor enhancements
+
+   * Move delay sleep out of allowed? into its own method (delay!)
+   * Add rspec suite
+   * Add Gemfile
+
+ * Bug fixes
+
+   * Fix handling of priority for allow vs disallow
+   * Fix permission issue with gem
data/LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2012 Chris Kite
+ Copyright (c) 2008-2011 Kyle Maxwell, contributors
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
data/README.rdoc ADDED
@@ -0,0 +1,14 @@
+ = Robotex
+ == Obey Robots.txt
+
+ With one line of code, Robotex (pronounced like "robotics") will download and parse the robots.txt file and let you know if your program is allowed to visit a given link.
+
+ Usage:
+
+   robotex = Robotex.new "My User Agent"
+   robotex.allowed?("http://www.example.com/foo")
+   robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
+
+ == Acknowledgements
+
+ Robotex is a modified version of Kyle Maxwell's excellent Robots library. Some folks were unable to use that gem due to packaging issues, so I used his code to create Robotex.
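
In a crawl loop the two calls above combine naturally, and Robotex.timeout= caps how long the robots.txt fetch may block (DEFAULT_TIMEOUT is 3 seconds). A minimal sketch, with placeholder URLs:

    require 'robotex'

    Robotex.timeout = 5                   # give up on a robots.txt fetch after 5 seconds
    robotex = Robotex.new "My User Agent"

    ["http://www.example.com/", "http://www.example.com/foo"].each do |url|
      next unless robotex.allowed?(url)   # consults the host's robots.txt rules
      robotex.delay!(url)                 # sleeps out any remaining Crawl-Delay
      # ... fetch and process url here ...
    end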
data/Rakefile ADDED
@@ -0,0 +1,23 @@
+ require 'rspec/core/rake_task'
+ require 'rake/rdoctask'
+
+ desc "Run all specs"
+ RSpec::Core::RakeTask.new(:rspec) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+ end
+
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+   spec.rcov = true
+ end
+
+ task :default => :rspec
+
+ Rake::RDocTask.new(:rdoc) do |rdoc|
+   version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "Robotex #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 1.0.0
data/lib/robotex.rb ADDED
@@ -0,0 +1,156 @@
+ # Author (2012): Chris Kite
+ # Original Author (2008-2011): Kyle Maxwell
+
+ require 'rubygems'
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+
+ class Robotex
+
+   VERSION = '1.0.0'
+   DEFAULT_TIMEOUT = 3
+
+   attr_reader :user_agent
+
+   class ParsedRobots
+
+     def initialize(uri, user_agent)
+       io = Robotex.get_robots_txt(uri, user_agent)
+
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @disallows = {}
+       @allows = {}
+       @delays = {}
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":").strip
+         case key.downcase
+         when "user-agent"
+           agent = to_regex(value)
+         when "allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "crawl-delay"
+           @delays[agent] = value.to_i
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       path = uri.request_uri
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       return allowed
+     end
+
+     def delay(user_agent)
+       @delays.each do |agent, delay|
+         return delay if agent =~ user_agent
+       end
+       nil
+     end
+
+     protected
+
+     def to_regex(pattern)
+       return /should-not-match-anything-123456789/ if pattern.strip.empty?
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def self.get_robots_txt(uri, user_agent)
+     begin
+       Timeout::timeout(Robotex.timeout) do
+         URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+       end
+     rescue Timeout::Error
+       STDERR.puts "robots.txt request timed out"
+     end
+   end
+
+   def self.timeout=(t)
+     @timeout = t
+   end
+
+   def self.timeout
+     @timeout || DEFAULT_TIMEOUT
+   end
+
+   def initialize(user_agent = nil)
+     user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+     @user_agent = user_agent
+     @last_accessed = Time.at(1)
+     @parsed = {}
+   end
+
+   def parse_host(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+   end
+
+   #
+   # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
+   #
+   def allowed?(uri)
+     parse_host(uri).allowed?(uri, @user_agent)
+   end
+
+   #
+   # Return the value of the Crawl-Delay directive, or nil if none
+   #
+   def delay(uri)
+     parse_host(uri).delay(@user_agent)
+   end
+
+   #
+   # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+   #
+   def delay!(uri)
+     delay = delay(uri)
+     if delay
+       # sleep only the portion of the Crawl-Delay that has not already elapsed
+       remaining = delay - (Time.now - @last_accessed)
+       sleep(remaining) if remaining > 0
+     end
+     @last_accessed = Time.now
+   end
+
+ end
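
A note on caching in parse_host above: ParsedRobots objects are memoized per host in @parsed, so each Robotex instance fetches a given host's robots.txt at most once, and a fresh instance starts with an empty cache. A short sketch with placeholder URLs:

    robotex = Robotex.new
    robotex.allowed?("http://www.example.com/a") # fetches and parses robots.txt for www.example.com
    robotex.allowed?("http://www.example.com/b") # reuses the cached rules, no network call
    robotex.delay("http://www.example.com/")     # nil unless the host sets a Crawl-Delay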
data/spec/robotex_spec.rb ADDED
@@ -0,0 +1,87 @@
+ require 'spec_helper'
+
+ describe Robotex do
+
+   before(:all) do
+     FakeWeb.allow_net_connect = false
+     robots = <<-END
+ User-Agent: msnbot
+ Crawl-Delay: 20
+
+ User-Agent: bender
+ Disallow: /my_shiny_metal_ass
+
+ User-Agent: *
+ Disallow: /login
+ Allow: /
+
+ Disallow: /locked
+ Allow: /locked
+     END
+     options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
+     FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+   end
+
+   describe '#initialize' do
+     context 'when no arguments are supplied' do
+       it 'returns a Robotex with the default user-agent' do
+         Robotex.new.user_agent.should == "Robotex/#{Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
+       end
+     end
+
+     context 'when a user-agent is specified' do
+       it 'returns a Robotex with the specified user-agent' do
+         ua = 'My User Agent'
+         Robotex.new(ua).user_agent.should == ua
+       end
+     end
+   end
+
+   describe '#allowed?' do
+     context 'when the robots.txt disallows the user-agent to the url' do
+       it 'returns false' do
+         robotex = Robotex.new('bender')
+         robotex.allowed?(SPEC_DOMAIN + 'my_shiny_metal_ass').should be_false
+       end
+     end
+
+     context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
+       it 'returns true' do
+         robotex = Robotex.new('bender')
+         robotex.allowed?(SPEC_DOMAIN + 'cigars').should be_true
+       end
+     end
+
+     context 'when the robots.txt disallows any user-agent to the url' do
+       it 'returns false' do
+         robotex = Robotex.new
+         robotex.allowed?(SPEC_DOMAIN + 'login').should be_false
+       end
+     end
+
+     context 'when the robots.txt disallows and then allows the url' do
+       it 'returns false' do
+         robotex = Robotex.new
+         robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
+       end
+     end
+   end
+
+   describe '#delay' do
+     context 'when no Crawl-Delay is specified for the user-agent' do
+       it 'returns nil' do
+         robotex = Robotex.new
+         robotex.delay(SPEC_DOMAIN).should be_nil
+       end
+     end
+
+     context 'when Crawl-Delay is specified for the user-agent' do
+       it 'returns the delay as a Fixnum' do
+         robotex = Robotex.new('msnbot')
+         robotex.delay(SPEC_DOMAIN).should == 20
+       end
+     end
+   end
+
+ end
+
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ $:.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'robotex'
+ require 'fakeweb'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
+
metadata ADDED
@@ -0,0 +1,103 @@
+ --- !ruby/object:Gem::Specification
+ name: robotex
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-01-20 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: &21394420 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.9.2
+   type: :development
+   prerelease: false
+   version_requirements: *21394420
+ - !ruby/object:Gem::Dependency
+   name: rdoc
+   requirement: &21393840 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '3.12'
+   type: :development
+   prerelease: false
+   version_requirements: *21393840
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: &21393260 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 2.8.0
+   type: :development
+   prerelease: false
+   version_requirements: *21393260
+ - !ruby/object:Gem::Dependency
+   name: fakeweb
+   requirement: &21392680 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.3.0
+   type: :development
+   prerelease: false
+   version_requirements: *21392680
+ description:
+ email:
+ executables: []
+ extensions: []
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - VERSION
+ - LICENSE
+ - CHANGELOG.rdoc
+ - README.rdoc
+ - Rakefile
+ - lib/robotex.rb
+ - spec/spec_helper.rb
+ - spec/robotex_spec.rb
+ homepage: http://www.github.com/chriskite/robotex
+ licenses: []
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.rdoc
+ - -t
+ - Robotex
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.15
+ signing_key:
+ specification_version: 3
+ summary: Obey Robots.txt
+ test_files:
+ - spec/spec_helper.rb
+ - spec/robotex_spec.rb