robotex 1.0.0

data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,12 @@
+ == 1.0.0 / 2012-01-20
+
+ * Minor enhancements
+
+   * Move delay sleep out of allowed? into its own method (delay!)
+   * Add rspec suite
+   * Add Gemfile
+
+ * Bug fixes
+
+   * Fix handling of priority for allow vs disallow
+   * Fix permission issue with gem
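
The first enhancement above matters for callers: per the changelog, the Crawl-Delay sleep used to happen inside allowed?, so a permission check could block. As of 1.0.0 the sleep is explicit via delay!. A minimal sketch of the resulting crawl loop (urls and fetch_page are hypothetical stand-ins, not part of the gem):

  require 'robotex'

  robotex = Robotex.new "My Crawler"
  urls.each do |url|
    next unless robotex.allowed?(url) # permission check, no sleeping
    robotex.delay!(url)               # block here to honor any Crawl-Delay
    fetch_page(url)                   # hypothetical fetch helper
  end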
data/LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2012 Chris Kite
+ Copyright (c) 2008-2011 Kyle Maxwell, contributors
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
data/README.rdoc ADDED
@@ -0,0 +1,14 @@
+ = Robotex
+ == Obey Robots.txt
+
+ With one line of code, Robotex (pronounced like "robotics") will download and parse the robots.txt file and let you know if your program is allowed to visit a given link.
+
+ Usage:
+
+   robotex = Robotex.new "My User Agent"
+   robotex.allowed?("http://www.example.com/foo")
+   robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
+
+ == Acknowledgements
+
+ Robotex is a modified version of Kyle Maxwell's excellent Robots library. Some folks were unable to use that gem due to packaging issues, so I used his code to create Robotex.
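
Beyond the one-liners above, the library also exposes a class-level timeout for the robots.txt request (see lib/robotex.rb below). A slightly fuller usage sketch, with an illustrative example.com URL:

  require 'robotex'

  Robotex.timeout = 5 # seconds to wait for robots.txt before giving up

  robotex = Robotex.new "My User Agent"
  url = "http://www.example.com/foo"
  if robotex.allowed?(url)
    robotex.delay!(url) # obey any Crawl-Delay before fetching
    # ... fetch the page here ...
  end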
data/Rakefile ADDED
@@ -0,0 +1,23 @@
+ require 'rspec/core/rake_task'
+ require 'rake/rdoctask'
+
+ desc "Run all specs"
+ RSpec::Core::RakeTask.new(:rspec) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+ end
+
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+   spec.rcov = true
+ end
+
+ task :default => :rspec
+
+ Rake::RDocTask.new(:rdoc) do |rdoc|
+   version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "Robotex #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 1.0.0
data/lib/robotex.rb ADDED
@@ -0,0 +1,152 @@
+ # Author (2012): Chris Kite
+ # Original Author (2008-2011): Kyle Maxwell
+
+ require 'rubygems'
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+
+ class Robotex
+
+   VERSION = '1.0.0'
+   DEFAULT_TIMEOUT = 3
+
+   attr_reader :user_agent
+
+   class ParsedRobots
+
+     def initialize(uri, user_agent)
+       io = Robotex.get_robots_txt(uri, user_agent)
+
+       # fall back to a permissive robots.txt if the real one is missing or malformed
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @disallows = {}
+       @allows = {}
+       @delays = {}
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":").strip # rejoin values that themselves contain colons
+         case key.downcase
+         when "user-agent"
+           agent = to_regex(value)
+         when "allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "crawl-delay"
+           @delays[agent] = value.to_i
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       path = uri.request_uri
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       allowed
+     end
+
+     def delay(user_agent)
+       @delays.each do |agent, delay|
+         return delay if agent =~ user_agent
+       end
+       nil
+     end
+
+     protected
+
+     def to_regex(pattern)
+       return /should-not-match-anything-123456789/ if pattern.strip.empty?
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*") # robots.txt wildcards become regex wildcards
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def self.get_robots_txt(uri, user_agent)
+     begin
+       Timeout::timeout(Robotex.timeout) do
+         URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+       end
+     rescue Timeout::Error
+       STDERR.puts "robots.txt request timed out"
+     end
+   end
+
+   def self.timeout=(t)
+     @timeout = t
+   end
+
+   def self.timeout
+     @timeout || DEFAULT_TIMEOUT
+   end
+
+   def initialize(user_agent = nil)
+     user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+     @user_agent = user_agent
+     @last_accessed = Time.at(1)
+     @parsed = {}
+   end
+
+   def parse_host(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+   end
+
+   #
+   # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
+   #
+   def allowed?(uri)
+     parse_host(uri).allowed?(uri, @user_agent)
+   end
+
+   #
+   # Return the value of the Crawl-Delay directive, or nil if none
+   #
+   def delay(uri)
+     parse_host(uri).delay(@user_agent)
+   end
+
+   #
+   # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+   #
+   def delay!(uri)
+     delay = delay(uri)
+     if delay
+       # avoid passing a negative value to sleep once the delay has already elapsed
+       remaining = delay - (Time.now - @last_accessed)
+       sleep remaining if remaining > 0
+     end
+     @last_accessed = Time.now
+   end
+
+ end
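
Note that parse_host memoizes a ParsedRobots instance per host in @parsed, so robots.txt is fetched and parsed at most once per domain for a given Robotex instance. A minimal sketch of that caching behavior (URLs illustrative):

  robotex = Robotex.new
  robotex.allowed?("http://www.example.com/a")  # fetches and parses robots.txt
  robotex.allowed?("http://www.example.com/b")  # same host: cached ParsedRobots reused
  robotex.allowed?("http://other.example.org/") # new host: new fetch and parse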
data/spec/robotex_spec.rb ADDED
@@ -0,0 +1,87 @@
+ require 'spec_helper'
+
+ describe Robotex do
+
+   before(:all) do
+     FakeWeb.allow_net_connect = false
+     robots = <<-END
+ User-Agent: msnbot
+ Crawl-Delay: 20
+
+ User-Agent: bender
+ Disallow: /my_shiny_metal_ass
+
+ User-Agent: *
+ Disallow: /login
+ Allow: /
+
+ Disallow: /locked
+ Allow: /locked
+     END
+     options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
+     FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+   end
+
+   describe '#initialize' do
+     context 'when no arguments are supplied' do
+       it 'returns a Robotex with the default user-agent' do
+         Robotex.new.user_agent.should == "Robotex/#{Robotex::VERSION} (http://www.github.com/chriskite/robotex)"
+       end
+     end
+
+     context 'when a user-agent is specified' do
+       it 'returns a Robotex with the specified user-agent' do
+         ua = 'My User Agent'
+         Robotex.new(ua).user_agent.should == ua
+       end
+     end
+   end
+
+   describe '#allowed?' do
+     context 'when the robots.txt disallows the user-agent to the url' do
+       it 'returns false' do
+         robotex = Robotex.new('bender')
+         robotex.allowed?(SPEC_DOMAIN + 'my_shiny_metal_ass').should be_false
+       end
+     end
+
+     context 'when the robots.txt disallows the user-agent to some urls, but allows this one' do
+       it 'returns true' do
+         robotex = Robotex.new('bender')
+         robotex.allowed?(SPEC_DOMAIN + 'cigars').should be_true
+       end
+     end
+
+     context 'when the robots.txt disallows any user-agent to the url' do
+       it 'returns false' do
+         robotex = Robotex.new
+         robotex.allowed?(SPEC_DOMAIN + 'login').should be_false
+       end
+     end
+
+     context 'when the robots.txt disallows and then allows the url' do
+       it 'returns false' do
+         robotex = Robotex.new
+         robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
+       end
+     end
+   end
+
+   describe '#delay' do
+     context 'when no Crawl-Delay is specified for the user-agent' do
+       it 'returns nil' do
+         robotex = Robotex.new
+         robotex.delay(SPEC_DOMAIN).should be_nil
+       end
+     end
+
+     context 'when Crawl-Delay is specified for the user-agent' do
+       it 'returns the delay as a Fixnum' do
+         robotex = Robotex.new('msnbot')
+         robotex.delay(SPEC_DOMAIN).should == 20
+       end
+     end
+   end
+
+ end
+
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require 'bundler/setup'
+ $:.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'robotex'
+ require 'fakeweb'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
+
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: robotex
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Chris Kite
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-20 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: &21394420 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.9.2
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *21394420
25
+ - !ruby/object:Gem::Dependency
26
+ name: rdoc
27
+ requirement: &21393840 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '3.12'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *21393840
36
+ - !ruby/object:Gem::Dependency
37
+ name: rspec
38
+ requirement: &21393260 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: 2.8.0
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *21393260
47
+ - !ruby/object:Gem::Dependency
48
+ name: fakeweb
49
+ requirement: &21392680 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.3.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *21392680
58
+ description:
59
+ email:
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files:
63
+ - README.rdoc
64
+ files:
65
+ - VERSION
66
+ - LICENSE
67
+ - CHANGELOG.rdoc
68
+ - README.rdoc
69
+ - Rakefile
70
+ - lib/robotex.rb
71
+ - spec/spec_helper.rb
72
+ - spec/robotex_spec.rb
73
+ homepage: http://www.github.com/chriskite/robotex
74
+ licenses: []
75
+ post_install_message:
76
+ rdoc_options:
77
+ - -m
78
+ - README.rdoc
79
+ - -t
80
+ - Robotex
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 1.8.15
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Obey Robots.txt
101
+ test_files:
102
+ - spec/spec_helper.rb
103
+ - spec/robotex_spec.rb