jeremyf-robot_rules 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +62 -0
- data/Rakefile +56 -0
- data/VERSION.yml +4 -0
- data/lib/robot_rules.rb +76 -0
- data/robot_rules.gemspec +46 -0
- data/test/robot_rules_test.rb +37 -0
- data/test/test_helper.rb +9 -0
- metadata +66 -0
data/.document
ADDED
data/LICENSE
ADDED
Copyright (c) 2009 James Edward Gray II and Jeremy Friesen

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown
ADDED
robot_rules
===========

A tool to determine if the robots.txt would prevent a given user agent
from making a request to a given URI.

Example
-------

Given the following:

    #!/usr/local/bin/ruby -w

    require "robot_rules"
    require "open-uri"

    rules = RobotRules.new("RubyQuizBrowser 1.0")
    robots_url = "http://pragmaticprogrammer.com/robots.txt"

    open(robots_url) do |url|
      data = url.read

      puts "/robots.txt:"
      puts data
      puts

      rules.parse(robots_url, data)
    end

    puts "URL tests:"
    %w{ http://pragmaticprogrammer.com/images/dave.jpg
        http://pragmaticprogrammer.com/imagination }.each do |test|
      puts "rules.allowed?( #{test.inspect} )"
      puts rules.allowed?(test)
    end

    __END__

This script will print:

    /robots.txt:
    User-agent: *
    Disallow: images

    URL tests:
    rules.allowed?( "http://pragmaticprogrammer.com/images/dave.jpg" )
    false
    rules.allowed?( "http://pragmaticprogrammer.com/imagination" )
    true

History
-------

RobotRules was created by James Edward Gray II as a response to "Port a
Library" Ruby Quiz #64. A few years later, Jeremy Friesen wrapped the
library up into a gem and added some tests.

Copyright
---------

Copyright (c) 2009 James Edward Gray II and Jeremy Friesen. See LICENSE for details.
data/Rakefile
ADDED
require 'rubygems'
require 'rake'

# Gem-packaging tasks (build, version bump) via jeweler; skipped with a
# hint when the jeweler gem is not installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "robot_rules"
    gem.summary = %Q{A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.}
    gem.email = "jeremy.n.friesen@gmail.com"
    gem.homepage = "http://github.com/jeremyf/robot_rules"
    gem.authors = ["James Edward Gray II", "Jeremy Friesen"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` — runs everything under test/ matching *_test.rb.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` — coverage via the rcov gem; falls back to an aborting
# stub task when rcov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# `rake rdoc` — API docs titled with the version read from VERSION.yml.
# NOTE(review): YAML is used below without an explicit `require 'yaml'`;
# presumably rake/rubygems loads it — confirm before relying on it.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "robot_rules #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION.yml
ADDED
data/lib/robot_rules.rb
ADDED
#!/usr/local/bin/ruby -w

# robot_rules.rb
#
# Created by James Edward Gray II on 2006-01-31.
# Copyright 2006 Gray Productions. All rights reserved.

require "uri"

# Based on Perl's WWW::RobotRules module, by Gisle Aas.
#
# Parses robots.txt data and answers whether a URL may be fetched by the
# user agent this instance was built for.
class RobotRules
  # user_agent - full User-Agent string; only its first whitespace-
  # delimited token, minus any "/version" suffix, is kept (lower-cased)
  # for matching against User-Agent records in robots.txt.
  def initialize( user_agent )
    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
    # "host:port" => Array of disallowed path prefixes.
    @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
  end

  # Parses +robots_data+ (the body of a robots.txt fetched from
  # +text_uri+) and replaces any rules previously stored for that
  # host:port. Rules from a record naming our agent win; otherwise the
  # "User-Agent: *" record applies.
  def parse( text_uri, robots_data )
    uri      = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    @rules.delete(location)

    # Split on CR/LF/CRLF and strip trailing "#" comments.
    rules      = robots_data.split(/[\015\012]+/).map { |rule| rule.sub(/\s*#.*$/, "") }
    anon_rules = Array.new  # rules under "User-Agent: *"
    my_rules   = Array.new  # rules under a record matching @user_agent
    current    = anon_rules
    rules.each do |rule|
      case rule
      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
        # Once our agent's record has been collected, later records
        # cannot override it — stop scanning.
        break unless my_rules.empty?

        current = if $1 == "*"
                    anon_rules
                  elsif $1.downcase.index(@user_agent)
                    my_rules
                  else
                    nil  # some other agent's record: ignore its rules
                  end
      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
        next if current.nil?

        if $1.empty?
          current << nil  # an empty Disallow allows everything
        else
          disallow = URI.parse($1)

          # Ignore rules targeting a different scheme, port, or host.
          next unless disallow.scheme.nil? || disallow.scheme == uri.scheme
          next unless disallow.port.nil? || disallow.port == uri.port
          next unless disallow.host.nil? ||
                      disallow.host.downcase == uri.host.downcase

          # Normalize to an absolute path prefix.
          disallow = disallow.path
          disallow = "/" if disallow.empty?
          disallow = "/#{disallow}" unless disallow.start_with?("/")

          current << disallow
        end
      end
    end

    # compact drops the nil sentinels pushed for empty Disallow lines.
    @rules[location] = if my_rules.empty?
                         anon_rules.compact
                       else
                         my_rules.compact
                       end
  end

  # Returns false when a stored Disallow prefix for the URL's host:port
  # matches the start of its path; true otherwise. Non-HTTP(S) URIs are
  # always allowed.
  def allowed?( text_uri )
    uri      = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    path     = uri.path

    return true unless %w{http https}.include?(uri.scheme)

    # Use fetch with a default: a plain @rules[location] lookup would
    # fire the Hash default block and insert an empty Array for every
    # host ever queried, growing @rules without bound.
    @rules.fetch(location) { [] }.none? { |rule| path.start_with?(rule) }
  end
end
data/robot_rules.gemspec
ADDED
# -*- encoding: utf-8 -*-

# Gem specification for robot_rules 0.9.1.
# NOTE(review): this looks like jeweler-generated output (it mirrors the
# Jeweler::Tasks block in the Rakefile) — prefer regenerating over
# hand-editing so the two stay in sync.
Gem::Specification.new do |s|
  s.name = %q{robot_rules}
  s.version = "0.9.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["James Edward Gray II", "Jeremy Friesen"]
  s.date = %q{2009-07-28}
  s.email = %q{jeremy.n.friesen@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.markdown"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.markdown",
    "Rakefile",
    "VERSION.yml",
    "lib/robot_rules.rb",
    "robot_rules.gemspec",
    "test/robot_rules_test.rb",
    "test/test_helper.rb"
  ]
  s.homepage = %q{http://github.com/jeremyf/robot_rules}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.}
  s.test_files = [
    "test/robot_rules_test.rb",
    "test/test_helper.rb"
  ]

  # Generator scaffolding: dependency declarations would be emitted into
  # the branches below, keyed on RubyGems version; this gem has none.
  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
require 'test_helper'

# Exercises RobotRules against small in-memory robots.txt fixtures.
class RobotRulesTest < Test::Unit::TestCase
  SITE_URL = "http://www.example.com"

  def setup
    @robot_rule = RobotRules.new('Ruby Spider 1.0')
    @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), %(User-agent: *\nDisallow: images))
  end

  def test_should_allow_path_imagination
    assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'imagination/me.jpg'))
  end

  def test_should_disallow_path_images
    assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'images/me.jpg'))
  end

  # Rules parsed for SITE_URL must not leak to other hosts. (This
  # replaces a former copy-paste duplicate named
  # test_should_disallow_path_images_for_other_site, which ran the same
  # input but asserted allowed? == true, contradicting its own name —
  # a parse for one host can never disallow a different host.)
  def test_should_allow_path_images_for_other_site
    assert_equal true, @robot_rule.allowed?(File.join("http://google.com", 'images/me.jpg'))
  end

  # A record naming our agent takes precedence over the "*" record.
  def test_should_abide_by_disallowed_user_agent
    @robot_rule = RobotRules.new('Microsoft')
    robots_txt = %(/robots.txt:\nUser-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
    @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)

    assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'google/hellow_world.txt'))
  end

  # An agent not named by any record falls back to the "*" record, so a
  # Disallow aimed at another agent does not apply to it.
  def test_should_allow_user_agent_to_specified_path
    @robot_rule = RobotRules.new('Google')
    robots_txt = %(/robots.txt:\nUser-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
    @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)

    assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'google/hellow_world.txt'))
  end
end
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jeremyf-robot_rules
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- James Edward Gray II
|
8
|
+
- Jeremy Friesen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2009-07-28 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description:
|
18
|
+
email: jeremy.n.friesen@gmail.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files:
|
24
|
+
- LICENSE
|
25
|
+
- README.markdown
|
26
|
+
files:
|
27
|
+
- .document
|
28
|
+
- .gitignore
|
29
|
+
- LICENSE
|
30
|
+
- README.markdown
|
31
|
+
- Rakefile
|
32
|
+
- VERSION.yml
|
33
|
+
- lib/robot_rules.rb
|
34
|
+
- robot_rules.gemspec
|
35
|
+
- test/robot_rules_test.rb
|
36
|
+
- test/test_helper.rb
|
37
|
+
has_rdoc: false
|
38
|
+
homepage: http://github.com/jeremyf/robot_rules
|
39
|
+
licenses:
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options:
|
42
|
+
- --charset=UTF-8
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.
|
64
|
+
test_files:
|
65
|
+
- test/robot_rules_test.rb
|
66
|
+
- test/test_helper.rb
|