robots 0.0.0 → 0.7.3
- data/.gitignore +1 -0
- data/CHANGELOG +19 -0
- data/README +33 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/lib/robots.rb +133 -0
- data/robots.gemspec +50 -0
- data/test/fixtures/robots1.txt +0 -0
- data/test/test_robots.rb +45 -0
- metadata +33 -16
data/.gitignore
ADDED
@@ -0,0 +1 @@
+*.gem
data/CHANGELOG
ADDED
@@ -0,0 +1,19 @@
+0.7.3
+  - Move to jeweler
+0.7.2
+  - Add Ruby 1.9 compatibility
+0.5-0.7.1
+  - Lost the changelog information :/
+0.4.0
+  - Fixed other_values bug
+  - added crawl-delay support
+0.3.2
+  - fixed breaking on reddit.com
+0.3.1
+  - fixed bug in disallows handling
+  - partially mocked out open-uri
+0.3.0
+  - added loggable dependency
+0.2.0
+  - If robots.txt 404s, assume allowed.
+  - Added CHANGELOG
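The 0.2.0 note above describes observable behavior: when robots.txt cannot be fetched, the library substitutes a permissive "User-agent: *\nAllow: /" document. A minimal sketch, assuming a host with no robots.txt (the hostname is illustrative):

  require "robots"

  robots = Robots.new("Some User Agent")
  # No robots.txt to fetch, so the permissive default applies.
  robots.allowed?("http://no-robots.example.com/anything")  # => true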
data/README
ADDED
@@ -0,0 +1,33 @@
+A simple Ruby library to parse robots.txt.
+
+Usage:
+
+  robots = Robots.new "Some User Agent"
+  assert robots.allowed?("http://www.yelp.com/foo")
+  assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
+  robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
+
+If you want caching, you're on your own. I suggest marshalling an instance of the parser.
+
+Copyright (c) 2008 Kyle Maxwell
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
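A minimal sketch of the caching suggestion in the README above; the cache filename is illustrative, and Marshal simply round-trips the parser together with its per-host state:

  require "robots"

  CACHE = "robots.cache"

  # Reuse a previously marshalled parser when available.
  robots = if File.exist?(CACHE)
    File.open(CACHE, "rb") { |f| Marshal.load(f) }
  else
    Robots.new("Some User Agent")
  end

  robots.allowed?("http://www.yelp.com/foo")  # fetches and caches yelp.com rules

  # Persist the parsed state for the next run.
  File.open(CACHE, "wb") { |f| Marshal.dump(robots, f) }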
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
+require 'rubygems'
+require 'rake'
+
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "robots"
+    gem.summary = "Simple robots.txt parser"
+    gem.description = "It parses robots.txt files"
+    gem.email = "kyle@kylemaxwell.com"
+    gem.homepage = "http://github.com/fizx/robots"
+    gem.authors = ["Kyle Maxwell"]
+    gem.add_development_dependency "thoughtbot-shoulda"
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+
+task :test => :check_dependencies
+
+task :default => :test
+
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  if File.exist?('VERSION')
+    version = File.read('VERSION')
+  else
+    version = ""
+  end
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "robots #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.7.3
data/lib/robots.rb
ADDED
@@ -0,0 +1,133 @@
+require "open-uri"
+require "uri"
+require "rubygems"
+require "timeout"
+
+class Robots
+
+  DEFAULT_TIMEOUT = 3
+
+  class ParsedRobots
+
+    def initialize(uri, user_agent)
+      @last_accessed = Time.at(1)
+
+      io = nil
+      begin
+        Timeout::timeout(Robots.timeout) do
+          io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+        end
+      rescue Timeout::Error
+        STDERR.puts "robots.txt request timed out"
+      end
+
+
+      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+        io = StringIO.new("User-agent: *\nAllow: /\n")
+      end
+
+      @other = {}
+      @disallows = {}
+      @allows = {}
+      @delays = {} # added delays to make it work
+      agent = /.*/
+      io.each do |line|
+        next if line =~ /^\s*(#.*|$)/
+        arr = line.split(":")
+        key = arr.shift
+        value = arr.join(":").strip
+        value.strip!
+        case key
+        when "User-agent"
+          agent = to_regex(value)
+        when "Allow"
+          @allows[agent] ||= []
+          @allows[agent] << to_regex(value)
+        when "Disallow"
+          @disallows[agent] ||= []
+          @disallows[agent] << to_regex(value)
+        when "Crawl-delay"
+          @delays[agent] = value.to_i
+        else
+          @other[key] = value
+        end
+      end
+
+      @parsed = true
+    end
+
+    def allowed?(uri, user_agent)
+      return true unless @parsed
+      allowed = true
+      path = uri.request_uri
+
+      @disallows.each do |key, value|
+        if user_agent =~ key
+          value.each do |rule|
+            if path =~ rule
+              allowed = false
+            end
+          end
+        end
+      end
+
+      @allows.each do |key, value|
+        unless allowed
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = true
+              end
+            end
+          end
+        end
+      end
+
+      if allowed && @delays[user_agent]
+        sleep @delays[user_agent] - (Time.now - @last_accessed)
+        @last_accessed = Time.now
+      end
+
+      return allowed
+    end
+
+    def other_values
+      @other
+    end
+
+    protected
+
+    def to_regex(pattern)
+      pattern = Regexp.escape(pattern)
+      pattern.gsub!(Regexp.escape("*"), ".*")
+      Regexp.compile("^#{pattern}")
+    end
+  end
+
+  def self.timeout=(t)
+    @timeout = t
+  end
+
+  def self.timeout
+    @timeout || DEFAULT_TIMEOUT
+  end
+
+  def initialize(user_agent)
+    @user_agent = user_agent
+    @parsed = {}
+  end
+
+  def allowed?(uri)
+    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+    host = uri.host
+    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+    @parsed[host].allowed?(uri, @user_agent)
+  end
+
+  def other_values(uri)
+    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+    host = uri.host
+    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+    @parsed[host].other_values
+  end
+end
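Two details above are worth spelling out. ParsedRobots#to_regex escapes every character except "*", which becomes ".*", and anchors the pattern at the start of the request path (query string included). A sketch of the same steps:

  pattern = Regexp.escape("/mail*")                 # => "/mail\\*"
  pattern = pattern.gsub(Regexp.escape("*"), ".*")  # => "/mail.*"
  rule = Regexp.compile("^#{pattern}")              # => /^\/mail.*/

  rule =~ "/mail?foo=bar"  # => 0, so a Disallow with this pattern applies
  rule =~ "/foo"           # => nil, the rule does not apply

Separately, the fetch timeout is configurable per process: setting Robots.timeout = 10 before the first lookup overrides DEFAULT_TIMEOUT.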
data/robots.gemspec
ADDED
@@ -0,0 +1,50 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{robots}
+  s.version = "0.7.3"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Kyle Maxwell"]
+  s.date = %q{2009-10-18}
+  s.description = %q{It parses robots.txt files}
+  s.email = %q{kyle@kylemaxwell.com}
+  s.extra_rdoc_files = [
+    "README"
+  ]
+  s.files = [
+    ".gitignore",
+    "CHANGELOG",
+    "README",
+    "Rakefile",
+    "VERSION",
+    "lib/robots.rb",
+    "robots.gemspec",
+    "test/fixtures/robots1.txt",
+    "test/test_robots.rb"
+  ]
+  s.homepage = %q{http://github.com/fizx/robots}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.4}
+  s.summary = %q{Simple robots.txt parser}
+  s.test_files = [
+    "test/test_robots.rb"
+  ]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+    else
+      s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+  end
+end
data/test/fixtures/robots1.txt
ADDED
File without changes
data/test/test_robots.rb
ADDED
@@ -0,0 +1,45 @@
+#!/usr/bin/env ruby
+require "test/unit"
+require File.dirname(__FILE__) + "/../lib/robots"
+
+module Kernel
+  alias_method :open_old, :open
+
+  def set_open(key, value)
+    @fake_open_values ||= {}
+    @fake_open_values[key] = value
+  end
+
+  def open(*args)
+    @fake_open_values ||= {}
+    @fake_open_values[args.first] || open_old(*args)
+  end
+end
+
+class TestRobots < Test::Unit::TestCase
+  def setup
+    @robots = Robots.new "Ruby-Robot.txt Parser Test Script"
+  end
+
+  def test_allowed_if_no_robots
+    assert @robots.allowed?("http://www.yahoo.com")
+  end
+
+  def test_reddit
+    assert @robots.allowed?("http://reddit.com")
+  end
+
+  def test_other
+    assert @robots.allowed?("http://www.yelp.com/foo")
+    assert !@robots.allowed?("http://www.yelp.com/mail?foo=bar")
+  end
+
+  def test_site_with_disallowed
+    assert @robots.allowed?("http://www.google.com/")
+  end
+
+  def test_other_values
+    sitemap = {"Sitemap" => "http://www.eventbrite.com/sitemap_index.xml"}
+    assert_equal(sitemap, @robots.other_values("http://eventbrite.com"))
+  end
+end
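The suite above exercises live sites, but each host is fetched at most once per Robots instance, since allowed? memoizes one ParsedRobots per host. A short sketch of that behavior (network access assumed):

  robots = Robots.new("Ruby-Robot.txt Parser Test Script")
  robots.allowed?("http://www.google.com/")        # fetches and parses robots.txt
  robots.allowed?("http://www.google.com/search")  # reuses the cached parse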
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: robots
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.7.3
 platform: ruby
 authors:
 - Kyle Maxwell
@@ -9,27 +9,44 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2009-10-
+date: 2009-10-18 00:00:00 -07:00
 default_executable:
-dependencies:
-
-
-
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: thoughtbot-shoulda
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: It parses robots.txt files
+email: kyle@kylemaxwell.com
 executables: []

 extensions: []

-extra_rdoc_files:
-
-files:
-
+extra_rdoc_files:
+- README
+files:
+- .gitignore
+- CHANGELOG
+- README
+- Rakefile
+- VERSION
+- lib/robots.rb
+- robots.gemspec
+- test/fixtures/robots1.txt
+- test/test_robots.rb
 has_rdoc: true
-homepage:
+homepage: http://github.com/fizx/robots
 licenses: []

 post_install_message:
-rdoc_options:
-
+rdoc_options:
+- --charset=UTF-8
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
@@ -50,6 +67,6 @@ rubyforge_project:
 rubygems_version: 1.3.4
 signing_key:
 specification_version: 3
-summary:
-test_files:
-
+summary: Simple robots.txt parser
+test_files:
+- test/test_robots.rb