robotstxt 0.5.0
- data/README.rdoc +45 -0
- data/lib/robotstxt.rb +55 -0
- data/lib/robotstxt/robotstxtistance.rb +176 -0
- data/test/robotstxt_test.rb +19 -0
- data/test/robotstxtistance_test.rb +43 -0
- metadata +58 -0
data/README.rdoc
ADDED
@@ -0,0 +1,45 @@
= Robotstxt

Robotstxt is a Ruby robots.txt file parser.

Robotstxt lets you check whether a given user-agent may crawl a URL and retrieve other data from a site's robots.txt file.

Full support for the robots.txt RFC, wildcards and Sitemap: rules.


== Features

* Check if a URL is allowed to be crawled by your robot
* Analyze the robots.txt file and return an Array of the XML Sitemap URLs it declares


== Requirements

* Ruby >= 1.8.7


== Installation

This library is intended to be installed via the
RubyGems[http://rubyforge.org/projects/rubygems/] system.

  $ gem install robotstxt

You might need administrator privileges on your system to install it.


== Author

Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>


== Resources

* {Homepage}[http://www.simonerinzivillo.it/]


== License

Copyright (c) 2009 Simone Rinzivillo. Robotstxt is released under the MIT license.

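The README does not include a usage snippet; based on the library code later in this diff, basic usage looks roughly like the sketch below (the URL and robot name are placeholders):

  require 'rubygems'
  require 'robotstxt'

  # true/false when robots.txt permits or blocks the URL for this user-agent,
  # nil when the file could not be fetched (see lib/robotstxt.rb below).
  Robotstxt.allowed?('http://www.example.com/page.html', 'my-robot')

  # Array of Sitemap URLs declared in the site's robots.txt.
  Robotstxt.sitemaps('http://www.example.com/', 'my-robot')
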
data/lib/robotstxt.rb
ADDED
@@ -0,0 +1,55 @@
#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
# Category::   Net
# Package::    Robotstxt
# Author::     Simone Rinzivillo <srinzivillo@gmail.com>
# License::    MIT License
#
#--
#
#++


require 'robotstxt/robotstxtistance'
require 'uri'


module Robotstxt

  NAME    = 'Robotstxt'
  GEM     = 'robotstxt'
  AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
  VERSION = '0.5.0'

  # Checks if the <tt>url</tt> is allowed to be crawled by the given <tt>robot_id</tt>.
  # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block access to the URL.
  #
  #   Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.allowed?(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
  end

  # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
  #
  #   Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.sitemaps(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.sitemaps if r.get(u.scheme + '://' + u.host)
  end

end

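Both helpers return a result only when the robots.txt file could be fetched; if the request fails or the file is missing, the guarded return never fires and the method returns nil. A minimal, hedged sketch of how a caller might distinguish the three outcomes (the URL and robot name are placeholders):

  require 'rubygems'
  require 'robotstxt'

  result = Robotstxt.allowed?('http://www.example.com/page.html', 'my-robot')

  case result
  when true  then puts 'crawling allowed'
  when false then puts 'blocked by robots.txt'
  when nil   then puts 'robots.txt missing or could not be fetched'
  end
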
data/lib/robotstxt/robotstxtistance.rb
ADDED
@@ -0,0 +1,176 @@

#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
# Category::   Net
# Package::    Robotstxt
# Author::     Simone Rinzivillo <srinzivillo@gmail.com>
# License::    MIT License
#
#--
#
#++

require 'rubygems'
require 'net/http'
require 'uri'


module Robotstxt
  class Robotstxtistance
    attr_accessor :robot_id
    attr_reader :found, :body, :sitemaps, :rules

    # Initializes a new Robotstxt::Robotstxtistance with the <tt>robot_id</tt> option.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #
    def initialize(robot_id = nil)
      @robot_id = '*'
      @rules    = []
      @sitemaps = []
      @robot_id = robot_id.downcase unless robot_id.nil?
    end

    # Requests and parses the robots.txt file for the <tt>hostname</tt>.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   client.get('http://www.simonerinzivillo.it')
    #
    # This method returns <tt>true</tt> if the file was fetched and parsed successfully.
    #
    def get(hostname)
      @ehttp = true
      url = URI.parse(hostname)

      begin
        http = Net::HTTP.new(url.host, url.port)
        if url.scheme == 'https'
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.use_ssl = true
        end

        response = http.request(Net::HTTP::Get.new('/robots.txt'))

        case response
        when Net::HTTPSuccess
          @found = true
          @body  = response.body
          parse()
        else
          @found = false
        end

        return @found

      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
        if @ehttp
          # Retry the request once, then give up.
          @ehttp = false
          retry
        else
          return nil
        end
      end
    end

    # Checks if the <tt>URL</tt> is allowed to be crawled by the current robot_id.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    #   end
    #
    # This method returns <tt>true</tt> if the robots.txt file does not block access to the URL.
    #
    def allowed?(var)
      is_allow = true
      url = URI.parse(var)
      querystring = url.query.nil? ? '' : '?' + url.query
      url_path = url.path + querystring

      @rules.each do |ua|
        if @robot_id == ua[0] || ua[0] == '*'
          ua[1].each do |d|
            is_allow = false if url_path.match('^' + d) || d == '/'
          end
        end
      end
      return is_allow
    end

    # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.sitemaps.each { |url| puts url }
    #   end
    #
    def sitemaps()
      return @sitemaps
    end

    # This method returns <tt>true</tt> if the robots.txt file was found and parsed.
    #
    def found?()
      return @found
    end


    private

    def parse()
      @body = @body.downcase

      @body.each_line do |r|
        case r
        when /^#.+$/
          # Skip comment lines.
        when /^\s*user-agent\s*:.+$/
          # Each rule entry is [user-agent, disallow patterns, allow patterns].
          @rules << [r.split(':')[1].strip, [], []]
        when /^\s*disallow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[1] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*allow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[2] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*sitemap\s*:.+$/
          # Re-join the scheme and the rest of the URL around the ':' lost by split.
          @sitemaps << r.split(':')[1].strip + ':' + r.split(':')[2].strip if r.length > 0
        end
      end
    end

  end
end

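To make the data structures above concrete, here is a hedged sketch (not part of the gem) that feeds a small, invented robots.txt body straight to the private parse method and inspects the resulting rules and sitemaps:

  require 'rubygems'
  require 'robotstxt'

  # Hypothetical robots.txt content, invented for illustration only.
  body = "User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /*.php\nSitemap: http://www.example.com/sitemap.xml\n"

  client = Robotstxt::Robotstxtistance.new('rubytest')

  # parse() is private and normally runs after get(); we bypass the HTTP
  # request here only to show what the parser builds.
  client.instance_variable_set(:@body, body)
  client.send(:parse)

  p client.rules     # => [["rubytest", ["/no-dir/", "/.+.php"], []]]
  p client.sitemaps  # => ["http://www.example.com/sitemap.xml"]
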
data/test/robotstxt_test.rb
ADDED
@@ -0,0 +1,19 @@

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxt < Test::Unit::TestCase

  def test_allowed
    assert true  == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
    assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
  end

  def test_sitemaps
    assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
  end

end

data/test/robotstxtistance_test.rb
ADDED
@@ -0,0 +1,43 @@

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxtistance < Test::Unit::TestCase

  def setup
    @client = Robotstxt::Robotstxtistance.new('rubytest')
    @client.get('http://www.simonerinzivillo.it')
  end

  def test_initialize
    client = Robotstxt::Robotstxtistance.new('*')
    assert_instance_of Robotstxt::Robotstxtistance, client
  end

  def test_get_file_robotstxt
    assert @client.get('http://www.simonerinzivillo.it')
  end

  def test_robotstxt_isfound
    assert @client.found?
  end

  def test_url_allowed
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/page.html')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/page.php?var=0')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php?var=0')
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/blog/')
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php?var=0')
  end

  def test_sitemaps
    assert @client.sitemaps.length > 0
  end

end

metadata
ADDED
@@ -0,0 +1,58 @@
--- !ruby/object:Gem::Specification
name: robotstxt
version: !ruby/object:Gem::Version
  version: 0.5.0
platform: ruby
authors:
- Simone Rinzivillo
autorequire:
bindir: bin
cert_chain: []

date: 2009-12-06 00:00:00 +01:00
default_executable:
dependencies: []

description:
email: srinzivillo@gmail.com
executables: []

extensions: []

extra_rdoc_files:
- README.rdoc
files:
- lib/robotstxt/robotstxtistance.rb
- lib/robotstxt.rb
- README.rdoc
has_rdoc: true
homepage: http://www.simonerinzivillo.it
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 1.8.7
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.3.5
signing_key:
specification_version: 3
summary: Robotstxt is a Ruby robots.txt file parser
test_files:
- test/robotstxt_test.rb
- test/robotstxtistance_test.rb