robotstxt 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +45 -0
- data/lib/robotstxt.rb +55 -0
- data/lib/robotstxt/robotstxtistance.rb +176 -0
- data/test/robotstxt_test.rb +19 -0
- data/test/robotstxtistance_test.rb +43 -0
- metadata +58 -0
data/README.rdoc
ADDED
@@ -0,0 +1,45 @@
= Robotstxt

Robotstxt is a Ruby robots.txt file parser.

Robotstxt Parser allows you to check the accessibility of URLs and get other data.

Full support for the robots.txt RFC, wildcards and Sitemap: rules.


== Features

* Check whether a URL is allowed to be crawled by your robot
* Analyze the robots.txt file to return an Array containing the list of XML Sitemap URLs

== Requirements

* Ruby >= 1.8.7


== Installation

This library is intended to be installed via the
RubyGems[http://rubyforge.org/projects/rubygems/] system.

  $ gem install robotstxt

You might need administrator privileges on your system to install it.


== Author

Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>

== Resources

* {Homepage}[http://www.simonerinzivillo.it/]


== License

Copyright (c) 2009 Simone Rinzivillo. Robotstxt is released under the MIT license.
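The README stops short of a usage example; going by the public helpers added in data/lib/robotstxt.rb below, a minimal session would look roughly like the following sketch (example.com and the robot id are placeholders, not values from the package):

  require 'rubygems'
  require 'robotstxt'

  # Ask whether a given robot id may fetch a URL.
  if Robotstxt.allowed?('http://www.example.com/', 'my-crawler')
    puts 'crawling allowed'
  end

  # Collect the Sitemap URLs declared in the site's robots.txt.
  sitemaps = Robotstxt.sitemaps('http://www.example.com/', 'my-crawler')
  sitemaps.each { |url| puts url } if sitemaps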
data/lib/robotstxt.rb
ADDED
@@ -0,0 +1,55 @@
#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
#
# Category::    Net
# Package::     Robotstxt
# Author::      Simone Rinzivillo <srinzivillo@gmail.com>
# License::     MIT License
#
#--
#
#++


require 'robotstxt/robotstxtistance'
require 'uri'


module Robotstxt

  NAME    = 'Robotstxt'
  GEM     = 'robotstxt'
  AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
  VERSION = '0.5.0'

  # Checks whether the <tt>url</tt> is allowed to be crawled by the given <tt>robot_id</tt>.
  # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block access to the URL.
  #
  # <tt>Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')</tt>
  #
  def self.allowed?(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
  end

  # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
  #
  # <tt>Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')</tt>
  #
  def self.sitemaps(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.sitemaps if r.get(u.scheme + '://' + u.host)
  end

end
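Note that both helpers return the guarded expression only when r.get succeeds; if robots.txt cannot be fetched they fall through and return nil rather than true or false. A caller that wants an explicit fail-open policy could wrap them like this (a sketch; the policy and the crawlable? helper are assumptions, not part of the gem):

  require 'robotstxt'

  # Hypothetical wrapper: treat an unreachable robots.txt as crawlable,
  # since Robotstxt.allowed? returns nil (not false) in that case.
  def crawlable?(url, robot_id)
    result = Robotstxt.allowed?(url, robot_id)
    result.nil? ? true : result
  end

  puts crawlable?('http://www.example.com/some/page', 'my-crawler')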
data/lib/robotstxt/robotstxtistance.rb
ADDED
@@ -0,0 +1,176 @@
#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
#
# Category::    Net
# Package::     Robotstxt
# Author::      Simone Rinzivillo <srinzivillo@gmail.com>
# License::     MIT License
#
#--
#
#++

require 'rubygems'
require 'net/http'
require 'net/https'   # needed for the https branch below (use_ssl, OpenSSL constants)
require 'uri'


module Robotstxt
  class Robotstxtistance
    attr_accessor :robot_id
    attr_reader :found, :body, :sitemaps, :rules

    # Initializes a new Robotstxt::Robotstxtistance with the <tt>robot_id</tt> option.
    #
    # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
    #
    def initialize(robot_id = nil)
      @robot_id = '*'
      @rules = []
      @sitemaps = []
      @robot_id = robot_id.downcase if !robot_id.nil?
    end


    # Requests and parses the robots.txt file for the <tt>hostname</tt>.
    #
    # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
    #
    # <tt>client.get('http://www.simonerinzivillo.it')</tt>
    #
    # This method returns <tt>true</tt> if the file was fetched and parsed successfully.
    #
    def get(hostname)
      @ehttp = true
      url = URI.parse(hostname)

      begin
        http = Net::HTTP.new(url.host, url.port)
        if url.scheme == 'https'
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.use_ssl = true
        end

        response = http.request(Net::HTTP::Get.new('/robots.txt'))

        case response
        when Net::HTTPSuccess then
          @found = true
          @body = response.body
          parse()
        else
          @found = false
        end

        return @found

      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
        if @ehttp
          @ehttp = false   # retry the request once, then give up
          retry
        else
          return nil
        end
      end
    end


    # Checks whether the <tt>URL</tt> is allowed to be crawled by the current robot_id.
    #
    # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
    #
    # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
    #
    # <tt>  client.allowed?('http://www.simonerinzivillo.it/no-dir/')</tt>
    #
    # <tt>end</tt>
    #
    # This method returns <tt>true</tt> if the robots.txt file does not block access to the URL.
    #
    def allowed?(var)
      is_allow = true
      url = URI.parse(var)
      querystring = (!url.query.nil?) ? '?' + url.query : ''
      url_path = url.path + querystring

      @rules.each { |ua|
        if @robot_id == ua[0] || ua[0] == '*'
          ua[1].each { |d|
            is_allow = false if url_path.match('^' + d) || d == '/'
          }
        end
      }
      return is_allow
    end

    # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
    #
    # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
    #
    # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
    #
    # <tt>  client.sitemaps.each { |url| puts url }</tt>
    #
    # <tt>end</tt>
    #
    def sitemaps()
      return @sitemaps
    end

    # This method returns <tt>true</tt> if the robots.txt file was fetched and parsed successfully.
    #
    def found?()
      return @found
    end


    private

    def parse()
      @body = @body.downcase

      @body.each_line { |r|
        case r
        when /^#.+$/
          # comment line, ignore
        when /^\s*user-agent\s*:.+$/
          @rules << [r.split(':')[1].strip, [], []]
        when /^\s*disallow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[1] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*allow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[2] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*sitemap\s*:.+$/
          # rejoin the scheme and the rest of the URL around the ':' dropped by split
          @sitemaps << r.split(':')[1].strip + ':' + r.split(':')[2].strip if r.length > 0
        end
      }
    end

  end
end
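parse stores one entry per User-agent group as [agent, disallows, allows] and rewrites each * in a path to the regexp fragment .+; allowed? then anchors that pattern at the start of the request path. The translation is easy to reproduce in isolation (the sample rule below is made up for illustration, not taken from any real robots.txt):

  # A Disallow value as it looks after parse(): '*' has become '.+'.
  disallow = '/private/*.php'.gsub(/\*/, '.+')    # => "/private/.+.php"

  # allowed? anchors the pattern at the beginning of the URL path.
  puts '/private/page.php'.match('^' + disallow) ? 'blocked' : 'allowed'   # blocked
  puts '/public/index.php'.match('^' + disallow) ? 'blocked' : 'allowed'   # allowed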
data/test/robotstxt_test.rb
ADDED
@@ -0,0 +1,19 @@
$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxt < Test::Unit::TestCase


  def test_allowed
    assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
    assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
  end

  def test_sitemaps
    assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
  end


end
data/test/robotstxtistance_test.rb
ADDED
@@ -0,0 +1,43 @@
$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxtistance < Test::Unit::TestCase

  def setup
    @client = Robotstxt::Robotstxtistance.new('rubytest')
    @client.get('http://www.simonerinzivillo.it')
  end

  def test_initialize
    client = Robotstxt::Robotstxtistance.new('*')
    assert_instance_of Robotstxt::Robotstxtistance, client
  end

  def test_get_file_robotstxt
    assert @client.get('http://www.simonerinzivillo.it')
  end

  def test_robotstxt_isfound
    assert @client.found?()
  end

  def test_url_allowed
    assert true == @client.allowed?('http://www.simonerinzivillo.it/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/page.html')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/page.php?var=0')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php?var=0')
    assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/')
    assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php?var=0')
  end

  def test_sitemaps
    assert @client.sitemaps.length() > 0
  end

end
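Both test suites fetch http://www.simonerinzivillo.it over the network, so they need connectivity and depend on that site's live robots.txt. The gem does not appear to ship a Rakefile in this release; a minimal one to drive the two files above could look like this (a sketch, not part of the package):

  # Rakefile (hypothetical) - run the suites with `rake test`
  require 'rake/testtask'

  Rake::TestTask.new(:test) do |t|
    t.libs << 'lib' << 'test'        # make lib/robotstxt.rb requirable
    t.pattern = 'test/*_test.rb'     # picks up both *_test.rb files above
    t.verbose = true
  end

  task :default => :test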
metadata
ADDED
@@ -0,0 +1,58 @@
--- !ruby/object:Gem::Specification
name: robotstxt
version: !ruby/object:Gem::Version
  version: 0.5.0
platform: ruby
authors:
- Simone Rinzivillo
autorequire:
bindir: bin
cert_chain: []

date: 2009-12-06 00:00:00 +01:00
default_executable:
dependencies: []

description:
email: srinzivillo@gmail.com
executables: []

extensions: []

extra_rdoc_files:
- README.rdoc
files:
- lib/robotstxt/robotstxtistance.rb
- lib/robotstxt.rb
- README.rdoc
has_rdoc: true
homepage: http://www.simonerinzivillo.it
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 1.8.7
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.3.5
signing_key:
specification_version: 3
summary: Robotstxt is a Ruby robots.txt file parser
test_files:
- test/robotstxt_test.rb
- test/robotstxtistance_test.rb