robotstxt 0.5.0

README.rdoc ADDED
@@ -0,0 +1,45 @@
+ = Robotstxt
+
+ Robotstxt is a Ruby robots.txt file parser.
+
+ Robotstxt Parser allows you to check the accessibility of URLs and to retrieve other data, such as Sitemap URLs.
+
+ Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+
+
+ == Features
+
+ * Check if a URL is allowed to be crawled by your robot
+ * Analyze the robots.txt file to return an Array containing the list of XML Sitemap URLs
+
+ == Requirements
+
+ * Ruby >= 1.8.7
+
+
+ == Installation
+
+ This library is intended to be installed via the
+ RubyGems[http://rubyforge.org/projects/rubygems/] system.
+
+   $ gem install robotstxt
+
+ You might need administrator privileges on your system to install it.
+
+
+
+ == Author
+
+ Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
+
+
+ == Resources
+
+ * {Homepage}[http://www.simonerinzivillo.it/]
+
+
+
+ == License
+
+ Copyright (c) 2009 Simone Rinzivillo. Robotstxt is released under the MIT license.
+
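A minimal usage sketch based on the module-level API in this release (the host and robot id below are placeholders, not part of the gem):

  require 'rubygems'
  require 'robotstxt'

  # true if robots.txt on the target host does not block this URL for the given robot id,
  # nil if the robots.txt file could not be fetched
  if Robotstxt.allowed?('http://www.example.com/', 'my-robot')
    puts 'crawling allowed'
  end

  sitemaps = Robotstxt.sitemaps('http://www.example.com/', 'my-robot')
  # sitemaps is the Array of Sitemap: URLs from robots.txt, or nil if the file could not be fetched
  (sitemaps || []).each { |url| puts url }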
lib/robotstxt.rb ADDED
@@ -0,0 +1,55 @@
+ #
+ # = Ruby Robotstxt
+ #
+ # A Ruby robots.txt parser.
+ #
+ #
+ # Category:: Net
+ # Package:: Robotstxt
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+
+ require 'robotstxt/robotstxtistance'
+ require 'uri'
+
+
+
+ module Robotstxt
+
+   NAME = 'Robotstxt'
+   GEM = 'robotstxt'
+   AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
+   VERSION = '0.5.0'
+
+
+   # Checks whether the <tt>url</tt> is allowed to be crawled by the given <tt>robot_id</tt>.
+   # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block access to the URL.
+   #
+   # <tt>Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')</tt>
+   #
+   def self.allowed?(url, robot_id)
+     allowed = false
+     u = URI.parse(url)
+     r = Robotstxt::Robotstxtistance.new(robot_id)
+     return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
+
+   end
+
+   # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
+   #
+   # <tt>Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')</tt>
+   #
+   def self.sitemaps(url, robot_id)
+     allowed = false
+     u = URI.parse(url)
+     r = Robotstxt::Robotstxtistance.new(robot_id)
+     return r.sitemaps if r.get(u.scheme + '://' + u.host)
+
+   end
+
+ end
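For reference, the helpers above are thin wrappers around Robotstxt::Robotstxtistance; an equivalent instance-level flow (the host and robot id are placeholders) looks roughly like this:

  require 'rubygems'
  require 'robotstxt'

  client = Robotstxt::Robotstxtistance.new('my-robot')

  # get() downloads and parses http://host/robots.txt and returns true on success
  if client.get('http://www.example.com')
    puts client.allowed?('http://www.example.com/some/path')   # => true or false
    client.sitemaps.each { |url| puts url }
  end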
lib/robotstxt/robotstxtistance.rb ADDED
@@ -0,0 +1,176 @@
+ #
+ # = Ruby Robotstxt
+ #
+ # A Ruby robots.txt parser.
+ #
+ #
+ # Category:: Net
+ # Package:: Robotstxt
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+ require 'rubygems'
+ require 'net/http'
+ require 'uri'
+
+
21
+ module Robotstxt
+   class Robotstxtistance
+     attr_accessor :robot_id
+     attr_reader :found, :body, :sitemaps, :rules
+
+     # Initializes a new Robotstxt::Robotstxtistance with the <tt>robot_id</tt> option.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     def initialize(robot_id = nil)
+       @robot_id = '*'
+       @rules = []
+       @sitemaps = []
+       @robot_id = robot_id.downcase if !robot_id.nil?
+
+     end
+
+
+     # Retrieves and parses the robots.txt file for the <tt>hostname</tt>.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     #
+     # This method returns <tt>true</tt> if the robots.txt file was fetched and parsed successfully.
+     #
+     def get(hostname)
+       @ehttp = true
+       url = URI.parse(hostname)
+
+       begin
+         http = Net::HTTP.new(url.host, url.port)
+         if url.scheme == 'https'
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+           http.use_ssl = true
+         end
+
+         response = http.request(Net::HTTP::Get.new('/robots.txt'))
+
+         case response
+         when Net::HTTPSuccess then
+           @found = true
+           @body = response.body
+           parse()
+
+         else
+           @found = false
+         end
+
+         return @found
+
+       rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
+         if @ehttp
+           @ehttp = false
+           retry
+         else
+           return nil
+         end
+       end
+     end
+
+
84
+     # Checks whether the <tt>URL</tt> is allowed to be crawled by the current robot_id.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     # <tt>  client.allowed?('http://www.simonerinzivillo.it/no-dir/')</tt>
+     #
+     # <tt>end</tt>
+     #
+     # This method returns <tt>true</tt> if the robots.txt file does not block access to the URL.
+     #
+     def allowed?(var)
+       is_allow = true
+       url = URI.parse(var)
+       querystring = (!url.query.nil?) ? '?' + url.query : ''
+       url_path = url.path + querystring
+       @rules.each {|ua|
+
+         if @robot_id == ua[0] || ua[0] == '*'
+
+           ua[1].each {|d|
+
+             is_allow = false if url_path.match('^' + d) || d == '/'
+
+           }
+
+         end
+
+       }
+       return is_allow
+     end
+
+     # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     # <tt>  client.sitemaps.each{ |url|</tt>
+     #
+     # <tt>    puts url</tt>
+     #
+     #
+     # <tt>  }</tt>
+     #
+     # <tt>end</tt>
+     #
+     def sitemaps()
+       return @sitemaps
+     end
+
+     # This method returns <tt>true</tt> if the robots.txt file was found and parsed.
+     #
+     def found?()
+       return @found
+     end
+
+
143
+     private
+
+     def parse()
+       @body = @body.downcase
+
+       @body.each_line {|r|
+
+         case r
+         when /^#.+$/
+
+         when /^\s*user-agent\s*:.+$/
+
+           @rules << [r.split(':')[1].strip, [], []]
+
+         when /^\s*disallow\s*:.+$/
+           r = r.split(':')[1].strip
+           @rules.last[1] << r.gsub(/\*/, '.+') if r.length > 0
+
+         when /^\s*allow\s*:.+$/
+           r = r.split(':')[1].strip
+           @rules.last[2] << r.gsub(/\*/, '.+') if r.length > 0
+
+         when /^\s*sitemap\s*:.+$/
+           @sitemaps << r.split(':', 2)[1].strip if r.length > 0
+
+         end
+
+       }
+
+
+     end
+
+   end
+ end
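As the parse and allowed? methods above show, a * wildcard in a Disallow value is rewritten to the regexp fragment .+ and the resulting pattern is matched against the URL path anchored at the start. A small standalone sketch of that matching step (the rule and paths are made-up examples, not part of the gem):

  # hypothetical Disallow value taken from a robots.txt line
  rule = '/private-*'.gsub(/\*/, '.+')            # => "/private-.+"

  ['/private-data/page.html', '/public/page.html'].each do |path|
    blocked = !path.match('^' + rule).nil? || rule == '/'
    puts "#{path} => #{blocked ? 'blocked' : 'allowed'}"
  end
  # prints:
  #   /private-data/page.html => blocked
  #   /public/page.html => allowed

Note that parse stores Allow rules as well, but in this release allowed? only consults the Disallow patterns.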
test/robotstxt_test.rb ADDED
@@ -0,0 +1,19 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'robotstxt'
+
+ class TestRobotstxt < Test::Unit::TestCase
+
+
+   def test_allowed
+     assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
+     assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
+   end
+
+   def test_sitemaps
+     assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
+   end
+
+
+ end
test/robotstxtistance_test.rb ADDED
@@ -0,0 +1,43 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'robotstxt'
+
+ class TestRobotstxtistance < Test::Unit::TestCase
+
+   def setup
+     @client = Robotstxt::Robotstxtistance.new('rubytest')
+     @client.get('http://www.simonerinzivillo.it')
+   end
+
+   def test_initialize
+     client = Robotstxt::Robotstxtistance.new('*')
+     assert_instance_of Robotstxt::Robotstxtistance, client
+   end
+
+   def test_get_file_robotstxt
+     assert @client.get('http://www.simonerinzivillo.it')
+   end
+
+   def test_robotstxt_isfound
+     assert @client.found?()
+   end
+
+   def test_url_allowed
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/page.html')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/page.php?var=0')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php?var=0')
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/')
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php?var=0')
+   end
+
+   def test_sitemaps
+     assert @client.sitemaps.length() > 0
+   end
+
+ end
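The directives recognized by the parser are User-agent, Disallow, Allow and Sitemap, so the tests above exercise input of roughly this shape (an illustrative robots.txt only; the actual file hosted on the test domain is not part of this gem):

  # illustrative example, not the real file
  User-agent: rubytest
  Disallow: /no-dir/
  Disallow: /foo-no-dir*
  Disallow: /dir/
  Allow: /blog/

  Sitemap: http://www.example.com/sitemap.xml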
metadata ADDED
@@ -0,0 +1,58 @@
+ --- !ruby/object:Gem::Specification
+ name: robotstxt
+ version: !ruby/object:Gem::Version
+   version: 0.5.0
+ platform: ruby
+ authors:
+ - Simone Rinzivillo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-12-06 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description:
+ email: srinzivillo@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - lib/robotstxt/robotstxtistance.rb
+ - lib/robotstxt.rb
+ - README.rdoc
+ has_rdoc: true
+ homepage: http://www.simonerinzivillo.it
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.8.7
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Robotstxt is a Ruby robots.txt file parser
+ test_files:
+ - test/robotstxt_test.rb
+ - test/robotstxtistance_test.rb