robotstxt 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
README.rdoc ADDED
@@ -0,0 +1,45 @@
+ = Robotstxt
+
+ Robotstxt is a Ruby robots.txt file parser.
+
+ Robotstxt lets you check whether a URL may be crawled and extract other data, such as Sitemap URLs, from a site's robots.txt file.
+
+ Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+
+
+ == Features
+
+ * Check if a URL is allowed to be crawled by your robot (a usage sketch follows this README)
+ * Analyze the robots.txt file to return an Array containing the list of XML Sitemap URLs
+
+ == Requirements
+
+ * Ruby >= 1.8.7
+
+
+ == Installation
+
+ This library is intended to be installed via the
+ RubyGems[http://rubyforge.org/projects/rubygems/] system.
+
+   $ gem install robotstxt
+
+ You might need administrator privileges on your system to install it.
+
+
+
+ == Author
+
+ Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
+
+
+ == Resources
+
+ * {Homepage}[http://www.simonerinzivillo.it/]
+
+
+
+ == License
+
+ Copyright (c) 2009 Simone Rinzivillo. Robotstxt is released under the MIT license.
+
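The README stops short of a usage example. Based on the module-level API defined in lib/robotstxt.rb (next hunk), a minimal sketch looks like this; the robot id 'my_robot_id' and the example.com URLs are placeholders rather than values used by the gem:

    require 'rubygems'
    require 'robotstxt'

    # Ask whether the robot identified as 'my_robot_id' may fetch the URL.
    # Returns true when robots.txt does not block it, false when it does,
    # and nil when robots.txt itself cannot be retrieved.
    puts Robotstxt.allowed?('http://www.example.com/', 'my_robot_id')

    # List the Sitemap: URLs declared in the site's robots.txt; the guard
    # covers the nil returned when robots.txt cannot be fetched.
    (Robotstxt.sitemaps('http://www.example.com/', 'my_robot_id') || []).each do |url|
      puts url
    end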
lib/robotstxt.rb ADDED
@@ -0,0 +1,55 @@
+ #
+ # = Ruby Robotstxt
+ #
+ # A Ruby robots.txt parser.
+ #
+ #
+ # Category::  Net
+ # Package::   Robotstxt
+ # Author::    Simone Rinzivillo <srinzivillo@gmail.com>
+ # License::   MIT License
+ #
+ #--
+ #
+ #++
+
+
+ require 'robotstxt/robotstxtistance'
+ require 'uri'
+
+
+
+ module Robotstxt
+
+   NAME    = 'Robotstxt'
+   GEM     = 'robotstxt'
+   AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
+   VERSION = '0.5.0'
+
+
+   # Check if the <tt>url</tt> is allowed to be crawled by the current <tt>robot_id</tt>.
+   # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block access to the URL.
+   #
+   # <tt>Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')</tt>
+   #
+   def self.allowed?(url, robot_id)
+     u = URI.parse(url)
+     r = Robotstxt::Robotstxtistance.new(robot_id)
+     return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
+   end
+
+   # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemap URLs.
+   #
+   # <tt>Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')</tt>
+   #
+   def self.sitemaps(url, robot_id)
+     u = URI.parse(url)
+     r = Robotstxt::Robotstxtistance.new(robot_id)
+     return r.sitemaps if r.get(u.scheme + '://' + u.host)
+   end
+
+ end
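Each call to Robotstxt.allowed? or Robotstxt.sitemaps above creates a fresh Robotstxtistance and downloads /robots.txt again. When checking several URLs on one host it is cheaper to reuse a single instance, roughly as in the sketch below; the robot id, host and paths are placeholders:

    require 'rubygems'
    require 'robotstxt'

    # Reuse one Robotstxtistance so robots.txt is fetched only once per host.
    client = Robotstxt::Robotstxtistance.new('my_robot_id')

    if client.get('http://www.example.com')
      ['/', '/private/page.html', '/blog/'].each do |path|
        puts "#{path} allowed? #{client.allowed?('http://www.example.com' + path)}"
      end
      puts "Sitemaps: #{client.sitemaps.inspect}"
    end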
lib/robotstxt/robotstxtistance.rb ADDED
@@ -0,0 +1,176 @@
+ #
+ # = Ruby Robotstxt
+ #
+ # A Ruby robots.txt parser.
+ #
+ #
+ # Category::  Net
+ # Package::   Robotstxt
+ # Author::    Simone Rinzivillo <srinzivillo@gmail.com>
+ # License::   MIT License
+ #
+ #--
+ #
+ #++
+
+ require 'rubygems'
+ require 'net/http'
+ require 'net/https'  # SSL support, needed on Ruby 1.8 when robots.txt is fetched over https
+ require 'uri'
+
+
+ module Robotstxt
+   class Robotstxtistance
+     attr_accessor :robot_id
+     attr_reader :found, :body, :sitemaps, :rules
+
+     # Initializes a new Robotstxt::Robotstxtistance with the <tt>robot_id</tt> option.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     def initialize(robot_id = nil)
+       @robot_id = '*'
+       @rules = []
+       @sitemaps = []
+       @robot_id = robot_id.downcase unless robot_id.nil?
+     end
+
+
+     # Fetches and parses the robots.txt file for the <tt>hostname</tt>.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     #
+     # This method returns <tt>true</tt> if the robots.txt file was fetched and parsed successfully.
+     #
+     def get(hostname)
+       @ehttp = true
+       url = URI.parse(hostname)
+
+       begin
+         http = Net::HTTP.new(url.host, url.port)
+         if url.scheme == 'https'
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+           http.use_ssl = true
+         end
+
+         response = http.request(Net::HTTP::Get.new('/robots.txt'))
+
+         case response
+         when Net::HTTPSuccess then
+           @found = true
+           @body = response.body
+           parse()
+         else
+           @found = false
+         end
+
+         return @found
+
+       rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
+         if @ehttp
+           # retry the request once, then give up
+           @ehttp = false
+           retry
+         else
+           return nil
+         end
+       end
+     end
+
+
+     # Check if the <tt>url</tt> is allowed to be crawled by the current robot_id.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     # <tt>  client.allowed?('http://www.simonerinzivillo.it/no-dir/')</tt>
+     #
+     # <tt>end</tt>
+     #
+     # This method returns <tt>true</tt> if the robots.txt file does not block access to the URL.
+     #
+     def allowed?(var)
+       is_allow = true
+       url = URI.parse(var)
+       querystring = (!url.query.nil?) ? '?' + url.query : ''
+       url_path = url.path + querystring
+
+       @rules.each { |ua|
+         if @robot_id == ua[0] || ua[0] == '*'
+           # ua[1] holds the Disallow patterns, already translated to regexp fragments by parse()
+           ua[1].each { |d|
+             is_allow = false if url_path.match('^' + d) || d == '/'
+           }
+         end
+       }
+       return is_allow
+     end
+
+     # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemap URLs.
+     #
+     # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
+     #
+     # <tt>if client.get('http://www.simonerinzivillo.it')</tt>
+     #
+     # <tt>  client.sitemaps.each { |url|</tt>
+     #
+     # <tt>    puts url</tt>
+     #
+     # <tt>  }</tt>
+     #
+     # <tt>end</tt>
+     #
+     def sitemaps()
+       return @sitemaps
+     end
+
+     # This method returns <tt>true</tt> if the robots.txt file was found and parsed.
+     #
+     def found?()
+       return @found
+     end
+
+
+     private
+
+     def parse()
+       # Note: the whole body is downcased, so sitemap URLs are returned in lower case.
+       @body = @body.downcase
+
+       @body.each_line { |r|
+         case r
+         when /^#.+$/
+           # comment line, ignore
+         when /^\s*user-agent\s*:.+$/
+           # start a new group: [agent, disallow rules, allow rules]
+           @rules << [r.split(':')[1].strip, [], []]
+         when /^\s*disallow\s*:.+$/
+           r = r.split(':')[1].strip
+           @rules.last[1] << r.gsub(/\*/, '.+') if r.length > 0
+         when /^\s*allow\s*:.+$/
+           r = r.split(':')[1].strip
+           @rules.last[2] << r.gsub(/\*/, '.+') if r.length > 0
+         when /^\s*sitemap\s*:.+$/
+           # split on the first ':' only, so the '://' in the URL survives intact
+           @sitemaps << r.split(':', 2)[1].strip if r.length > 0
+         end
+       }
+     end
+
+   end
+ end
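For reference, the matching performed by parse and allowed? above boils down to turning each Disallow value into an anchored regular expression (* becomes .+) and testing it against the request path plus query string. A standalone sketch of that idea, using made-up rules and paths rather than anything shipped with the gem:

    # Illustrative only: mirrors the '*' -> '.+' translation and the anchored
    # match used by Robotstxtistance#allowed?, outside the class.
    disallows = ['/no-crawl/', '/*.php'].map { |d| d.gsub(/\*/, '.+') }

    def allowed_path?(disallows, url_path)
      disallows.each do |d|
        return false if url_path.match('^' + d) || d == '/'
      end
      true
    end

    puts allowed_path?(disallows, '/blog/')           # => true
    puts allowed_path?(disallows, '/no-crawl/x.html') # => false
    puts allowed_path?(disallows, '/dir/page.php')    # => false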
test/robotstxt_test.rb ADDED
@@ -0,0 +1,19 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'robotstxt'
+
+ class TestRobotstxt < Test::Unit::TestCase
+
+   def test_allowed
+     assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
+     assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
+   end
+
+   def test_sitemaps
+     assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
+   end
+
+ end
test/robotstxtistance_test.rb ADDED
@@ -0,0 +1,43 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'robotstxt'
+
+ class TestRobotstxtistance < Test::Unit::TestCase
+
+   def setup
+     @client = Robotstxt::Robotstxtistance.new('rubytest')
+     @client.get('http://www.simonerinzivillo.it')
+   end
+
+   def test_initialize
+     client = Robotstxt::Robotstxtistance.new('*')
+     assert_instance_of Robotstxt::Robotstxtistance, client
+   end
+
+   def test_get_file_robotstxt
+     assert @client.get('http://www.simonerinzivillo.it')
+   end
+
+   def test_robotstxt_isfound
+     assert @client.found?()
+   end
+
+   def test_url_allowed
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/page.html')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/page.php?var=0')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php?var=0')
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/')
+     assert true == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php')
+     assert false == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php?var=0')
+   end
+
+   def test_sitemaps
+     assert @client.sitemaps.length() > 0
+   end
+
+ end
metadata ADDED
@@ -0,0 +1,58 @@
+ --- !ruby/object:Gem::Specification
+ name: robotstxt
+ version: !ruby/object:Gem::Version
+   version: 0.5.0
+ platform: ruby
+ authors:
+ - Simone Rinzivillo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-12-06 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description:
+ email: srinzivillo@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - lib/robotstxt/robotstxtistance.rb
+ - lib/robotstxt.rb
+ - README.rdoc
+ has_rdoc: true
+ homepage: http://www.simonerinzivillo.it
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.8.7
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Robotstxt is a Ruby robots.txt file parser
+ test_files:
+ - test/robotstxt_test.rb
+ - test/robotstxtistance_test.rb
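The metadata above is the gem specification as serialized to YAML by RubyGems 1.3.5. For readers more familiar with the Ruby DSL, a .gemspec that would produce roughly this metadata might look like the following reconstruction; it is illustrative only, not a file shipped in the gem:

    # Hypothetical reconstruction of robotstxt.gemspec (not part of the released gem).
    Gem::Specification.new do |s|
      s.name                  = 'robotstxt'
      s.version               = '0.5.0'
      s.authors               = ['Simone Rinzivillo']
      s.email                 = 'srinzivillo@gmail.com'
      s.homepage              = 'http://www.simonerinzivillo.it'
      s.summary               = 'Robotstxt is a Ruby robots.txt file parser'
      s.required_ruby_version = '>= 1.8.7'
      s.files                 = ['lib/robotstxt/robotstxtistance.rb', 'lib/robotstxt.rb', 'README.rdoc']
      s.extra_rdoc_files      = ['README.rdoc']
      s.test_files            = ['test/robotstxt_test.rb', 'test/robotstxtistance_test.rb']
    end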