robotstxt-parser 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +26 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/LICENSE.rdoc +26 -0
- data/README.rdoc +199 -0
- data/Rakefile +12 -0
- data/lib/robotstxt.rb +93 -0
- data/lib/robotstxt/common.rb +25 -0
- data/lib/robotstxt/getter.rb +79 -0
- data/lib/robotstxt/parser.rb +256 -0
- data/robotstxt.gemspec +19 -0
- data/test/getter_test.rb +74 -0
- data/test/parser_test.rb +114 -0
- metadata +86 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: ab84cf493844dcbd92489c277344cf25746adff2
  data.tar.gz: 285a77121e447cbec3f192eed1a3a7de03bc1bdc
SHA512:
  metadata.gz: 567cff3966ac583e462b7e8ab337d27695f76a20a015ddb2ca42c8052823fb5cef03b80000974ce39b2bcbc42d4ad707a826dd87cd2b1468f02fa996a3a4dcee
  data.tar.gz: 054054c786da1d87adc3f853c4cfa0fc6a24238c2dbd2cb47ce6d3e25babb2bb315e59625cfacfd2d97e2aaad55e39d116f24670ebb09eb1ccdc20f48af625cd
data/.gitignore
ADDED
@@ -0,0 +1,26 @@
*.gem
*.rbc
.bundle
.config
coverage
InstalledFiles
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp

Gemfile.lock
out/
sample.rb
run_sample.rb
src/
docs/

# YARD artifacts
.yardoc
_yardoc
doc/
.DS_Store
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,26 @@
= License

(The MIT License)

Copyright (c) 2010 Conrad Irwin <conrad@rapportive.com>
Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,199 @@
= Robotstxt

Robotstxt is a Ruby robots.txt file parser.

The robots.txt exclusion protocol is a simple mechanism whereby site-owners can guide
any automated crawlers to relevant parts of their site, and prevent them accessing content
which is intended only for other eyes. For more information, see http://www.robotstxt.org/.

This library provides mechanisms for obtaining and parsing the robots.txt file from
websites. As there is no official "standard" it tries to do something sensible,
though inspiration was taken from:

- http://www.robotstxt.org/orig.html
- http://www.robotstxt.org/norobots-rfc.txt
- http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
- http://nikitathespider.com/articles/RobotsTxt.html

While the parsing semantics of this library are explained below, you should not
write sitemaps that depend on all robots acting the same -- they simply won't.
Even the various different Ruby libraries support very different subsets of
functionality.

This gem builds on the work of Simone Rinzivillo, and is released under the MIT
license -- see the LICENSE file.

== Usage

There are two public points of interest: firstly the Robotstxt module, and
secondly the Robotstxt::Parser class.

The Robotstxt module has three public methods:

- Robotstxt.get source, user_agent, (options)
  Returns a Robotstxt::Parser for the robots.txt obtained from source.

- Robotstxt.parse robots_txt, user_agent
  Returns a Robotstxt::Parser for the robots.txt passed in.

- Robotstxt.get_allowed? urlish, user_agent, (options)
  Returns true iff the robots.txt obtained from the host identified by the
  urlish allows the given user agent access to the url.

The Robotstxt::Parser class contains two pieces of state, the user_agent and the
text of the robots.txt. In addition its instances have two public methods:

- Robotstxt::Parser#allowed? urlish
  Returns true iff the robots.txt file allows this user_agent access to that
  url.

- Robotstxt::Parser#sitemaps
  Returns a list of the sitemaps listed in the robots.txt file.

In the above there are five kinds of parameter:

A "urlish" is either a String that represents a URL (suitable for passing to
URI.parse) or a URI object, i.e.

  urlish = "http://www.example.com/"
  urlish = "/index.html"
  urlish = "https://compicat.ed/home?action=fire#joking"
  urlish = URI.parse("http://example.co.uk")

A "source" is either a "urlish", or a Net::HTTP connection. This allows the
library to re-use the same connection when the server respects Keep-alive:
headers, i.e.

  source = Net::HTTP.new("example.com", 80)
  Net::HTTP.start("example.co.uk", 80) do |http|
    source = http
  end
  source = "http://www.example.com/index.html"

When a "urlish" is provided, only the host and port sections are used, and
the path is forced to "/robots.txt".

A "robots_txt" is the textual content of a robots.txt file that is in the
same encoding as the urls you will be fetching (normally utf8).

A "user_agent" is the string value you use in your User-agent: header.

The "options" is an optional hash containing

  :num_redirects (5) - the number of redirects to follow before giving up.
  :http_timeout (10) - the length of time in seconds to wait for one http
                       request.
  :url_charset (utf8) - the charset which you will use to encode your urls.

I recommend not passing the options unless you have to.
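If you do have to, the options go in a hash as the final argument to
Robotstxt.get or Robotstxt.get_allowed?. A rough sketch (the host name and the
option values here are arbitrary, not recommendations):

  # example.com and these option values are placeholders for illustration
  robots = Robotstxt.get("http://example.com/", "Crawler",
                         :num_redirects => 3, :http_timeout => 5)
  robots.allowed? "/index.html"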
== Examples

  url = "http://example.com/index.html"
  if Robotstxt.get_allowed?(url, "Crawler")
    open(url)
  end

  Net::HTTP.start("example.co.uk") do |http|
    robots = Robotstxt.get(http, "Crawler")

    if robots.allowed? "/index.html"
      http.get("/index.html")
    elsif robots.allowed? "/index.php"
      http.get("/index.php")
    end
  end
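If you already have the text of a robots.txt (or you want the Sitemap: entries),
Robotstxt.parse needs no network access at all. A small sketch using a made-up
robots.txt body and user-agent:

  # the robots.txt content and "Crawler" below are invented for illustration
  robots = Robotstxt.parse("User-agent: *\nDisallow: /private\nSitemap: http://example.com/sitemap.xml", "Crawler")

  robots.allowed? "/private/index.html"  # => false
  robots.sitemaps                        # => ["http://example.com/sitemap.xml"]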
== Details

=== Request level

This library handles different HTTP status codes according to the specifications
on robotstxt.org, in particular:

If an HTTPUnauthorized or an HTTPForbidden is returned when trying to access
/robots.txt, then the entire site should be considered "Disallowed".

If an HTTPRedirection is returned, it should be followed (though we give up
after five redirects, to avoid infinite loops).

If an HTTPSuccess is returned, the body is converted into utf8, and then parsed.

Any other response, or no response, indicates that there are no Disallowed urls
on the site.
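Put differently, a 401 or 403 response is treated as though the server had
published a robots.txt that disallows everything. A sketch of the equivalent
parse (the user-agent string is invented):

  # equivalent of the body the getter substitutes after a 401/403 response
  Robotstxt.parse("User-agent: *\nDisallow: /\n", "Crawler").allowed? "/anything"
  # => false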
=== User-agent matching

This is case-insensitive, substring matching, i.e. equivalent to matching the
user agent with /.*thing.*/i.

Additionally, * characters are interpreted as meaning any number of any character (in
regular expression idiom: /.*/). Google implies that it does this, at least for
trailing *s, and the standard implies that "*" is a special user agent meaning
"everything not referred to so far".

There can be multiple User-agent: lines for each section of Allow: and Disallow:
lines in the robots.txt file:

  User-agent: Google
  User-agent: Bing
  Disallow: /secret

In cases like this, all user-agents inherit the same set of rules.
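For example, combining the section above with a catch-all section, the two
named robots share the permissive rules while any other user-agent falls
through to the catch-all. A sketch (the user-agent strings are only
illustrative):

  # "SomeOtherBot" is an invented user-agent that matches no named section
  robotstxt = "User-agent: Google\nUser-agent: Bing\nDisallow:\n\nUser-agent: *\nDisallow: /\n"

  Robotstxt::Parser.new("Google", robotstxt).allowed? "/hello"        # => true
  Robotstxt::Parser.new("Bing", robotstxt).allowed? "/hello"          # => true
  Robotstxt::Parser.new("SomeOtherBot", robotstxt).allowed? "/hello"  # => false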
=== Path matching

This is case-sensitive prefix matching, i.e. equivalent to matching the
requested path (or path + '?' + query) against /^thing.*/. As with user-agents,
* is interpreted as any number of any character.

Additionally, when the pattern ends with a $, it forces the pattern to match the
entire path (or path + '?' + query).

In order to get consistent results, before the globs are matched, the %-encoding
is normalised so that only /?&= remain %-encoded. For example, /h%65llo/ is the
same as /hello/, but /ac%2fdc is not the same as /ac/dc -- this is due to the
significance granted to the / character in urls.

The paths of the first section that matched our user-agent (by order of
appearance in the file) are parsed in order of appearance. The first Allow: or
Disallow: rule that matches the url is accepted. This is prescribed by
robotstxt.org, but other parsers take wildly different strategies:

  Google checks all Allows: then all Disallows:
  Bing checks the most-specific rule first
  Others check all Disallows: then all Allows:

As is conventional, a "Disallow: " line with no path given is treated as
"Allow: *", and if a URL didn't match any path specifiers (or the user-agent
didn't match any user-agent sections) then that is implicit permission to crawl.
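As an illustration of the trailing-$ behaviour, mirroring the bundled parser
tests (the paths and the "Crawler" user-agent are invented):

  # "Crawler" is an arbitrary user-agent chosen for the example
  robots = Robotstxt.parse("User-agent: *\nDisallow: /*.pdf$\n", "Crawler")

  robots.allowed? "/.pdfs/index.html"             # => true
  robots.allowed? "/.pdfs/index.pdf"              # => false
  robots.allowed? "/.pdfs/index.pdf?action=view"  # => false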
== TODO

I would like to add support for the Crawl-delay directive, and indeed any other
parameters in use.

== Requirements

* Ruby >= 1.8.7
* iconv, net/http and uri

== Installation

This library is intended to be installed via the
RubyGems[http://rubyforge.org/projects/rubygems/] system.

  $ gem install robotstxt

You might need administrator privileges on your system to install it.

== Author

Author:: Conrad Irwin <conrad@rapportive.com>
Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>

== License

Robotstxt is released under the MIT license.
Copyright (c) 2010 Conrad Irwin
Copyright (c) 2009 Simone Rinzivillo
data/Rakefile
ADDED
data/lib/robotstxt.rb
ADDED
@@ -0,0 +1,93 @@
#
# = Ruby Robotstxt
#
# A Ruby Robots.txt parser.
#
#
# Category::   Net
# Package::    Robotstxt
# Author::     Conrad Irwin <conrad@rapportive.com>, Simone Rinzivillo <srinzivillo@gmail.com>
# License::    MIT License
#
#--
#
#++

require 'robotstxt/common'
require 'robotstxt/parser'
require 'robotstxt/getter'

# Provides a flexible interface to help authors of web-crawlers
# respect the robots.txt exclusion standard.
#
module Robotstxt

  NAME    = 'Robotstxt'
  GEM     = 'robotstxt'
  AUTHORS = ['Conrad Irwin <conrad@rapportive.com>', 'Simone Rinzivillo <srinzivillo@gmail.com>']
  VERSION = '1.0'

  # Obtains and parses a robotstxt file from the host identified by source,
  # source can either be a URI, a string representing a URI, or a Net::HTTP
  # connection associated with a host.
  #
  # The second parameter should be the user-agent header for your robot.
  #
  # There are currently three options:
  #   :num_redirects (default 5) is the maximum number of HTTP 3** responses
  #     the get() method will accept and follow the Location: header before
  #     giving up.
  #   :http_timeout (default 10) is the number of seconds to wait for each
  #     request before giving up.
  #   :url_charset (default "utf8") the character encoding you will use to
  #     encode urls.
  #
  # As indicated by robotstxt.org, this library treats HTTPUnauthorized and
  # HTTPForbidden as though the robots.txt file denied access to the entire
  # site, all other HTTP responses or errors are treated as though the site
  # allowed all access.
  #
  # The return value is a Robotstxt::Parser, which you can then interact with
  # by calling .allowed? or .sitemaps. i.e.
  #
  #   Robotstxt.get("http://example.com/", "SuperRobot").allowed? "/index.html"
  #
  #   Net::HTTP.start("example.com") do |http|
  #     if Robotstxt.get(http, "SuperRobot").allowed? "/index.html"
  #       http.get("/index.html")
  #     end
  #   end
  #
  def self.get(source, robot_id, options={})
    self.parse(Getter.new.obtain(source, robot_id, options), robot_id)
  end

  # Parses the contents of a robots.txt file for the given robot_id
  #
  # Returns a Robotstxt::Parser object with methods .allowed? and
  # .sitemaps, i.e.
  #
  #   Robotstxt.parse("User-agent: *\nDisallow: /a", "SuperRobot").allowed? "/b"
  #
  def self.parse(robotstxt, robot_id)
    Parser.new(robot_id, robotstxt)
  end

  # Gets a robotstxt file from the host identified by the uri
  # (which can be a URI object or a string)
  #
  # Parses it for the given robot_id
  # (which should be your user-agent)
  #
  # Returns true iff your robot can access said uri.
  #
  #   Robotstxt.get_allowed? "http://www.example.com/good", "SuperRobot"
  #
  def self.get_allowed?(uri, robot_id)
    self.get(uri, robot_id).allowed? uri
  end

  def self.ultimate_scrubber(str)
    str.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => '')
  end
end
data/lib/robotstxt/common.rb
ADDED
@@ -0,0 +1,25 @@
require 'uri'
require 'net/http'

module Robotstxt
  module CommonMethods

    protected

    # Convert a URI or a String into a URI
    def objectify_uri(uri)
      if uri.is_a? String
        # URI.parse will explode when given a character that it thinks
        # shouldn't appear in uris. We thus escape them before passing the
        # string into the function. Unfortunately URI.escape does not respect
        # all characters that have meaning in HTTP (esp. #), so we are forced
        # to state exactly which characters we would like to escape.
        uri = URI.escape(uri, %r{[^!$#%&'()*+,\-./0-9:;=?@A-Z_a-z~]})
        uri = URI.parse(uri)
      else
        uri
      end
    end
  end
end
data/lib/robotstxt/getter.rb
ADDED
@@ -0,0 +1,79 @@
module Robotstxt
  class Getter
    include CommonMethods

    # Get the text of a robots.txt file from the given source, see #get.
    def obtain(source, robot_id, options)
      options = {
        :num_redirects => 5,
        :http_timeout => 10
      }.merge(options)

      robotstxt = if source.is_a? Net::HTTP
        obtain_via_http(source, "/robots.txt", robot_id, options)
      else
        uri = objectify_uri(source)
        http = Net::HTTP.new(uri.host, uri.port)
        http.read_timeout = options[:http_timeout]
        if uri.scheme == 'https'
          http.use_ssl = true
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end
        obtain_via_http(http, "/robots.txt", robot_id, options)
      end
    end

    protected

    # Recursively try to obtain robots.txt following redirects and handling the
    # various HTTP response codes as indicated on robotstxt.org
    def obtain_via_http(http, uri, robot_id, options)
      response = http.get(uri, {'User-Agent' => robot_id})

      begin
        case response
        when Net::HTTPSuccess
          decode_body(response)
        when Net::HTTPRedirection
          if options[:num_redirects] > 0 && response['location']
            options[:num_redirects] -= 1
            obtain(response['location'], robot_id, options)
          else
            all_allowed
          end
        when Net::HTTPUnauthorized
          all_forbidden
        when Net::HTTPForbidden
          all_forbidden
        else
          all_allowed
        end
      rescue Timeout::Error #, StandardError
        all_allowed
      end
    end

    # A robots.txt body that forbids access to everywhere
    def all_forbidden
      "User-agent: *\nDisallow: /\n"
    end

    # A robots.txt body that allows access to everywhere
    def all_allowed
      "User-agent: *\nDisallow:\n"
    end

    # Decode the response's body according to the character encoding in the HTTP
    # headers.
    # In the case that we can't decode, Ruby's laissez faire attitude to encoding
    # should mean that we have a reasonable chance of working anyway.
    def decode_body(response)
      return nil if response.body.nil?
      Robotstxt.ultimate_scrubber(response.body)
    end

  end
end
data/lib/robotstxt/parser.rb
ADDED
@@ -0,0 +1,256 @@
module Robotstxt
  # Parses robots.txt files for the perusal of a single user-agent.
  #
  # The behaviour implemented is guided by the following sources, though
  # as there is no widely accepted standard, it may differ from other implementations.
  # If you consider its behaviour to be in error, please contact the author.
  #
  # http://www.robotstxt.org/orig.html
  #  - the original, now imprecise and outdated version
  # http://www.robotstxt.org/norobots-rfc.txt
  #  - a much more precise, outdated version
  # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
  #  - a few hints at modern protocol extensions.
  #
  # This parser only considers lines starting with (case-insensitively:)
  #   Useragent: User-agent: Allow: Disallow: Sitemap:
  #
  # The file is divided into sections, each of which contains one or more User-agent:
  # lines, followed by one or more Allow: or Disallow: rules.
  #
  # The first section that contains a User-agent: line that matches the robot's
  # user-agent is the only section that is relevant to that robot. The sections are checked
  # in the same order as they appear in the file.
  #
  # (The * character is taken to mean "any number of any characters" during matching of
  # user-agents)
  #
  # Within that section, the first Allow: or Disallow: rule that matches the expression
  # is taken as authoritative. If no rule in a section matches, the access is Allowed.
  #
  # (The order of matching is as in the RFC, Google matches all Allows and then all Disallows,
  # while Bing matches the most specific rule, I'm sure there are other interpretations)
  #
  # When matching urls, all % encodings are normalised (except for /?=& which have meaning)
  # and "*"s match any number of any character.
  #
  # If a pattern ends with a $, then the pattern must match the entire path, or the entire
  # path with query string.
  #
  class Parser
    include CommonMethods

    # Gets every Sitemap mentioned in the body of the robots.txt file.
    #
    attr_reader :sitemaps

    # Create a new parser for this user_agent and this robots.txt contents.
    #
    # This assumes that the robots.txt is ready-to-parse, in particular that
    # it has been decoded as necessary, including removal of byte-order-marks et.al.
    #
    # Not passing a body is deprecated, but retained for compatibility with clients
    # written for version 0.5.4.
    #
    def initialize(user_agent, body)
      @robot_id = user_agent
      @found = true
      parse(body) # set @body, @rules and @sitemaps
    end

    # Given a URI object, or a string representing one, determine whether this
    # robots.txt would allow access to the path.
    def allowed?(uri)
      uri = objectify_uri(uri)
      path = (uri.path || "/") + (uri.query ? '?' + uri.query : '')
      path_allowed?(@robot_id, path)
    end

    protected

    # Check whether the relative path (a string of the url's path and query
    # string) is allowed by the rules we have for the given user_agent.
    #
    def path_allowed?(user_agent, path)
      @rules.each do |(ua_glob, path_globs)|

        if match_ua_glob user_agent, ua_glob
          path_globs.each do |(path_glob, allowed)|
            return allowed if match_path_glob path, path_glob
          end
          return true
        end

      end
      true
    end

    # This does a case-insensitive substring match such that if the user agent
    # is contained within the glob, or vice-versa, we will match.
    #
    # According to the standard, *s shouldn't appear in the user-agent field
    # except in the case of "*" meaning all user agents. Google however imply
    # that the * will work, at least at the end of a string.
    #
    # For consistency, and because it seems expected behaviour, and because
    # a glob * will match a literal * we use glob matching not string matching.
    #
    # The standard also advocates a substring match of the robot's user-agent
    # within the user-agent field. From observation, it seems much more likely
    # that the match will be the other way about, though we check for both.
    #
    def match_ua_glob(user_agent, glob)
      glob =~ Regexp.new(Regexp.escape(user_agent), "i") ||
        user_agent =~ Regexp.new(reify(glob), "i")
    end

    # This does case-sensitive prefix matching, such that if the path starts
    # with the glob, we will match.
    #
    # According to the standard, that's it. However, it seems reasonably common
    # for asterisks to be interpreted as though they were globs.
    #
    # Additionally, some search engines, like Google, will treat a trailing $
    # sign as forcing the glob to match the entire path - whether including
    # or excluding the query string is not clear, so we check both.
    #
    # (i.e. it seems likely that a site owner who has Disallow: *.pdf$ expects
    # to disallow requests to *.pdf?i_can_haz_pdf, which the robot could, if
    # it were feeling malicious, construe.)
    #
    # With URLs there is the additional complication that %-encoding can give
    # multiple representations for identical URLs, this is handled by
    # normalize_percent_encoding.
    #
    def match_path_glob(path, glob)
      if glob =~ /\$$/
        end_marker = '(?:\?|$)'
        glob = glob.gsub /\$$/, ""
      else
        end_marker = ""
      end

      glob = Robotstxt.ultimate_scrubber normalize_percent_encoding(glob)
      path = Robotstxt.ultimate_scrubber normalize_percent_encoding(path)

      path =~ Regexp.new("^" + reify(glob) + end_marker)

    # Some people encode bad UTF-8 in their robots.txt files, let us not behave badly.
    rescue RegexpError
      false
    end

    # As a general rule, we want to ignore different representations of the
    # same URL. Naively we could just unescape, or escape, everything, however
    # the standard implies that a / is a HTTP path separator, while a %2F is an
    # encoded / that does not act as a path separator. Similar issues with ?, &
    # and =, though all other characters are fine. (While : also has a special
    # meaning in HTTP, most implementations ignore this in the path)
    #
    # It's also worth noting that %-encoding is case-insensitive, so we
    # explicitly upcase the few that we want to keep.
    #
    def normalize_percent_encoding(path)
      # First double-escape any characters we don't want to unescape
      # & / = ?
      path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
        "%25#{code.upcase}"
      end

      URI.unescape(path)
    end

    # Convert the asterisks in a glob into (.*)s for regular expressions,
    # and at the same time, escape any other characters that would have
    # a significance in a regex.
    #
    def reify(glob)
      glob = Robotstxt.ultimate_scrubber(glob)

      # -1 on a split prevents trailing empty strings from being deleted.
      glob.split("*", -1).map{ |part| Regexp.escape(part) }.join(".*")
    end

    # Convert the @body into a set of @rules so that our parsing mechanism
    # becomes easier.
    #
    # @rules is an array of pairs. The first in the pair is the glob for the
    # user-agent and the second another array of pairs. The first of the new
    # pair is a glob for the path, and the second whether it appears in an
    # Allow: or a Disallow: rule.
    #
    # For example:
    #
    #   User-agent: *
    #   Disallow: /secret/
    #   Allow: / # allow everything...
    #
    # Would be parsed so that:
    #
    #   @rules = [["*", [ ["/secret/", false], ["/", true] ]]]
    #
    #
    # The order of the arrays is maintained so that the first match in the file
    # is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
    # are alternative interpretations, some parse by specificity of glob, and
    # some check Allow lines for any match before Disallow lines. All are
    # justifiable, but we could only pick one.
    #
    # Note that a blank Disallow: should be treated as an Allow: * and multiple
    # user-agents may share the same set of rules.
    #
    def parse(body)
      @body = Robotstxt.ultimate_scrubber(body)
      @rules = []
      @sitemaps = []
      parser_mode = :begin

      body.split(/[\r\n]+/).each do |line|
        prefix, value = line.delete("\000").split(":", 2).map(&:strip)
        value.sub! /\s+#.*/, '' if value

        if prefix && value

          case prefix.downcase
          when /^user-?agent$/
            if parser_mode == :user_agent
              @rules << [value, @rules.last[1]]
            else
              parser_mode = :user_agent
              @rules << [value, []]
            end
          when "disallow"
            parser_mode = :rules
            @rules << ["*", []] if @rules.empty?

            if value == ""
              @rules.last[1] << ["*", true]
            else
              @rules.last[1] << [value, false]
            end
          when "allow"
            parser_mode = :rules
            @rules << ["*", []] if @rules.empty?
            @rules.last[1] << [value, true]
          when "sitemap"
            @sitemaps << value
          else
            # Ignore comments, Crawl-delay: and badly formed lines.
          end
        end
      end
    end
  end
end
data/robotstxt.gemspec
ADDED
@@ -0,0 +1,19 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |gem|
  gem.name          = "robotstxt-parser"
  gem.version       = "0.1.0"
  gem.authors       = ["Garen Torikian"]
  gem.email         = ["gjtorikian@gmail.com"]
  gem.description   = %q{Robotstxt-Parser allows you to check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.}
  gem.summary       = %q{Robotstxt-parser is a Ruby robots.txt file parser.}
  gem.homepage      = "https://github.com/gjtorikian/robotstxt-parser"
  gem.license       = "MIT"
  gem.files         = `git ls-files`.split($/)
  gem.test_files    = gem.files.grep(%r{^(test)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rake"
  gem.add_development_dependency "fakeweb", '~> 1.3'
end
data/test/getter_test.rb
ADDED
@@ -0,0 +1,74 @@
# -*- encoding: utf-8 -*-

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'rubygems'
require 'test/unit'
require 'robotstxt'
require 'fakeweb'

FakeWeb.allow_net_connect = false

class TestRobotstxt < Test::Unit::TestCase

  def test_absence
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :status => ["404", "Not found"])
    assert true == Robotstxt.get_allowed?("http://example.com/index.html", "Google")
  end

  def test_error
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :status => ["500", "Internal Server Error"])
    assert true == Robotstxt.get_allowed?("http://example.com/index.html", "Google")
  end

  def test_unauthorized
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :status => ["401", "Unauthorized"])
    assert false == Robotstxt.get_allowed?("http://example.com/index.html", "Google")
  end

  def test_forbidden
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :status => ["403", "Forbidden"])
    assert false == Robotstxt.get_allowed?("http://example.com/index.html", "Google")
  end

  def test_uri_object
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :body => "User-agent:*\nDisallow: /test")

    robotstxt = Robotstxt.get(URI.parse("http://example.com/index.html"), "Google")

    assert true == robotstxt.allowed?("/index.html")
    assert false == robotstxt.allowed?("/test/index.html")
  end

  def test_existing_http_connection
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :body => "User-agent:*\nDisallow: /test")

    http = Net::HTTP.start("example.com", 80) do |http|
      robotstxt = Robotstxt.get(http, "Google")
      assert true == robotstxt.allowed?("/index.html")
      assert false == robotstxt.allowed?("/test/index.html")
    end
  end

  def test_redirects
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :response => "HTTP/1.1 303 See Other\nLocation: http://www.exemplar.com/robots.txt\n\n")
    FakeWeb.register_uri(:get, "http://www.exemplar.com/robots.txt", :body => "User-agent:*\nDisallow: /private")

    robotstxt = Robotstxt.get("http://example.com/", "Google")

    assert true == robotstxt.allowed?("/index.html")
    assert false == robotstxt.allowed?("/private/index.html")
  end

  def test_encoding
    # "User-agent: *\n Disallow: /encyclop@dia" where @ is the ae ligature (U+00E6)
    FakeWeb.register_uri(:get, "http://example.com/robots.txt", :response => "HTTP/1.1 200 OK\nContent-type: text/plain; charset=utf-16\n\n" +
      "\xff\xfeU\x00s\x00e\x00r\x00-\x00a\x00g\x00e\x00n\x00t\x00:\x00 \x00*\x00\n\x00D\x00i\x00s\x00a\x00l\x00l\x00o\x00w\x00:\x00 \x00/\x00e\x00n\x00c\x00y\x00c\x00l\x00o\x00p\x00\xe6\x00d\x00i\x00a\x00")
    robotstxt = Robotstxt.get("http://example.com/#index", "Google")

    assert true == robotstxt.allowed?("/index.html")
    assert false == robotstxt.allowed?("/encyclop%c3%a6dia/index.html")
  end

end
data/test/parser_test.rb
ADDED
@@ -0,0 +1,114 @@
# -*- encoding: utf-8 -*-

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'
require 'cgi'

class TestParser < Test::Unit::TestCase

  def test_basics
    client = Robotstxt::Parser.new("Test", <<-ROBOTS
User-agent: *
Disallow: /?*\t\t\t#comment
Disallow: /home
Disallow: /dashboard
Disallow: /terms-conditions
Disallow: /privacy-policy
Disallow: /index.php
Disallow: /chargify_system
Disallow: /test*
Disallow: /team* # comment
Disallow: /index
Allow: / # comment
Sitemap: http://example.com/sitemap.xml
    ROBOTS
    )
    assert true == client.allowed?("/")
    assert false == client.allowed?("/?")
    assert false == client.allowed?("/?key=value")
    assert true == client.allowed?("/example")
    assert true == client.allowed?("/example/index.php")
    assert false == client.allowed?("/test")
    assert false == client.allowed?("/test/example")
    assert false == client.allowed?("/team-game")
    assert false == client.allowed?("/team-game/example")
    assert ["http://example.com/sitemap.xml"] == client.sitemaps
  end

  def test_blank_disallow
    google = Robotstxt::Parser.new("Google", <<-ROBOTSTXT
User-agent: *
Disallow:
    ROBOTSTXT
    )
    assert true == google.allowed?("/")
    assert true == google.allowed?("/index.html")
  end

  def test_url_escaping
    google = Robotstxt::Parser.new("Google", <<-ROBOTSTXT
User-agent: *
Disallow: /test/
Disallow: /secret%2Fgarden/
Disallow: /%61lpha/
    ROBOTSTXT
    )
    assert true == google.allowed?("/allowed/")
    assert false == google.allowed?("/test/")
    assert true == google.allowed?("/test%2Fetc/")
    assert false == google.allowed?("/secret%2fgarden/")
    assert true == google.allowed?("/secret/garden/")
    assert false == google.allowed?("/alph%61/")
  end

  def test_trail_matching
    google = Robotstxt::Parser.new("Google", <<-ROBOTSTXT
User-agent: *
#comments
Disallow: /*.pdf$
    ROBOTSTXT
    )
    assert true == google.allowed?("/.pdfs/index.html")
    assert false == google.allowed?("/.pdfs/index.pdf")
    assert false == google.allowed?("/.pdfs/index.pdf?action=view")
    assert false == google.allowed?("/.pdfs/index.html?download_as=.pdf")
  end

  def test_useragents
    robotstxt = <<-ROBOTS
User-agent: Google
User-agent: Yahoo
Disallow:

User-agent: *
Disallow: /
    ROBOTS
    assert true == Robotstxt::Parser.new("Google", robotstxt).allowed?("/hello")
    assert true == Robotstxt::Parser.new("Yahoo", robotstxt).allowed?("/hello")
    assert false == Robotstxt::Parser.new("Bing", robotstxt).allowed?("/hello")
  end

  def test_missing_useragent
    robotstxt = <<-ROBOTS
Disallow: /index
    ROBOTS
    assert true === Robotstxt::Parser.new("Google", robotstxt).allowed?("/hello")
    assert false === Robotstxt::Parser.new("Google", robotstxt).allowed?("/index/wold")
  end

  def test_strange_newlines
    robotstxt = "User-agent: *\r\r\rDisallow: *"
    assert false === Robotstxt::Parser.new("Google", robotstxt).allowed?("/index/wold")
  end

  def test_bad_unicode
    unless ENV['TRAVIS']
      robotstxt = "User-agent: *\ndisallow: /?id=%C3%CB%D1%CA%A4%C5%D4%BB%C7%D5%B4%D5%E2%CD\n"
      assert true === Robotstxt::Parser.new("Google", robotstxt).allowed?("/index/wold")
    end
  end

end
metadata
ADDED
@@ -0,0 +1,86 @@
--- !ruby/object:Gem::Specification
name: robotstxt-parser
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Garen Torikian
autorequire:
bindir: bin
cert_chain: []
date: 2014-09-18 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: fakeweb
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
description: 'Robotstxt-Parser allows you to check the accessibility of URLs and
  get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.'
email:
- gjtorikian@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".travis.yml"
- Gemfile
- LICENSE.rdoc
- README.rdoc
- Rakefile
- lib/robotstxt.rb
- lib/robotstxt/common.rb
- lib/robotstxt/getter.rb
- lib/robotstxt/parser.rb
- robotstxt.gemspec
- test/getter_test.rb
- test/parser_test.rb
homepage: https://github.com/gjtorikian/robotstxt-parser
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: Robotstxt-parser is a Ruby robots.txt file parser.
test_files: []