google_robotstxt_parser 0.0.3
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.gitmodules +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +6 -0
- data/Guardfile +16 -0
- data/LICENSE +22 -0
- data/README.md +57 -0
- data/Rakefile +6 -0
- data/ext/robotstxt/.DS_Store +0 -0
- data/ext/robotstxt/extconf.rb +83 -0
- data/ext/robotstxt/robotstxt/.gitignore +1 -0
- data/ext/robotstxt/robotstxt/BUILD +40 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt +174 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt.in +30 -0
- data/ext/robotstxt/robotstxt/CONTRIBUTING.md +30 -0
- data/ext/robotstxt/robotstxt/LICENSE +203 -0
- data/ext/robotstxt/robotstxt/README.md +134 -0
- data/ext/robotstxt/robotstxt/WORKSPACE +28 -0
- data/ext/robotstxt/robotstxt/protocol-draft/README.md +9 -0
- data/ext/robotstxt/robotstxt/protocol-draft/draft-koster-rep-00.txt +529 -0
- data/ext/robotstxt/robotstxt/robots.cc +706 -0
- data/ext/robotstxt/robotstxt/robots.h +241 -0
- data/ext/robotstxt/robotstxt/robots_main.cc +101 -0
- data/ext/robotstxt/robotstxt/robots_test.cc +990 -0
- data/ext/robotstxt/robotstxt.cc +32 -0
- data/google_robotstxt_parser.gemspec +45 -0
- data/lib/google_robotstxt_parser/version.rb +6 -0
- data/lib/google_robotstxt_parser.rb +4 -0
- data/spec/google_robotstxt_parser_spec.rb +33 -0
- data/spec/spec_helper.rb +19 -0
- metadata +146 -0
data/ext/robotstxt/robotstxt/robots.h
@@ -0,0 +1,241 @@
+// Copyright 1999 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: robots.h
+// -----------------------------------------------------------------------------
+//
+// This file implements the standard defined by the Robots Exclusion Protocol
+// (REP) internet draft (I-D).
+// https://tools.ietf.org/html/draft-koster-rep
+//
+// Google doesn't follow the standard strictly, because there are a lot of
+// non-conforming robots.txt files out there, and we err on the side of
+// disallowing when this seems intended.
+//
+// An more user-friendly description of how Google handles robots.txt can be
+// found at:
+// https://developers.google.com/search/reference/robots_txt
+//
+// This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
+// and a matcher for URLs against a robots.txt (class RobotsMatcher).
+
+#ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+#define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace googlebot {
+// Handler for directives found in robots.txt. These callbacks are called by
+// ParseRobotsTxt() in the sequence they have been found in the file.
+class RobotsParseHandler {
+ public:
+  RobotsParseHandler() {}
+  virtual ~RobotsParseHandler() {}
+
+  // Disallow copying and assignment.
+  RobotsParseHandler(const RobotsParseHandler&) = delete;
+  RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
+
+  virtual void HandleRobotsStart() = 0;
+  virtual void HandleRobotsEnd() = 0;
+
+  virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
+  virtual void HandleAllow(int line_num, absl::string_view value) = 0;
+  virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
+
+  virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
+
+  // Any other unrecognized name/value pairs.
+  virtual void HandleUnknownAction(int line_num, absl::string_view action,
+                                   absl::string_view value) = 0;
+};
+
+// Parses body of a robots.txt and emits parse callbacks. This will accept
+// typical typos found in robots.txt, such as 'disalow'.
+//
+// Note, this function will accept all kind of input but will skip
+// everything that does not look like a robots directive.
+void ParseRobotsTxt(absl::string_view robots_body,
+                    RobotsParseHandler* parse_callback);
+
+// RobotsMatcher - matches robots.txt against URLs.
+//
+// The Matcher uses a default match strategy for Allow/Disallow patterns which
+// is the official way of Google crawler to match robots.txt. It is also
+// possible to provide a custom match strategy.
+//
+// The entry point for the user is to call one of the *AllowedByRobots()
+// methods that return directly if a URL is being allowed according to the
+// robots.txt and the crawl agent.
+// The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
+class RobotsMatchStrategy;
+class RobotsMatcher : protected RobotsParseHandler {
+ public:
+  // Create a RobotsMatcher with the default matching strategy. The default
+  // matching strategy is longest-match as opposed to the former internet draft
+  // that provisioned first-match strategy. Analysis shows that longest-match,
+  // while more restrictive for crawlers, is what webmasters assume when writing
+  // directives. For example, in case of conflicting matches (both Allow and
+  // Disallow), the longest match is the one the user wants. For example, in
+  // case of a robots.txt file that has the following rules
+  //   Allow: /
+  //   Disallow: /cgi-bin
+  // it's pretty obvious what the webmaster wants: they want to allow crawl of
+  // every URI except /cgi-bin. However, according to the expired internet
+  // standard, crawlers should be allowed to crawl everything with such a rule.
+  RobotsMatcher();
+
+  ~RobotsMatcher() override;
+
+  // Disallow copying and assignment.
+  RobotsMatcher(const RobotsMatcher&) = delete;
+  RobotsMatcher& operator=(const RobotsMatcher&) = delete;
+
+  // Verifies that the given user agent is valid to be matched against
+  // robots.txt. Valid user agent strings only contain the characters
+  // [a-zA-Z_-].
+  static bool IsValidUserAgentToObey(absl::string_view user_agent);
+
+  // Returns true iff 'url' is allowed to be fetched by any member of the
+  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+  bool AllowedByRobots(absl::string_view robots_body,
+                       const std::vector<std::string>* user_agents,
+                       const std::string& url);
+
+  // Do robots check for 'url' when there is only one user agent. 'url' must
+  // be %-encoded according to RFC3986.
+  bool OneAgentAllowedByRobots(absl::string_view robots_txt,
+                               const std::string& user_agent,
+                               const std::string& url);
+
+  // Returns true if we are disallowed from crawling a matching URI.
+  bool disallow() const;
+
+  // Returns true if we are disallowed from crawling a matching URI. Ignores any
+  // rules specified for the default user agent, and bases its results only on
+  // the specified user agents.
+  bool disallow_ignore_global() const;
+
+  // Returns true iff, when AllowedByRobots() was called, the robots file
+  // referred explicitly to one of the specified user agents.
+  bool ever_seen_specific_agent() const;
+
+  // Returns the line that matched or 0 if none matched.
+  const int matching_line() const;
+
+ protected:
+  // Parse callbacks.
+  // Protected because used in unittest. Never override RobotsMatcher, implement
+  // googlebot::RobotsParseHandler instead.
+  void HandleRobotsStart() override;
+  void HandleRobotsEnd() override {}
+
+  void HandleUserAgent(int line_num, absl::string_view value) override;
+  void HandleAllow(int line_num, absl::string_view value) override;
+  void HandleDisallow(int line_num, absl::string_view value) override;
+
+  void HandleSitemap(int line_num, absl::string_view value) override;
+  void HandleUnknownAction(int line_num, absl::string_view action,
+                           absl::string_view value) override;
+
+ protected:
+  // Extract the matchable part of a user agent string, essentially stopping at
+  // the first invalid character.
+  // Example: 'Googlebot/2.1' becomes 'Googlebot'
+  static absl::string_view ExtractUserAgent(absl::string_view user_agent);
+
+  // Initialize next path and user-agents to check. Path must contain only the
+  // path, params, and query (if any) of the url and must start with a '/'.
+  void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
+                             const char* path);
+
+  // Returns true if any user-agent was seen.
+  bool seen_any_agent() const {
+    return seen_global_agent_ || seen_specific_agent_;
+  }
+
+  // Instead of just maintaining a Boolean indicating whether a given line has
+  // matched, we maintain a count of the maximum number of characters matched by
+  // that pattern.
+  //
+  // This structure stores the information associated with a match (e.g. when a
+  // Disallow is matched) as priority of the match and line matching.
+  //
+  // The priority is initialized with a negative value to make sure that a match
+  // of priority 0 is higher priority than no match at all.
+  class Match {
+   private:
+    static const int kNoMatchPriority = -1;
+
+   public:
+    Match(int priority, int line) : priority_(priority), line_(line) {}
+    Match() : priority_(kNoMatchPriority), line_(0) {}
+
+    void Set(int priority, int line) {
+      priority_ = priority;
+      line_ = line;
+    }
+
+    void Clear() { Set(kNoMatchPriority, 0); }
+
+    int line() const { return line_; }
+    int priority() const { return priority_; }
+
+    static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
+      if (a.priority() > b.priority()) {
+        return a;
+      } else {
+        return b;
+      }
+    }
+
+   private:
+    int priority_;
+    int line_;
+  };
+
+  // For each of the directives within user-agents, we keep global and specific
+  // match scores.
+  struct MatchHierarchy {
+    Match global;    // Match for '*'
+    Match specific;  // Match for queried agent.
+    void Clear() {
+      global.Clear();
+      specific.Clear();
+    }
+  };
+  MatchHierarchy allow_;     // Characters of 'url' matching Allow.
+  MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.
+
+  bool seen_global_agent_;         // True if processing global agent rules.
+  bool seen_specific_agent_;       // True if processing our specific agent.
+  bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
+  bool seen_separator_;            // True if saw any key: value pair.
+
+  // The path we want to pattern match. Not owned and only a valid pointer
+  // during the lifetime of *AllowedByRobots calls.
+  const char* path_;
+  // The User-Agents we are interested in. Not owned and only a valid
+  // pointer during the lifetime of *AllowedByRobots calls.
+  const std::vector<std::string>* user_agents_;
+
+  RobotsMatchStrategy* match_strategy_;
+};
+
+}  // namespace googlebot
+#endif  // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
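For orientation, the sketch below shows how the RobotsParseHandler interface declared above can be implemented and fed to ParseRobotsTxt(). It is illustrative only and not part of the gem: the PrintingHandler class name and the sample robots.txt body are made up for the example; the callbacks and ParseRobotsTxt() itself come from robots.h as shown.

// Illustrative only: a handler that prints each directive ParseRobotsTxt()
// reports. "PrintingHandler" is a hypothetical name, not part of the library.
#include <iostream>
#include <string>

#include "absl/strings/string_view.h"
#include "robots.h"

class PrintingHandler : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {}
  void HandleRobotsEnd() override {}

  void HandleUserAgent(int line_num, absl::string_view value) override {
    Print(line_num, "user-agent", value);
  }
  void HandleAllow(int line_num, absl::string_view value) override {
    Print(line_num, "allow", value);
  }
  void HandleDisallow(int line_num, absl::string_view value) override {
    Print(line_num, "disallow", value);
  }
  void HandleSitemap(int line_num, absl::string_view value) override {
    Print(line_num, "sitemap", value);
  }
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {
    Print(line_num, action, value);
  }

 private:
  static void Print(int line_num, absl::string_view key,
                    absl::string_view value) {
    std::cout << line_num << ": " << key << " -> " << value << std::endl;
  }
};

int main() {
  // Hypothetical robots.txt body, chosen only for the example.
  const std::string robots_txt =
      "User-agent: *\n"
      "Disallow: /cgi-bin\n"
      "Allow: /\n";
  PrintingHandler handler;
  // Callbacks fire in the order the directives appear in the file.
  googlebot::ParseRobotsTxt(robots_txt, &handler);
  return 0;
}

RobotsMatcher itself is such a handler: as the class declaration above shows, it derives from RobotsParseHandler and consumes the same callbacks to compute its Allow/Disallow match scores.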
data/ext/robotstxt/robotstxt/robots_main.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: robots_main.cc
+// -----------------------------------------------------------------------------
+//
+// Simple binary to assess whether a URL is accessible to a user-agent according
+// to records found in a local robots.txt file, based on Google's robots.txt
+// parsing and matching algorithms.
+// Usage:
+//   robots_main <local_path_to_robotstxt> <user_agent> <url>
+// Arguments:
+// local_path_to_robotstxt: local path to a file containing robots.txt records.
+//   For example: /home/users/username/robots.txt
+// user_agent: a token to be matched against records in the robots.txt.
+//   For example: Googlebot
+// url: a url to be matched against records in the robots.txt. The URL must be
+// %-encoded according to RFC3986.
+//   For example: https://example.com/accessible/url.html
+// Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
+// to access 'url' based on records in 'local_path_to_robotstxt'.
+//
+#include <fstream>
+#include <iostream>
+
+#include "robots.h"
+
+bool LoadFile(const std::string& filename, std::string* result) {
+  std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
+  if (file.is_open()) {
+    size_t size = file.tellg();
+    std::vector<char> buffer(size);
+    file.seekg(0, std::ios::beg);
+    file.read(buffer.data(), size);
+    file.close();
+    if (!file) return false;  // file reading error (failbit or badbit).
+    result->assign(buffer.begin(), buffer.end());
+    return true;
+  }
+  return false;
+}
+
+void ShowHelp(int argc, char** argv) {
+  std::cerr << "Shows whether the given user_agent and URI combination"
+            << " is allowed or disallowed by the given robots.txt file. "
+            << std::endl
+            << std::endl;
+  std::cerr << "Usage: " << std::endl
+            << " " << argv[0] << " <robots.txt filename> <user_agent> <URI>"
+            << std::endl
+            << std::endl;
+  std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
+            << std::endl;
+  std::cerr << "Example: " << std::endl
+            << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
+            << std::endl;
+}
+
+int main(int argc, char** argv) {
+  std::string filename = argc >= 2 ? argv[1] : "";
+  if (filename == "-h" || filename == "-help" || filename == "--help") {
+    ShowHelp(argc, argv);
+    return 0;
+  }
+  if (argc != 4) {
+    std::cerr << "Invalid amount of arguments. Showing help." << std::endl
+              << std::endl;
+    ShowHelp(argc, argv);
+    return 1;
+  }
+  std::string robots_content;
+  if (!(LoadFile(filename, &robots_content))) {
+    std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
+    return 1;
+  }
+
+  std::string user_agent = argv[2];
+  std::vector<std::string> user_agents(1, user_agent);
+  googlebot::RobotsMatcher matcher;
+  std::string url = argv[3];
+  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+
+  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+            << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
+  if (robots_content.empty()) {
+    std::cout << "notice: robots file is empty so all user-agents are allowed"
+              << std::endl;
+  }
+}
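robots_main.cc drives the matcher through AllowedByRobots() with a one-element user-agent vector. For the common single-agent case, robots.h also declares OneAgentAllowedByRobots(); the sketch below is an illustrative in-process equivalent, not part of the gem, with a made-up robots.txt body, user agent, and URL.

// Illustrative only: calling the matcher directly instead of via robots_main.
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  // Hypothetical inputs, chosen only for the example.
  const std::string robots_txt =
      "User-agent: FooBot\n"
      "Disallow: /private/\n";
  const std::string user_agent = "FooBot";
  const std::string url = "https://example.com/private/page.html";  // %-encoded per RFC 3986

  googlebot::RobotsMatcher matcher;
  const bool allowed =
      matcher.OneAgentAllowedByRobots(robots_txt, user_agent, url);
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;  // DISALLOWED for this input
  return 0;
}

As with robots_main, the URL must be %-encoded according to RFC 3986, and per the header comments the matcher instance can be re-used for further robots.txt/URL pairs but is not thread-safe.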