google_robotstxt_parser 0.0.3

@@ -0,0 +1,241 @@
+ // Copyright 1999 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //      https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots.h
+ // -----------------------------------------------------------------------------
+ //
+ // This file implements the standard defined by the Robots Exclusion Protocol
+ // (REP) internet draft (I-D).
+ //   https://tools.ietf.org/html/draft-koster-rep
+ //
+ // Google doesn't follow the standard strictly, because there are a lot of
+ // non-conforming robots.txt files out there, and we err on the side of
+ // disallowing when this seems intended.
+ //
+ // A more user-friendly description of how Google handles robots.txt can be
+ // found at:
+ //   https://developers.google.com/search/reference/robots_txt
+ //
+ // This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
+ // and a matcher for URLs against a robots.txt (class RobotsMatcher).
+
+ #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+ #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+
+ #include <string>
+ #include <vector>
+
+ #include "absl/strings/string_view.h"
+
+ namespace googlebot {
+ // Handler for directives found in robots.txt. These callbacks are called by
+ // ParseRobotsTxt() in the sequence they have been found in the file.
+ class RobotsParseHandler {
+  public:
+   RobotsParseHandler() {}
+   virtual ~RobotsParseHandler() {}
+
+   // Disallow copying and assignment.
+   RobotsParseHandler(const RobotsParseHandler&) = delete;
+   RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
+
+   virtual void HandleRobotsStart() = 0;
+   virtual void HandleRobotsEnd() = 0;
+
+   virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
+   virtual void HandleAllow(int line_num, absl::string_view value) = 0;
+   virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
+
+   virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
+
+   // Any other unrecognized name/value pairs.
+   virtual void HandleUnknownAction(int line_num, absl::string_view action,
+                                    absl::string_view value) = 0;
+ };
+
+ // Parses the body of a robots.txt file and emits parse callbacks. This will
+ // accept typical typos found in robots.txt, such as 'disalow'.
+ //
+ // Note that this function will accept all kinds of input, but will skip
+ // everything that does not look like a robots directive.
+ void ParseRobotsTxt(absl::string_view robots_body,
+                     RobotsParseHandler* parse_callback);
+
+ // RobotsMatcher - matches robots.txt against URLs.
+ //
+ // The Matcher uses a default match strategy for Allow/Disallow patterns, which
+ // is the official way Google's crawler matches robots.txt. It is also
+ // possible to provide a custom match strategy.
+ //
+ // The entry point for the user is to call one of the *AllowedByRobots()
+ // methods, which return whether a URL is allowed according to the robots.txt
+ // and the crawl agent.
+ // The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
+ class RobotsMatchStrategy;
+ class RobotsMatcher : protected RobotsParseHandler {
+  public:
+   // Create a RobotsMatcher with the default matching strategy. The default
+   // matching strategy is longest-match as opposed to the former internet draft
+   // that provisioned first-match strategy. Analysis shows that longest-match,
+   // while more restrictive for crawlers, is what webmasters assume when writing
+   // directives. For example, in case of conflicting matches (both Allow and
+   // Disallow), the longest match is the one the user wants. For example, in
+   // case of a robots.txt file that has the following rules
+   //   Allow: /
+   //   Disallow: /cgi-bin
+   // it's pretty obvious what the webmaster wants: they want to allow crawl of
+   // every URI except /cgi-bin. However, according to the expired internet
+   // standard, crawlers should be allowed to crawl everything with such a rule.
+   RobotsMatcher();
+
+   ~RobotsMatcher() override;
+
+   // Disallow copying and assignment.
+   RobotsMatcher(const RobotsMatcher&) = delete;
+   RobotsMatcher& operator=(const RobotsMatcher&) = delete;
+
+   // Verifies that the given user agent is valid to be matched against
+   // robots.txt. Valid user agent strings only contain the characters
+   // [a-zA-Z_-].
+   static bool IsValidUserAgentToObey(absl::string_view user_agent);
+
+   // Returns true iff 'url' is allowed to be fetched by any member of the
+   // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+   bool AllowedByRobots(absl::string_view robots_body,
+                        const std::vector<std::string>* user_agents,
+                        const std::string& url);
+
+   // Do robots check for 'url' when there is only one user agent. 'url' must
+   // be %-encoded according to RFC3986.
+   bool OneAgentAllowedByRobots(absl::string_view robots_txt,
+                                const std::string& user_agent,
+                                const std::string& url);
+
+   // Returns true if we are disallowed from crawling a matching URI.
+   bool disallow() const;
+
+   // Returns true if we are disallowed from crawling a matching URI. Ignores any
+   // rules specified for the default user agent, and bases its results only on
+   // the specified user agents.
+   bool disallow_ignore_global() const;
+
+   // Returns true iff, when AllowedByRobots() was called, the robots file
+   // referred explicitly to one of the specified user agents.
+   bool ever_seen_specific_agent() const;
+
+   // Returns the line that matched or 0 if none matched.
+   const int matching_line() const;
+
+  protected:
+   // Parse callbacks.
+   // Protected because used in unittest. Never override RobotsMatcher, implement
+   // googlebot::RobotsParseHandler instead.
+   void HandleRobotsStart() override;
+   void HandleRobotsEnd() override {}
+
+   void HandleUserAgent(int line_num, absl::string_view value) override;
+   void HandleAllow(int line_num, absl::string_view value) override;
+   void HandleDisallow(int line_num, absl::string_view value) override;
+
+   void HandleSitemap(int line_num, absl::string_view value) override;
+   void HandleUnknownAction(int line_num, absl::string_view action,
+                            absl::string_view value) override;
+
+  protected:
+   // Extract the matchable part of a user agent string, essentially stopping at
+   // the first invalid character.
+   // Example: 'Googlebot/2.1' becomes 'Googlebot'
+   static absl::string_view ExtractUserAgent(absl::string_view user_agent);
+
+   // Initialize next path and user-agents to check. Path must contain only the
+   // path, params, and query (if any) of the url and must start with a '/'.
+   void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
+                              const char* path);
+
+   // Returns true if any user-agent was seen.
+   bool seen_any_agent() const {
+     return seen_global_agent_ || seen_specific_agent_;
+   }
+
+   // Instead of just maintaining a Boolean indicating whether a given line has
+   // matched, we maintain a count of the maximum number of characters matched by
+   // that pattern.
+   //
+   // This structure stores the information associated with a match (e.g. when a
+   // Disallow is matched) as the priority of the match and the line that matched.
+   //
+   // The priority is initialized with a negative value to make sure that a match
+   // of priority 0 is higher priority than no match at all.
+   class Match {
+    private:
+     static const int kNoMatchPriority = -1;
+
+    public:
+     Match(int priority, int line) : priority_(priority), line_(line) {}
+     Match() : priority_(kNoMatchPriority), line_(0) {}
+
+     void Set(int priority, int line) {
+       priority_ = priority;
+       line_ = line;
+     }
+
+     void Clear() { Set(kNoMatchPriority, 0); }
+
+     int line() const { return line_; }
+     int priority() const { return priority_; }
+
+     static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
+       if (a.priority() > b.priority()) {
+         return a;
+       } else {
+         return b;
+       }
+     }
+
+    private:
+     int priority_;
+     int line_;
+   };
+
+   // For each of the directives within user-agents, we keep global and specific
+   // match scores.
+   struct MatchHierarchy {
+     Match global;    // Match for '*'
+     Match specific;  // Match for queried agent.
+     void Clear() {
+       global.Clear();
+       specific.Clear();
+     }
+   };
+   MatchHierarchy allow_;     // Characters of 'url' matching Allow.
+   MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.
+
+   bool seen_global_agent_;         // True if processing global agent rules.
+   bool seen_specific_agent_;       // True if processing our specific agent.
+   bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
+   bool seen_separator_;            // True if saw any key: value pair.
+
+   // The path we want to pattern match. Not owned and only a valid pointer
+   // during the lifetime of *AllowedByRobots calls.
+   const char* path_;
+   // The User-Agents we are interested in. Not owned and only a valid
+   // pointer during the lifetime of *AllowedByRobots calls.
+   const std::vector<std::string>* user_agents_;
+
+   RobotsMatchStrategy* match_strategy_;
+ };
+
+ } // namespace googlebot
+ #endif // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
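
The header above exposes two entry points: the callback-based ParseRobotsTxt() parser and the RobotsMatcher class. The following is a minimal usage sketch, not part of the gem's vendored sources; it assumes the library is compiled and linked together with Abseil, and the handler name SitemapCollector plus the sample robots.txt body are illustrative only.

// Usage sketch (illustrative, not part of the vendored library).
#include <iostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "robots.h"

// A RobotsParseHandler that collects Sitemap directives; all other callbacks
// are no-ops. ParseRobotsTxt() invokes the callbacks in the order the
// directives appear in the file.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {}
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int line_num, absl::string_view value) override {}
  void HandleAllow(int line_num, absl::string_view value) override {}
  void HandleDisallow(int line_num, absl::string_view value) override {}
  void HandleSitemap(int line_num, absl::string_view value) override {
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {}

  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};

int main() {
  const std::string robots_body =
      "User-agent: *\n"
      "Disallow: /cgi-bin/\n"
      "Sitemap: https://example.com/sitemap.xml\n";

  // Low-level parsing via callbacks.
  SitemapCollector collector;
  googlebot::ParseRobotsTxt(robots_body, &collector);
  for (const std::string& sitemap : collector.sitemaps()) {
    std::cout << "sitemap: " << sitemap << std::endl;
  }

  // High-level matching: one user agent against one %-encoded URL.
  googlebot::RobotsMatcher matcher;
  const bool allowed = matcher.OneAgentAllowedByRobots(
      robots_body, "FooBot", "https://example.com/cgi-bin/script");
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  return 0;
}

With the sample rules, the URL falls under the global Disallow for /cgi-bin/, so the matcher is expected to report DISALLOWED.
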
@@ -0,0 +1,101 @@
+ // Copyright 2019 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //      https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots_main.cc
+ // -----------------------------------------------------------------------------
+ //
+ // Simple binary to assess whether a URL is accessible to a user-agent according
+ // to records found in a local robots.txt file, based on Google's robots.txt
+ // parsing and matching algorithms.
+ // Usage:
+ //   robots_main <local_path_to_robotstxt> <user_agent> <url>
+ // Arguments:
+ //   local_path_to_robotstxt: local path to a file containing robots.txt records.
+ //     For example: /home/users/username/robots.txt
+ //   user_agent: a token to be matched against records in the robots.txt.
+ //     For example: Googlebot
+ //   url: a URL to be matched against records in the robots.txt. The URL must be
+ //     %-encoded according to RFC3986.
+ //     For example: https://example.com/accessible/url.html
+ // Returns: Prints a sentence with the verdict about whether 'user_agent' is
+ //   allowed to access 'url' based on records in 'local_path_to_robotstxt'.
+ //
+ #include <fstream>
+ #include <iostream>
+
+ #include "robots.h"
+
+ bool LoadFile(const std::string& filename, std::string* result) {
+   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
+   if (file.is_open()) {
+     size_t size = file.tellg();
+     std::vector<char> buffer(size);
+     file.seekg(0, std::ios::beg);
+     file.read(buffer.data(), size);
+     file.close();
+     if (!file) return false;  // file reading error (failbit or badbit).
+     result->assign(buffer.begin(), buffer.end());
+     return true;
+   }
+   return false;
+ }
+
+ void ShowHelp(int argc, char** argv) {
+   std::cerr << "Shows whether the given user_agent and URI combination"
+             << " is allowed or disallowed by the given robots.txt file. "
+             << std::endl
+             << std::endl;
+   std::cerr << "Usage: " << std::endl
+             << " " << argv[0] << " <robots.txt filename> <user_agent> <URI>"
+             << std::endl
+             << std::endl;
+   std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
+             << std::endl;
+   std::cerr << "Example: " << std::endl
+             << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
+             << std::endl;
+ }
+
+ int main(int argc, char** argv) {
+   std::string filename = argc >= 2 ? argv[1] : "";
+   if (filename == "-h" || filename == "-help" || filename == "--help") {
+     ShowHelp(argc, argv);
+     return 0;
+   }
+   if (argc != 4) {
+     std::cerr << "Invalid amount of arguments. Showing help." << std::endl
+               << std::endl;
+     ShowHelp(argc, argv);
+     return 1;
+   }
+   std::string robots_content;
+   if (!(LoadFile(filename, &robots_content))) {
+     std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
+     return 1;
+   }
+
+   std::string user_agent = argv[2];
+   std::vector<std::string> user_agents(1, user_agent);
+   googlebot::RobotsMatcher matcher;
+   std::string url = argv[3];
+   bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+
+   std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
+   if (robots_content.empty()) {
+     std::cout << "notice: robots file is empty so all user-agents are allowed"
+               << std::endl;
+   }
+ }
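
For completeness, here is a sketch of the same check robots_main.cc performs, but with the robots.txt body held in memory and more than one user agent; per the robots.h contract, AllowedByRobots() returns true iff any agent in the vector may fetch the URL. This harness is not part of the gem, and the agent names, rules, and URL are illustrative only.

// In-memory variant of the robots_main.cc check (illustrative only).
#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  // Rules held in memory instead of being loaded from a file via LoadFile().
  const std::string robots_content =
      "User-agent: FooBot\n"
      "Disallow: /private/\n"
      "\n"
      "User-agent: *\n"
      "Allow: /\n";

  const std::vector<std::string> user_agents = {"FooBot", "BarBot"};
  const std::string url = "https://example.com/private/page.html";

  googlebot::RobotsMatcher matcher;
  // True iff at least one of the listed agents is allowed to fetch 'url'.
  const bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);

  std::cout << "URI '" << url << "': "
            << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  return 0;
}
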