google_robotstxt_parser 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
+ // Copyright 1999 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots.h
+ // -----------------------------------------------------------------------------
+ //
+ // This file implements the standard defined by the Robots Exclusion Protocol
+ // (REP) internet draft (I-D).
+ //   https://tools.ietf.org/html/draft-koster-rep
+ //
+ // Google doesn't follow the standard strictly, because there are a lot of
+ // non-conforming robots.txt files out there, and we err on the side of
+ // disallowing when this seems intended.
+ //
+ // A more user-friendly description of how Google handles robots.txt can be
+ // found at:
+ //   https://developers.google.com/search/reference/robots_txt
+ //
+ // This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
+ // and a matcher for URLs against a robots.txt (class RobotsMatcher).
+
+ #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+ #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+
+ #include <string>
+ #include <vector>
+
+ #include "absl/strings/string_view.h"
+
+ namespace googlebot {
+ // Handler for directives found in robots.txt. These callbacks are called by
+ // ParseRobotsTxt() in the sequence they have been found in the file.
+ class RobotsParseHandler {
+  public:
+   RobotsParseHandler() {}
+   virtual ~RobotsParseHandler() {}
+
+   // Disallow copying and assignment.
+   RobotsParseHandler(const RobotsParseHandler&) = delete;
+   RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
+
+   virtual void HandleRobotsStart() = 0;
+   virtual void HandleRobotsEnd() = 0;
+
+   virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
+   virtual void HandleAllow(int line_num, absl::string_view value) = 0;
+   virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
+
+   virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
+
+   // Any other unrecognized name/value pairs.
+   virtual void HandleUnknownAction(int line_num, absl::string_view action,
+                                    absl::string_view value) = 0;
+ };
+
+ // Parses body of a robots.txt and emits parse callbacks. This will accept
+ // typical typos found in robots.txt, such as 'disalow'.
+ //
+ // Note that this function will accept all kinds of input, but will skip
+ // everything that does not look like a robots directive.
+ void ParseRobotsTxt(absl::string_view robots_body,
+                     RobotsParseHandler* parse_callback);
+
+ // RobotsMatcher - matches robots.txt against URLs.
+ //
+ // The Matcher uses a default match strategy for Allow/Disallow patterns, which
+ // is the official way Google's crawler matches robots.txt. It is also
+ // possible to provide a custom match strategy.
+ //
+ // The entry point for the user is to call one of the *AllowedByRobots()
+ // methods, which return whether a URL is allowed according to the robots.txt
+ // and the crawl agent.
+ // A RobotsMatcher can be re-used across URLs and robots.txt files, but it is
+ // not thread-safe.
+ class RobotsMatchStrategy;
+ class RobotsMatcher : protected RobotsParseHandler {
+  public:
+   // Create a RobotsMatcher with the default matching strategy. The default
+   // matching strategy is longest-match, as opposed to the first-match strategy
+   // provisioned by an earlier internet draft. Analysis shows that longest-match,
+   // while more restrictive for crawlers, is what webmasters assume when writing
+   // directives. In case of conflicting matches (both Allow and Disallow), the
+   // longest match is the one the user wants. For example, in case of a
+   // robots.txt file that has the following rules
+   //   Allow: /
+   //   Disallow: /cgi-bin
+   // it's pretty obvious what the webmaster wants: they want to allow crawl of
+   // every URI except /cgi-bin. However, according to the expired internet
+   // standard, crawlers should be allowed to crawl everything with such a rule.
+   RobotsMatcher();
+
+   ~RobotsMatcher() override;
+
+   // Disallow copying and assignment.
+   RobotsMatcher(const RobotsMatcher&) = delete;
+   RobotsMatcher& operator=(const RobotsMatcher&) = delete;
+
+   // Verifies that the given user agent is valid to be matched against
+   // robots.txt. Valid user agent strings only contain the characters
+   // [a-zA-Z_-].
+   static bool IsValidUserAgentToObey(absl::string_view user_agent);
+
+   // Returns true iff 'url' is allowed to be fetched by any member of the
+   // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+   bool AllowedByRobots(absl::string_view robots_body,
+                        const std::vector<std::string>* user_agents,
+                        const std::string& url);
+
+   // Does the robots check for 'url' when there is only one user agent. 'url'
+   // must be %-encoded according to RFC3986.
+   bool OneAgentAllowedByRobots(absl::string_view robots_txt,
+                                const std::string& user_agent,
+                                const std::string& url);
+
+   // Returns true if we are disallowed from crawling a matching URI.
+   bool disallow() const;
+
+   // Returns true if we are disallowed from crawling a matching URI. Ignores any
+   // rules specified for the default user agent, and bases its results only on
+   // the specified user agents.
+   bool disallow_ignore_global() const;
+
+   // Returns true iff, when AllowedByRobots() was called, the robots file
+   // referred explicitly to one of the specified user agents.
+   bool ever_seen_specific_agent() const;
+
+   // Returns the line that matched or 0 if none matched.
+   const int matching_line() const;
+
+  protected:
+   // Parse callbacks.
+   // Protected because they are used in the unittest. Never override
+   // RobotsMatcher; implement googlebot::RobotsParseHandler instead.
+   void HandleRobotsStart() override;
+   void HandleRobotsEnd() override {}
+
+   void HandleUserAgent(int line_num, absl::string_view value) override;
+   void HandleAllow(int line_num, absl::string_view value) override;
+   void HandleDisallow(int line_num, absl::string_view value) override;
+
+   void HandleSitemap(int line_num, absl::string_view value) override;
+   void HandleUnknownAction(int line_num, absl::string_view action,
+                            absl::string_view value) override;
+
+  protected:
+   // Extract the matchable part of a user agent string, essentially stopping at
+   // the first invalid character.
+   // Example: 'Googlebot/2.1' becomes 'Googlebot'
+   static absl::string_view ExtractUserAgent(absl::string_view user_agent);
+
+   // Initialize next path and user-agents to check. Path must contain only the
+   // path, params, and query (if any) of the url and must start with a '/'.
+   void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
+                              const char* path);
+
+   // Returns true if any user-agent was seen.
+   bool seen_any_agent() const {
+     return seen_global_agent_ || seen_specific_agent_;
+   }
+
+   // Instead of just maintaining a Boolean indicating whether a given line has
+   // matched, we maintain a count of the maximum number of characters matched by
+   // that pattern.
+   //
+   // This structure stores the information associated with a match (e.g. when a
+   // Disallow is matched) as the priority of the match and the matching line.
+   //
+   // The priority is initialized with a negative value to make sure that a match
+   // of priority 0 is higher priority than no match at all.
+   class Match {
+    private:
+     static const int kNoMatchPriority = -1;
+
+    public:
+     Match(int priority, int line) : priority_(priority), line_(line) {}
+     Match() : priority_(kNoMatchPriority), line_(0) {}
+
+     void Set(int priority, int line) {
+       priority_ = priority;
+       line_ = line;
+     }
+
+     void Clear() { Set(kNoMatchPriority, 0); }
+
+     int line() const { return line_; }
+     int priority() const { return priority_; }
+
+     static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
+       if (a.priority() > b.priority()) {
+         return a;
+       } else {
+         return b;
+       }
+     }
+
+    private:
+     int priority_;
+     int line_;
+   };
+
+   // For each of the directives within user-agents, we keep global and specific
+   // match scores.
+   struct MatchHierarchy {
+     Match global;    // Match for '*'
+     Match specific;  // Match for queried agent.
+     void Clear() {
+       global.Clear();
+       specific.Clear();
+     }
+   };
+   MatchHierarchy allow_;     // Characters of 'url' matching Allow.
+   MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.
+
+   bool seen_global_agent_;         // True if processing global agent rules.
+   bool seen_specific_agent_;       // True if processing our specific agent.
+   bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
+   bool seen_separator_;            // True if saw any key: value pair.
+
+   // The path we want to pattern match. Not owned and only a valid pointer
+   // during the lifetime of *AllowedByRobots calls.
+   const char* path_;
+   // The User-Agents we are interested in. Not owned and only a valid
+   // pointer during the lifetime of *AllowedByRobots calls.
+   const std::vector<std::string>* user_agents_;
+
+   RobotsMatchStrategy* match_strategy_;
+ };
+
+ }  // namespace googlebot
+ #endif  // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
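To make the matcher API declared above concrete, here is a minimal usage sketch. It is not part of the package; it assumes you compile and link against this library and Abseil, and it simply reuses the Allow / Disallow: /cgi-bin example from the header comment, with "FooBot" as an illustrative user-agent token.

#include <iostream>
#include <string>

#include "robots.h"

int main() {
  // The rules from the longest-match example in the header comment:
  // everything is allowed except anything under /cgi-bin.
  const std::string robots_txt =
      "user-agent: FooBot\n"
      "allow: /\n"
      "disallow: /cgi-bin\n";

  googlebot::RobotsMatcher matcher;
  // OneAgentAllowedByRobots() is the single-agent entry point declared above;
  // URLs must be %-encoded according to RFC3986.
  std::cout << matcher.OneAgentAllowedByRobots(
                   robots_txt, "FooBot", "http://example.com/index.html")
            << std::endl;  // 1: allowed ("Allow: /" is the longest match).
  std::cout << matcher.OneAgentAllowedByRobots(
                   robots_txt, "FooBot", "http://example.com/cgi-bin/status")
            << std::endl;  // 0: disallowed ("Disallow: /cgi-bin" matches more
                           // characters, so it wins under longest-match).
}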
@@ -0,0 +1,101 @@
+ // Copyright 2019 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots_main.cc
+ // -----------------------------------------------------------------------------
+ //
+ // Simple binary to assess whether a URL is accessible to a user-agent according
+ // to records found in a local robots.txt file, based on Google's robots.txt
+ // parsing and matching algorithms.
+ // Usage:
+ //     robots_main <local_path_to_robotstxt> <user_agent> <url>
+ // Arguments:
+ // local_path_to_robotstxt: local path to a file containing robots.txt records.
+ //   For example: /home/users/username/robots.txt
+ // user_agent: a token to be matched against records in the robots.txt.
+ //   For example: Googlebot
+ // url: a URL to be matched against records in the robots.txt. The URL must be
+ //   %-encoded according to RFC3986.
+ //   For example: https://example.com/accessible/url.html
+ // Returns: Prints a sentence with the verdict about whether 'user_agent' is
+ //   allowed to access 'url' based on records in 'local_path_to_robotstxt'.
+ //
+ #include <fstream>
+ #include <iostream>
+ #include <string>
+ #include <vector>
+
+ #include "robots.h"
+
+ bool LoadFile(const std::string& filename, std::string* result) {
+   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
+   if (file.is_open()) {
+     size_t size = file.tellg();
+     std::vector<char> buffer(size);
+     file.seekg(0, std::ios::beg);
+     file.read(buffer.data(), size);
+     file.close();
+     if (!file) return false;  // file reading error (failbit or badbit).
+     result->assign(buffer.begin(), buffer.end());
+     return true;
+   }
+   return false;
+ }
+
+ void ShowHelp(int argc, char** argv) {
+   std::cerr << "Shows whether the given user_agent and URI combination"
+             << " is allowed or disallowed by the given robots.txt file. "
+             << std::endl
+             << std::endl;
+   std::cerr << "Usage: " << std::endl
+             << " " << argv[0] << " <robots.txt filename> <user_agent> <URI>"
+             << std::endl
+             << std::endl;
+   std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
+             << std::endl;
+   std::cerr << "Example: " << std::endl
+             << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
+             << std::endl;
+ }
+
+ int main(int argc, char** argv) {
+   std::string filename = argc >= 2 ? argv[1] : "";
+   if (filename == "-h" || filename == "-help" || filename == "--help") {
+     ShowHelp(argc, argv);
+     return 0;
+   }
+   if (argc != 4) {
+     std::cerr << "Invalid number of arguments. Showing help." << std::endl
+               << std::endl;
+     ShowHelp(argc, argv);
+     return 1;
+   }
+   std::string robots_content;
+   if (!(LoadFile(filename, &robots_content))) {
+     std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
+     return 1;
+   }
+
+   std::string user_agent = argv[2];
+   std::vector<std::string> user_agents(1, user_agent);
+   googlebot::RobotsMatcher matcher;
+   std::string url = argv[3];
+   bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+
+   std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
+   if (robots_content.empty()) {
+     std::cout << "notice: robots file is empty so all user-agents are allowed"
+               << std::endl;
+   }
+ }
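For completeness, here is a sketch of the lower-level ParseRobotsTxt() path declared in robots.h, which robots_main.cc does not exercise. The SitemapCollector class and the inline robots.txt body are illustrative only, not part of the package; the sketch assumes the same build setup as the library itself (Abseil available, linked against this parser).

#include <iostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "robots.h"

// Hypothetical handler: collects Sitemap directives and ignores everything else.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {}
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int line_num, absl::string_view value) override {}
  void HandleAllow(int line_num, absl::string_view value) override {}
  void HandleDisallow(int line_num, absl::string_view value) override {}
  void HandleSitemap(int line_num, absl::string_view value) override {
    // Copy the string_view into an owned std::string.
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {}

  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};

int main() {
  // Hypothetical robots.txt body; callbacks fire in the order lines appear.
  const std::string robots_txt =
      "user-agent: *\n"
      "disallow: /private/\n"
      "sitemap: https://example.com/sitemap.xml\n";

  SitemapCollector collector;
  googlebot::ParseRobotsTxt(robots_txt, &collector);
  for (const std::string& sitemap : collector.sitemaps()) {
    std::cout << "sitemap: " << sitemap << std::endl;
  }
}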