google_robotstxt_parser 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.gitmodules +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +6 -0
- data/Guardfile +16 -0
- data/LICENSE +22 -0
- data/README.md +57 -0
- data/Rakefile +6 -0
- data/ext/robotstxt/.DS_Store +0 -0
- data/ext/robotstxt/extconf.rb +83 -0
- data/ext/robotstxt/robotstxt/.gitignore +1 -0
- data/ext/robotstxt/robotstxt/BUILD +40 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt +174 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt.in +30 -0
- data/ext/robotstxt/robotstxt/CONTRIBUTING.md +30 -0
- data/ext/robotstxt/robotstxt/LICENSE +203 -0
- data/ext/robotstxt/robotstxt/README.md +134 -0
- data/ext/robotstxt/robotstxt/WORKSPACE +28 -0
- data/ext/robotstxt/robotstxt/protocol-draft/README.md +9 -0
- data/ext/robotstxt/robotstxt/protocol-draft/draft-koster-rep-00.txt +529 -0
- data/ext/robotstxt/robotstxt/robots.cc +706 -0
- data/ext/robotstxt/robotstxt/robots.h +241 -0
- data/ext/robotstxt/robotstxt/robots_main.cc +101 -0
- data/ext/robotstxt/robotstxt/robots_test.cc +990 -0
- data/ext/robotstxt/robotstxt.cc +32 -0
- data/google_robotstxt_parser.gemspec +45 -0
- data/lib/google_robotstxt_parser/version.rb +6 -0
- data/lib/google_robotstxt_parser.rb +4 -0
- data/spec/google_robotstxt_parser_spec.rb +33 -0
- data/spec/spec_helper.rb +19 -0
- metadata +146 -0
data/ext/robotstxt/robotstxt/robots.h
@@ -0,0 +1,241 @@
+// Copyright 1999 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: robots.h
+// -----------------------------------------------------------------------------
+//
+// This file implements the standard defined by the Robots Exclusion Protocol
+// (REP) internet draft (I-D).
+// https://tools.ietf.org/html/draft-koster-rep
+//
+// Google doesn't follow the standard strictly, because there are a lot of
+// non-conforming robots.txt files out there, and we err on the side of
+// disallowing when this seems intended.
+//
+// An more user-friendly description of how Google handles robots.txt can be
+// found at:
+// https://developers.google.com/search/reference/robots_txt
+//
+// This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
+// and a matcher for URLs against a robots.txt (class RobotsMatcher).
+
+#ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+#define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace googlebot {
+// Handler for directives found in robots.txt. These callbacks are called by
+// ParseRobotsTxt() in the sequence they have been found in the file.
+class RobotsParseHandler {
+ public:
+  RobotsParseHandler() {}
+  virtual ~RobotsParseHandler() {}
+
+  // Disallow copying and assignment.
+  RobotsParseHandler(const RobotsParseHandler&) = delete;
+  RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
+
+  virtual void HandleRobotsStart() = 0;
+  virtual void HandleRobotsEnd() = 0;
+
+  virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
+  virtual void HandleAllow(int line_num, absl::string_view value) = 0;
+  virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
+
+  virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
+
+  // Any other unrecognized name/value pairs.
+  virtual void HandleUnknownAction(int line_num, absl::string_view action,
+                                   absl::string_view value) = 0;
+};
+
+// Parses body of a robots.txt and emits parse callbacks. This will accept
+// typical typos found in robots.txt, such as 'disalow'.
+//
+// Note, this function will accept all kind of input but will skip
+// everything that does not look like a robots directive.
+void ParseRobotsTxt(absl::string_view robots_body,
+                    RobotsParseHandler* parse_callback);
+
+// RobotsMatcher - matches robots.txt against URLs.
+//
+// The Matcher uses a default match strategy for Allow/Disallow patterns which
+// is the official way of Google crawler to match robots.txt. It is also
+// possible to provide a custom match strategy.
+//
+// The entry point for the user is to call one of the *AllowedByRobots()
+// methods that return directly if a URL is being allowed according to the
+// robots.txt and the crawl agent.
+// The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
+class RobotsMatchStrategy;
+class RobotsMatcher : protected RobotsParseHandler {
+ public:
+  // Create a RobotsMatcher with the default matching strategy. The default
+  // matching strategy is longest-match as opposed to the former internet draft
+  // that provisioned first-match strategy. Analysis shows that longest-match,
+  // while more restrictive for crawlers, is what webmasters assume when writing
+  // directives. For example, in case of conflicting matches (both Allow and
+  // Disallow), the longest match is the one the user wants. For example, in
+  // case of a robots.txt file that has the following rules
+  // Allow: /
+  // Disallow: /cgi-bin
+  // it's pretty obvious what the webmaster wants: they want to allow crawl of
+  // every URI except /cgi-bin. However, according to the expired internet
+  // standard, crawlers should be allowed to crawl everything with such a rule.
+  RobotsMatcher();
+
+  ~RobotsMatcher() override;
+
+  // Disallow copying and assignment.
+  RobotsMatcher(const RobotsMatcher&) = delete;
+  RobotsMatcher& operator=(const RobotsMatcher&) = delete;
+
+  // Verifies that the given user agent is valid to be matched against
+  // robots.txt. Valid user agent strings only contain the characters
+  // [a-zA-Z_-].
+  static bool IsValidUserAgentToObey(absl::string_view user_agent);
+
+  // Returns true iff 'url' is allowed to be fetched by any member of the
+  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+  bool AllowedByRobots(absl::string_view robots_body,
+                       const std::vector<std::string>* user_agents,
+                       const std::string& url);
+
+  // Do robots check for 'url' when there is only one user agent. 'url' must
+  // be %-encoded according to RFC3986.
+  bool OneAgentAllowedByRobots(absl::string_view robots_txt,
+                               const std::string& user_agent,
+                               const std::string& url);
+
+  // Returns true if we are disallowed from crawling a matching URI.
+  bool disallow() const;
+
+  // Returns true if we are disallowed from crawling a matching URI. Ignores any
+  // rules specified for the default user agent, and bases its results only on
+  // the specified user agents.
+  bool disallow_ignore_global() const;
+
+  // Returns true iff, when AllowedByRobots() was called, the robots file
+  // referred explicitly to one of the specified user agents.
+  bool ever_seen_specific_agent() const;
+
+  // Returns the line that matched or 0 if none matched.
+  const int matching_line() const;
+
+ protected:
+  // Parse callbacks.
+  // Protected because used in unittest. Never override RobotsMatcher, implement
+  // googlebot::RobotsParseHandler instead.
+  void HandleRobotsStart() override;
+  void HandleRobotsEnd() override {}
+
+  void HandleUserAgent(int line_num, absl::string_view value) override;
+  void HandleAllow(int line_num, absl::string_view value) override;
+  void HandleDisallow(int line_num, absl::string_view value) override;
+
+  void HandleSitemap(int line_num, absl::string_view value) override;
+  void HandleUnknownAction(int line_num, absl::string_view action,
+                           absl::string_view value) override;
+
+ protected:
+  // Extract the matchable part of a user agent string, essentially stopping at
+  // the first invalid character.
+  // Example: 'Googlebot/2.1' becomes 'Googlebot'
+  static absl::string_view ExtractUserAgent(absl::string_view user_agent);
+
+  // Initialize next path and user-agents to check. Path must contain only the
+  // path, params, and query (if any) of the url and must start with a '/'.
+  void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
+                             const char* path);
+
+  // Returns true if any user-agent was seen.
+  bool seen_any_agent() const {
+    return seen_global_agent_ || seen_specific_agent_;
+  }
+
+  // Instead of just maintaining a Boolean indicating whether a given line has
+  // matched, we maintain a count of the maximum number of characters matched by
+  // that pattern.
+  //
+  // This structure stores the information associated with a match (e.g. when a
+  // Disallow is matched) as priority of the match and line matching.
+  //
+  // The priority is initialized with a negative value to make sure that a match
+  // of priority 0 is higher priority than no match at all.
+  class Match {
+   private:
+    static const int kNoMatchPriority = -1;
+
+   public:
+    Match(int priority, int line) : priority_(priority), line_(line) {}
+    Match() : priority_(kNoMatchPriority), line_(0) {}
+
+    void Set(int priority, int line) {
+      priority_ = priority;
+      line_ = line;
+    }
+
+    void Clear() { Set(kNoMatchPriority, 0); }
+
+    int line() const { return line_; }
+    int priority() const { return priority_; }
+
+    static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
+      if (a.priority() > b.priority()) {
+        return a;
+      } else {
+        return b;
+      }
+    }
+
+   private:
+    int priority_;
+    int line_;
+  };
+
+  // For each of the directives within user-agents, we keep global and specific
+  // match scores.
+  struct MatchHierarchy {
+    Match global;    // Match for '*'
+    Match specific;  // Match for queried agent.
+    void Clear() {
+      global.Clear();
+      specific.Clear();
+    }
+  };
+  MatchHierarchy allow_;     // Characters of 'url' matching Allow.
+  MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.
+
+  bool seen_global_agent_;         // True if processing global agent rules.
+  bool seen_specific_agent_;       // True if processing our specific agent.
+  bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
+  bool seen_separator_;            // True if saw any key: value pair.
+
+  // The path we want to pattern match. Not owned and only a valid pointer
+  // during the lifetime of *AllowedByRobots calls.
+  const char* path_;
+  // The User-Agents we are interested in. Not owned and only a valid
+  // pointer during the lifetime of *AllowedByRobots calls.
+  const std::vector<std::string>* user_agents_;
+
+  RobotsMatchStrategy* match_strategy_;
+};
+
+}  // namespace googlebot
+#endif  // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
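
The header above declares the two entry points of the vendored C++ library that this gem wraps: `ParseRobotsTxt()` driving a `RobotsParseHandler` with low-level callbacks, and `RobotsMatcher` for checking URLs against a robots.txt body. The sketch below is not part of the package; it is a minimal illustration of how those declarations might be exercised, assuming `robots.h`, Abseil, and the robotstxt library are available to the build. The `SitemapCollector` class, the sample robots.txt body, and the FooBot/example.com names are invented for illustration.

```cpp
// Illustrative sketch only (not shipped in the gem): exercises the API
// declared in robots.h above. The handler class, robots.txt body, agent
// name, and URL are made up for the example.
#include <iostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "robots.h"

// A handler that only records Sitemap lines; every other callback is a no-op.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {}
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int line_num, absl::string_view value) override {}
  void HandleAllow(int line_num, absl::string_view value) override {}
  void HandleDisallow(int line_num, absl::string_view value) override {}
  void HandleSitemap(int line_num, absl::string_view value) override {
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {}

  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};

int main() {
  const std::string robots_body =
      "User-agent: *\n"
      "Disallow: /private/\n"
      "Sitemap: https://example.com/sitemap.xml\n";

  // Low-level parsing: callbacks fire in the order directives appear.
  SitemapCollector collector;
  googlebot::ParseRobotsTxt(robots_body, &collector);
  for (const std::string& sitemap : collector.sitemaps())
    std::cout << "sitemap: " << sitemap << std::endl;

  // High-level matching: one user agent, URL %-encoded per RFC3986.
  googlebot::RobotsMatcher matcher;
  bool allowed = matcher.OneAgentAllowedByRobots(
      robots_body, "FooBot", "https://example.com/private/page.html");
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;  // DISALLOWED
  return 0;
}
```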
data/ext/robotstxt/robotstxt/robots_main.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: robots_main.cc
+// -----------------------------------------------------------------------------
+//
+// Simple binary to assess whether a URL is accessible to a user-agent according
+// to records found in a local robots.txt file, based on Google's robots.txt
+// parsing and matching algorithms.
+// Usage:
+// robots_main <local_path_to_robotstxt> <user_agent> <url>
+// Arguments:
+// local_path_to_robotstxt: local path to a file containing robots.txt records.
+// For example: /home/users/username/robots.txt
+// user_agent: a token to be matched against records in the robots.txt.
+// For example: Googlebot
+// url: a url to be matched against records in the robots.txt. The URL must be
+// %-encoded according to RFC3986.
+// For example: https://example.com/accessible/url.html
+// Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
+// to access 'url' based on records in 'local_path_to_robotstxt'.
+//
+#include <fstream>
+#include <iostream>
+
+#include "robots.h"
+
+bool LoadFile(const std::string& filename, std::string* result) {
+  std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
+  if (file.is_open()) {
+    size_t size = file.tellg();
+    std::vector<char> buffer(size);
+    file.seekg(0, std::ios::beg);
+    file.read(buffer.data(), size);
+    file.close();
+    if (!file) return false;  // file reading error (failbit or badbit).
+    result->assign(buffer.begin(), buffer.end());
+    return true;
+  }
+  return false;
+}
+
+void ShowHelp(int argc, char** argv) {
+  std::cerr << "Shows whether the given user_agent and URI combination"
+            << " is allowed or disallowed by the given robots.txt file. "
+            << std::endl
+            << std::endl;
+  std::cerr << "Usage: " << std::endl
+            << " " << argv[0] << " <robots.txt filename> <user_agent> <URI>"
+            << std::endl
+            << std::endl;
+  std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
+            << std::endl;
+  std::cerr << "Example: " << std::endl
+            << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
+            << std::endl;
+}
+
+int main(int argc, char** argv) {
+  std::string filename = argc >= 2 ? argv[1] : "";
+  if (filename == "-h" || filename == "-help" || filename == "--help") {
+    ShowHelp(argc, argv);
+    return 0;
+  }
+  if (argc != 4) {
+    std::cerr << "Invalid amount of arguments. Showing help." << std::endl
+              << std::endl;
+    ShowHelp(argc, argv);
+    return 1;
+  }
+  std::string robots_content;
+  if (!(LoadFile(filename, &robots_content))) {
+    std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
+    return 1;
+  }
+
+  std::string user_agent = argv[2];
+  std::vector<std::string> user_agents(1, user_agent);
+  googlebot::RobotsMatcher matcher;
+  std::string url = argv[3];
+  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+
+  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+            << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
+  if (robots_content.empty()) {
+    std::cout << "notice: robots file is empty so all user-agents are allowed"
+              << std::endl;
+  }
+}
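
robots_main.cc above drives the matcher from the command line. The short sketch below is likewise not shipped in the package; it only illustrates the longest-match behavior described in the `RobotsMatcher` constructor comment (Allow: / combined with Disallow: /cgi-bin), using the invented agent name FooBot and made-up example.com URLs, and assuming the same build setup as robots_main.cc.

```cpp
// Illustrative sketch only: shows the longest-match rule from robots.h.
// Robots body, agent, and URLs are invented for the example.
#include <iostream>
#include <string>

#include "robots.h"

int main() {
  const std::string robots_body =
      "User-agent: *\n"
      "Allow: /\n"
      "Disallow: /cgi-bin\n";

  googlebot::RobotsMatcher matcher;
  std::cout << std::boolalpha;

  // "/cgi-bin/run" matches both patterns; "Disallow: /cgi-bin" is the longer
  // match, so the URL is disallowed.
  std::cout << matcher.OneAgentAllowedByRobots(
                   robots_body, "FooBot", "http://example.com/cgi-bin/run")
            << std::endl;  // false (disallowed)

  // "/index.html" matches only "Allow: /", so it is allowed.
  std::cout << matcher.OneAgentAllowedByRobots(
                   robots_body, "FooBot", "http://example.com/index.html")
            << std::endl;  // true (allowed)
  return 0;
}
```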