google_robotstxt_parser 0.0.3
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.gitmodules +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +6 -0
- data/Guardfile +16 -0
- data/LICENSE +22 -0
- data/README.md +57 -0
- data/Rakefile +6 -0
- data/ext/robotstxt/.DS_Store +0 -0
- data/ext/robotstxt/extconf.rb +83 -0
- data/ext/robotstxt/robotstxt/.gitignore +1 -0
- data/ext/robotstxt/robotstxt/BUILD +40 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt +174 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt.in +30 -0
- data/ext/robotstxt/robotstxt/CONTRIBUTING.md +30 -0
- data/ext/robotstxt/robotstxt/LICENSE +203 -0
- data/ext/robotstxt/robotstxt/README.md +134 -0
- data/ext/robotstxt/robotstxt/WORKSPACE +28 -0
- data/ext/robotstxt/robotstxt/protocol-draft/README.md +9 -0
- data/ext/robotstxt/robotstxt/protocol-draft/draft-koster-rep-00.txt +529 -0
- data/ext/robotstxt/robotstxt/robots.cc +706 -0
- data/ext/robotstxt/robotstxt/robots.h +241 -0
- data/ext/robotstxt/robotstxt/robots_main.cc +101 -0
- data/ext/robotstxt/robotstxt/robots_test.cc +990 -0
- data/ext/robotstxt/robotstxt.cc +32 -0
- data/google_robotstxt_parser.gemspec +45 -0
- data/lib/google_robotstxt_parser/version.rb +6 -0
- data/lib/google_robotstxt_parser.rb +4 -0
- data/spec/google_robotstxt_parser_spec.rb +33 -0
- data/spec/spec_helper.rb +19 -0
- metadata +146 -0
data/ext/robotstxt/robotstxt/robots.cc (new file, +706 lines):
// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: robots.cc
// -----------------------------------------------------------------------------
//
// Implements expired internet draft
//   http://www.robotstxt.org/norobots-rfc.txt
// with Google-specific optimizations detailed at
//   https://developers.google.com/search/reference/robots_txt

#include "robots.h"

#include <stdlib.h>

#include <cstddef>
#include <vector>

#include "absl/base/macros.h"
#include "absl/container/fixed_array.h"
#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"

// Allow for typos such as DISALOW in robots.txt.
static bool kAllowFrequentTypos = true;

namespace googlebot {

// A RobotsMatchStrategy defines a strategy for matching individual lines in a
// robots.txt file. Each Match* method should return a match priority, which is
// interpreted as:
//
// match priority < 0:
//    No match.
//
// match priority == 0:
//    Match, but treat it as if matched an empty pattern.
//
// match priority > 0:
//    Match.
class RobotsMatchStrategy {
 public:
  virtual ~RobotsMatchStrategy() {}

  virtual int MatchAllow(absl::string_view path,
                         absl::string_view pattern) = 0;
  virtual int MatchDisallow(absl::string_view path,
                            absl::string_view pattern) = 0;

 protected:
  // Implements robots.txt pattern matching.
  static bool Matches(absl::string_view path, absl::string_view pattern);
};

// Returns true if URI path matches the specified pattern. Pattern is anchored
// at the beginning of path. '$' is special only at the end of pattern.
//
// Since 'path' and 'pattern' are both externally determined (by the webmaster),
// we make sure to have acceptable worst-case performance.
/* static */ bool RobotsMatchStrategy::Matches(
    absl::string_view path, absl::string_view pattern) {
  const size_t pathlen = path.length();
  absl::FixedArray<size_t> pos(pathlen + 1);
  int numpos;

  // The pos[] array holds a sorted list of indexes of 'path', with length
  // 'numpos'. At the start and end of each iteration of the main loop below,
  // the pos[] array will hold a list of the prefixes of the 'path' which can
  // match the current prefix of 'pattern'. If this list is ever empty,
  // return false. If we reach the end of 'pattern' with at least one element
  // in pos[], return true.

  pos[0] = 0;
  numpos = 1;

  for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
    if (*pat == '$' && pat + 1 == pattern.end()) {
      return (pos[numpos - 1] == pathlen);
    }
    if (*pat == '*') {
      numpos = pathlen - pos[0] + 1;
      for (int i = 1; i < numpos; i++) {
        pos[i] = pos[i-1] + 1;
      }
    } else {
      // Includes '$' when not at end of pattern.
      int newnumpos = 0;
      for (int i = 0; i < numpos; i++) {
        if (pos[i] < pathlen && path[pos[i]] == *pat) {
          pos[newnumpos++] = pos[i] + 1;
        }
      }
      numpos = newnumpos;
      if (numpos == 0) return false;
    }
  }

  return true;
}
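
// Illustrative examples (editorial addition, not in the upstream file):
// patterns are anchored at the start of the path, '*' matches any run of
// characters, and a trailing '$' anchors the pattern to the end of the path:
//
//   Matches("/fish/salmon.html", "/fish")        -> true   (prefix match)
//   Matches("/fish/salmon.html", "/fish*.html")  -> true   ('*' spans "/salmon")
//   Matches("/fish/salmon.html", "/fish$")       -> false  ('$' demands end-of-path)
//   Matches("/Fish", "/fish")                    -> false  (case-sensitive)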

static const char* kHexDigits = "0123456789ABCDEF";

// GetPathParamsQuery is not in anonymous namespace to allow testing.
//
// Extracts path (with params) and query part from URL. Removes scheme,
// authority, and fragment. Result always starts with "/".
// Returns "/" if the url doesn't have a path or is not valid.
std::string GetPathParamsQuery(const std::string& url) {
  std::string path;

  // Initial two slashes are ignored.
  size_t search_start = 0;
  if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2;

  size_t early_path = url.find_first_of("/?;", search_start);
  size_t protocol_end = url.find("://", search_start);
  if (early_path < protocol_end) {
    // If path, param or query starts before ://, :// doesn't indicate protocol.
    protocol_end = std::string::npos;
  }
  if (protocol_end == std::string::npos) {
    protocol_end = search_start;
  } else {
    protocol_end += 3;
  }

  size_t path_start = url.find_first_of("/?;", protocol_end);
  if (path_start != std::string::npos) {
    size_t hash_pos = url.find('#', search_start);
    if (hash_pos < path_start) return "/";
    size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos;
    if (url[path_start] != '/') {
      // Prepend a slash if the result would start e.g. with '?'.
      return "/" + url.substr(path_start, path_end - path_start);
    }
    return url.substr(path_start, path_end - path_start);
  }

  return "/";
}
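
// Illustrative examples (editorial addition, not in the upstream file):
//
//   GetPathParamsQuery("https://example.com/fish?id=1#frag")  -> "/fish?id=1"
//   GetPathParamsQuery("example.com/a/b;p=1")                 -> "/a/b;p=1"
//   GetPathParamsQuery("https://example.com")                 -> "/"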

// MaybeEscapePattern is not in anonymous namespace to allow testing.
//
// Canonicalize the allowed/disallowed paths. For example:
//     /SanJoséSellers ==> /SanJos%C3%A9Sellers
//     %aa ==> %AA
// When the function returns, (*dst) either points to src, or is newly
// allocated.
// Returns true if dst was newly allocated.
bool MaybeEscapePattern(const char* src, char** dst) {
  int num_to_escape = 0;
  bool need_capitalize = false;

  // First, scan the buffer to see if changes are needed. Most don't.
  for (int i = 0; src[i] != 0; i++) {
    // (a) % escape sequence.
    if (src[i] == '%' &&
        absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
      if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) {
        need_capitalize = true;
      }
      i += 2;
    // (b) needs escaping.
    } else if (src[i] & 0x80) {
      num_to_escape++;
    }
    // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
  }
  // Return if no changes needed.
  if (!num_to_escape && !need_capitalize) {
    (*dst) = const_cast<char*>(src);
    return false;
  }
  (*dst) = new char[num_to_escape * 2 + strlen(src) + 1];
  int j = 0;
  for (int i = 0; src[i] != 0; i++) {
    // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
    if (src[i] == '%' &&
        absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
      (*dst)[j++] = src[i++];
      (*dst)[j++] = absl::ascii_toupper(src[i++]);
      (*dst)[j++] = absl::ascii_toupper(src[i]);
    // (b) %-escape octets whose highest bit is set. These are outside the
    // ASCII range.
    } else if (src[i] & 0x80) {
      (*dst)[j++] = '%';
      (*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf];
      (*dst)[j++] = kHexDigits[src[i] & 0xf];
    // (c) Normal character, no modification needed.
    } else {
      (*dst)[j++] = src[i];
    }
  }
  (*dst)[j] = 0;
  return true;
}
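
// Illustrative note (editorial addition, not in the upstream file): escaping
// leaves ASCII bytes untouched and only rewrites octets with the high bit set,
// so for UTF-8 input only the two bytes of "é" (0xC3 0xA9) change:
//
//   MaybeEscapePattern("/SanJoséSellers", &out)  -> true,
//       out == "/SanJos%C3%A9Sellers"  (caller must delete[] out)
//   MaybeEscapePattern("/fish", &out)            -> false, out aliases src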

// Internal helper classes and functions.
namespace {

// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
// a key. This class can parse a text-representation (including common typos)
// and represent them as an enumeration which allows for faster processing
// afterwards.
// For unparsable keys, the original string representation is kept.
class ParsedRobotsKey {
 public:
  enum KeyType {
    // Generic highlevel fields.
    USER_AGENT,
    SITEMAP,

    // Fields within a user-agent.
    ALLOW,
    DISALLOW,

    // Unrecognized field; kept as-is. High number so that additions to the
    // enumeration above does not change the serialization.
    UNKNOWN = 128
  };

  ParsedRobotsKey() : type_(UNKNOWN) {}

  // Disallow copying and assignment.
  ParsedRobotsKey(const ParsedRobotsKey&) = delete;
  ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete;

  // Parse given key text. Does not copy the text, so the text_key must stay
  // valid for the object's life-time or the next Parse() call.
  void Parse(absl::string_view key);

  // Returns the type of key.
  KeyType type() const { return type_; }

  // If this is an unknown key, get the text.
  absl::string_view GetUnknownText() const;

 private:
  static bool KeyIsUserAgent(absl::string_view key);
  static bool KeyIsAllow(absl::string_view key);
  static bool KeyIsDisallow(absl::string_view key);
  static bool KeyIsSitemap(absl::string_view key);

  KeyType type_;
  absl::string_view key_text_;
};

void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key,
                           absl::string_view value,
                           RobotsParseHandler* handler) {
  typedef ParsedRobotsKey Key;
  switch (key.type()) {
    case Key::USER_AGENT: handler->HandleUserAgent(line, value); break;
    case Key::ALLOW: handler->HandleAllow(line, value); break;
    case Key::DISALLOW: handler->HandleDisallow(line, value); break;
    case Key::SITEMAP: handler->HandleSitemap(line, value); break;
    case Key::UNKNOWN:
      handler->HandleUnknownAction(line, key.GetUnknownText(), value);
      break;
    // No default case Key:: to have the compiler warn about new values.
  }
}

class RobotsTxtParser {
 public:
  typedef ParsedRobotsKey Key;

  RobotsTxtParser(absl::string_view robots_body,
                  RobotsParseHandler* handler)
      : robots_body_(robots_body), handler_(handler) {
  }

  void Parse();

 private:
  static bool GetKeyAndValueFrom(char ** key, char **value, char *line);
  static void StripWhitespaceSlowly(char ** s);

  void ParseAndEmitLine(int current_line, char* line);
  bool NeedEscapeValueForKey(const Key& key);

  absl::string_view robots_body_;
  RobotsParseHandler* const handler_;
};

bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) {
  switch (key.type()) {
    case RobotsTxtParser::Key::USER_AGENT:
    case RobotsTxtParser::Key::SITEMAP:
      return false;
    default:
      return true;
  }
}

// Removes leading and trailing whitespace from null-terminated string s.
/* static */ void RobotsTxtParser::StripWhitespaceSlowly(char ** s) {
  absl::string_view stripped = absl::StripAsciiWhitespace(*s);
  *s = const_cast<char*>(stripped.data());
  (*s)[stripped.size()] = '\0';
}

bool RobotsTxtParser::GetKeyAndValueFrom(char ** key, char ** value,
                                         char * line) {
  // Remove comments from the current robots.txt line.
  char* const comment = strchr(line, '#');
  if (nullptr != comment) {
    *comment = '\0';
  }
  StripWhitespaceSlowly(&line);

  // Rules must match the following pattern:
  //   <key>[ \t]*:[ \t]*<value>
  char* sep = strchr(line, ':');
  if (nullptr == sep) {
    // Google-specific optimization: some people forget the colon, so we need to
    // accept whitespace in its stead.
    static const char * const kWhite = " \t";
    sep = strpbrk(line, kWhite);
    if (nullptr != sep) {
      const char* const val = sep + strspn(sep, kWhite);
      assert(*val);  // since we dropped trailing whitespace above.
      if (nullptr != strpbrk(val, kWhite)) {
        // We only accept whitespace as a separator if there are exactly two
        // sequences of non-whitespace characters. If we get here, there were
        // more than 2 such sequences since we stripped trailing whitespace
        // above.
        return false;
      }
    }
  }
  if (nullptr == sep) {
    return false;  // Couldn't find a separator.
  }

  *key = line;  // Key starts at beginning of line.
  *sep = '\0';  // And stops at the separator.
  StripWhitespaceSlowly(key);  // Get rid of any trailing whitespace.

  if (strlen(*key) > 0) {
    *value = 1 + sep;  // Value starts after the separator.
    StripWhitespaceSlowly(value);  // Get rid of any leading whitespace.
    return true;
  }
  return false;
}
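
// Illustrative examples (editorial addition, not in the upstream file):
// comments are stripped first, then the line splits at ':' or, as a
// typo-tolerance, at whitespace when the colon is missing:
//
//   "Disallow: /private  # staff only"  -> key "Disallow", value "/private"
//   "user-agent FooBot"                 -> key "user-agent", value "FooBot"
//   "Disallow /a /b"                    -> rejected (three tokens, no colon)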

void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line) {
  char* string_key;
  char* value;
  if (!GetKeyAndValueFrom(&string_key, &value, line)) {
    return;
  }

  Key key;
  key.Parse(string_key);
  if (NeedEscapeValueForKey(key)) {
    char* escaped_value = nullptr;
    const bool is_escaped = MaybeEscapePattern(value, &escaped_value);
    EmitKeyValueToHandler(current_line, key, escaped_value, handler_);
    if (is_escaped) delete[] escaped_value;
  } else {
    EmitKeyValueToHandler(current_line, key, value, handler_);
  }
}

void RobotsTxtParser::Parse() {
  // UTF-8 byte order marks.
  static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF};

  // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
  // fairly safe to assume any valid line isn't going to be more than many times
  // that max url length of 2KB. We want some padding for
  // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
  // If so, we can ignore the chars on a line past that.
  const int kMaxLineLen = 2083 * 8;
  // Allocate a buffer used to process the current line.
  char* const line_buffer = new char[kMaxLineLen];
  // last_line_pos is the last writeable pos within the line array
  // (only a final '\0' may go here).
  const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
  char* line_pos = line_buffer;
  int line_num = 0;
  size_t bom_pos = 0;
  bool last_was_carriage_return = false;
  handler_->HandleRobotsStart();

  {
    for (const unsigned char ch : robots_body_) {
      ABSL_ASSERT(line_pos <= line_buffer_end);
      // Google-specific optimization: UTF-8 byte order marks should never
      // appear in a robots.txt file, but they do nevertheless. Skipping
      // possible BOM-prefix in the first bytes of the input.
      if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) {
        continue;
      }
      bom_pos = sizeof(utf_bom);
      if (ch != 0x0A && ch != 0x0D) {  // Non-line-ending char case.
        // Put in next spot on current line, as long as there's room.
        if (line_pos < line_buffer_end) {
          *(line_pos++) = ch;
        }
      } else {  // Line-ending character char case.
        *line_pos = '\0';
        // Only emit an empty line if this was not due to the second character
        // of the DOS line-ending \r\n .
        const bool is_CRLF_continuation =
            (line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
        if (!is_CRLF_continuation) {
          ParseAndEmitLine(++line_num, line_buffer);
        }
        line_pos = line_buffer;
        last_was_carriage_return = (ch == 0x0D);
      }
    }
  }
  *line_pos = '\0';
  ParseAndEmitLine(++line_num, line_buffer);
  handler_->HandleRobotsEnd();
  delete [] line_buffer;
}
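
// Illustrative note (editorial addition, not in the upstream file): the loop
// above accepts \n, \r, and \r\n line endings interchangeably. The body
// "a: b\r\nc: d\re: f" is emitted as the three lines "a: b", "c: d", "e: f";
// the \n of a \r\n pair is swallowed instead of producing an empty line.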

// Implements the default robots.txt matching strategy. The maximum number of
// characters matched by a pattern is returned as its match priority.
class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
 public:
  LongestMatchRobotsMatchStrategy() { }

  // Disallow copying and assignment.
  LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
      delete;
  LongestMatchRobotsMatchStrategy& operator=(
      const LongestMatchRobotsMatchStrategy&) = delete;

  int MatchAllow(absl::string_view path, absl::string_view pattern) override;
  int MatchDisallow(absl::string_view path, absl::string_view pattern) override;
};
}  // end anonymous namespace

void ParseRobotsTxt(absl::string_view robots_body,
                    RobotsParseHandler* parse_callback) {
  RobotsTxtParser parser(robots_body, parse_callback);
  parser.Parse();
}
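
// Illustrative sketch (editorial addition, not in the upstream file):
// ParseRobotsTxt drives any RobotsParseHandler, so a handler that only
// collects sitemap URLs needs no matching logic at all. A hypothetical
// subclass could look like:
//
//   class SitemapCollector : public RobotsParseHandler {
//    public:
//     void HandleRobotsStart() override {}
//     void HandleRobotsEnd() override {}
//     void HandleUserAgent(int line_num, absl::string_view value) override {}
//     void HandleAllow(int line_num, absl::string_view value) override {}
//     void HandleDisallow(int line_num, absl::string_view value) override {}
//     void HandleSitemap(int line_num, absl::string_view value) override {
//       sitemaps_.emplace_back(value.data(), value.size());
//     }
//     void HandleUnknownAction(int line_num, absl::string_view action,
//                              absl::string_view value) override {}
//     std::vector<std::string> sitemaps_;
//   };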

RobotsMatcher::RobotsMatcher()
    : seen_global_agent_(false),
      seen_specific_agent_(false),
      ever_seen_specific_agent_(false),
      seen_separator_(false),
      path_(nullptr),
      user_agents_(nullptr) {
  match_strategy_ = new LongestMatchRobotsMatchStrategy();
}

RobotsMatcher::~RobotsMatcher() {
  delete match_strategy_;
}

bool RobotsMatcher::ever_seen_specific_agent() const {
  return ever_seen_specific_agent_;
}

void RobotsMatcher::InitUserAgentsAndPath(
    const std::vector<std::string>* user_agents, const char* path) {
  // The RobotsParser object doesn't own path_ or user_agents_, so overwriting
  // these pointers doesn't cause a memory leak.
  path_ = path;
  ABSL_ASSERT('/' == *path_);
  user_agents_ = user_agents;
}

bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
                                    const std::vector<std::string>* user_agents,
                                    const std::string& url) {
  // The url is not normalized (escaped, percent encoded) here because the user
  // is asked to provide it in escaped form already.
  std::string path = GetPathParamsQuery(url);
  InitUserAgentsAndPath(user_agents, path.c_str());
  ParseRobotsTxt(robots_body, this);
  return !disallow();
}

bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
                                            const std::string& user_agent,
                                            const std::string& url) {
  std::vector<std::string> v;
  v.push_back(user_agent);
  return AllowedByRobots(robots_txt, &v, url);
}
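
// Illustrative usage (editorial addition, not in the upstream file):
//
//   googlebot::RobotsMatcher matcher;
//   bool ok = matcher.OneAgentAllowedByRobots(
//       "user-agent: FooBot\ndisallow: /private\n",   // robots.txt body
//       "FooBot",                                     // user-agent token
//       "https://example.com/private/page.html");     // => ok == false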

bool RobotsMatcher::disallow() const {
  if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
    return (disallow_.specific.priority() > allow_.specific.priority());
  }

  if (ever_seen_specific_agent_) {
    // Matching group for user-agent but either without disallow or empty one,
    // i.e. priority == 0.
    return false;
  }

  if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) {
    return disallow_.global.priority() > allow_.global.priority();
  }
  return false;
}
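
// Illustrative note (editorial addition, not in the upstream file): with the
// longest-match strategy, priority equals pattern length. For the path
// "/folder/page" under "Allow: /folder" (priority 7) and
// "Disallow: /folder/page" (priority 12), disallow() returns true. Exact ties
// resolve to allow, because the comparison above is strictly greater-than.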

bool RobotsMatcher::disallow_ignore_global() const {
  if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
    return disallow_.specific.priority() > allow_.specific.priority();
  }
  return false;
}

const int RobotsMatcher::matching_line() const {
  if (ever_seen_specific_agent_) {
    return Match::HigherPriorityMatch(disallow_.specific, allow_.specific)
        .line();
  }
  return Match::HigherPriorityMatch(disallow_.global, allow_.global).line();
}

void RobotsMatcher::HandleRobotsStart() {
  // This is a new robots.txt file, so we need to reset all the instance member
  // variables. We do it in the same order the instance member variables are
  // declared, so it's easier to keep track of which ones we have (or maybe
  // haven't!) done.
  allow_.Clear();
  disallow_.Clear();

  seen_global_agent_ = false;
  seen_specific_agent_ = false;
  ever_seen_specific_agent_ = false;
  seen_separator_ = false;
}

/*static*/ absl::string_view RobotsMatcher::ExtractUserAgent(
    absl::string_view user_agent) {
  // Allowed characters in user-agent are [a-zA-Z_-].
  const char* end = user_agent.data();
  while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') {
    ++end;
  }
  return user_agent.substr(0, end - user_agent.data());
}
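
// Illustrative examples (editorial addition, not in the upstream file):
// extraction stops at the first character outside [a-zA-Z_-]:
//
//   ExtractUserAgent("Googlebot/2.1")  -> "Googlebot"
//   ExtractUserAgent("Mozilla (bot)")  -> "Mozilla"
//   ExtractUserAgent("007Bot")         -> ""  (digits never qualify)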

/*static*/ bool RobotsMatcher::IsValidUserAgentToObey(
    absl::string_view user_agent) {
  return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent;
}

void RobotsMatcher::HandleUserAgent(int line_num,
                                    absl::string_view user_agent) {
  if (seen_separator_) {
    seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
  }

  // Google-specific optimization: a '*' followed by space and more characters
  // in a user-agent record is still regarded a global rule.
  if (user_agent.length() >= 1 && user_agent[0] == '*' &&
      (user_agent.length() == 1 || isspace(user_agent[1]))) {
    seen_global_agent_ = true;
  } else {
    user_agent = ExtractUserAgent(user_agent);
    for (const auto& agent : *user_agents_) {
      if (absl::EqualsIgnoreCase(user_agent, agent)) {
        ever_seen_specific_agent_ = seen_specific_agent_ = true;
        break;
      }
    }
  }
}

void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) {
  if (!seen_any_agent()) return;
  seen_separator_ = true;
  const int priority = match_strategy_->MatchAllow(path_, value);
  if (priority >= 0) {
    if (seen_specific_agent_) {
      if (allow_.specific.priority() < priority) {
        allow_.specific.Set(priority, line_num);
      }
    } else {
      assert(seen_global_agent_);
      if (allow_.global.priority() < priority) {
        allow_.global.Set(priority, line_num);
      }
    }
  } else {
    // Google-specific optimization: 'index.htm' and 'index.html' are normalized
    // to '/'.
    const size_t slash_pos = value.find_last_of('/');

    if (slash_pos != absl::string_view::npos &&
        absl::StartsWith(absl::ClippedSubstr(value, slash_pos),
                         "/index.htm")) {
      const int len = slash_pos + 1;
      absl::FixedArray<char> newpattern(len + 1);
      strncpy(newpattern.data(), value.data(), len);
      newpattern[len] = '$';
      HandleAllow(line_num,
                  absl::string_view(newpattern.data(), newpattern.size()));
    }
  }
}
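
// Illustrative note (editorial addition, not in the upstream file): the
// recursive call above means an "Allow: /help/index.html" rule that fails to
// match the bare directory path "/help/" is retried as the synthesized
// pattern "/help/$", which does match it.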

void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) {
  if (!seen_any_agent()) return;
  seen_separator_ = true;
  const int priority = match_strategy_->MatchDisallow(path_, value);
  if (priority >= 0) {
    if (seen_specific_agent_) {
      if (disallow_.specific.priority() < priority) {
        disallow_.specific.Set(priority, line_num);
      }
    } else {
      assert(seen_global_agent_);
      if (disallow_.global.priority() < priority) {
        disallow_.global.Set(priority, line_num);
      }
    }
  }
}

int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path,
                                                absl::string_view pattern) {
  return Matches(path, pattern) ? pattern.length() : -1;
}

int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path,
                                                   absl::string_view pattern) {
  return Matches(path, pattern) ? pattern.length() : -1;
}

void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {
  seen_separator_ = true;
}

void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action,
                                        absl::string_view value) {
  seen_separator_ = true;
}

void ParsedRobotsKey::Parse(absl::string_view key) {
  key_text_ = absl::string_view();
  if (KeyIsUserAgent(key)) {
    type_ = USER_AGENT;
  } else if (KeyIsAllow(key)) {
    type_ = ALLOW;
  } else if (KeyIsDisallow(key)) {
    type_ = DISALLOW;
  } else if (KeyIsSitemap(key)) {
    type_ = SITEMAP;
  } else {
    type_ = UNKNOWN;
    key_text_ = key;
  }
}

absl::string_view ParsedRobotsKey::GetUnknownText() const {
  ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty());
  return key_text_;
}

bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key) {
  return (
      absl::StartsWithIgnoreCase(key, "user-agent") ||
      (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") ||
                               absl::StartsWithIgnoreCase(key, "user agent"))));
}

bool ParsedRobotsKey::KeyIsAllow(absl::string_view key) {
  return absl::StartsWithIgnoreCase(key, "allow");
}

bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key) {
  return (
      absl::StartsWithIgnoreCase(key, "disallow") ||
      (kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) ||
                               (absl::StartsWithIgnoreCase(key, "dissalow")) ||
                               (absl::StartsWithIgnoreCase(key, "disalow")) ||
                               (absl::StartsWithIgnoreCase(key, "diasllow")) ||
                               (absl::StartsWithIgnoreCase(key, "disallaw")))));
}

bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key) {
  return ((absl::StartsWithIgnoreCase(key, "sitemap")) ||
          (absl::StartsWithIgnoreCase(key, "site-map")));
}

}  // namespace googlebot
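
Usage sketch (editorial addition): a minimal C++ driver for the matcher above,
along the lines of the robots_main.cc bundled in this release. It assumes the
vendored robots.h plus a linked Abseil; the bot name and URL are illustrative.

    #include <iostream>
    #include <string>

    #include "robots.h"

    int main() {
      const std::string robots_txt =
          "User-agent: *\n"
          "Disallow: /private\n";
      googlebot::RobotsMatcher matcher;
      const bool allowed = matcher.OneAgentAllowedByRobots(
          robots_txt, "FooBot", "https://example.com/public/page.html");
      std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << "\n";  // ALLOWED
      return 0;
    }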