google_robotstxt_parser 0.0.3

// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: robots.cc
// -----------------------------------------------------------------------------
//
// Implements expired internet draft
//   http://www.robotstxt.org/norobots-rfc.txt
// with Google-specific optimizations detailed at
//   https://developers.google.com/search/reference/robots_txt

#include "robots.h"

#include <stdlib.h>

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstring>
#include <vector>

#include "absl/base/macros.h"
#include "absl/container/fixed_array.h"
#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"

// Allow for typos such as DISALOW in robots.txt.
static bool kAllowFrequentTypos = true;

namespace googlebot {

// A RobotsMatchStrategy defines a strategy for matching individual lines in a
// robots.txt file. Each Match* method should return a match priority, which is
// interpreted as:
//
// match priority < 0:
//    No match.
//
// match priority == 0:
//    Match, but treat it as if matched an empty pattern.
//
// match priority > 0:
//    Match.
class RobotsMatchStrategy {
 public:
  virtual ~RobotsMatchStrategy() {}

  virtual int MatchAllow(absl::string_view path,
                         absl::string_view pattern) = 0;
  virtual int MatchDisallow(absl::string_view path,
                            absl::string_view pattern) = 0;

 protected:
  // Implements robots.txt pattern matching.
  static bool Matches(absl::string_view path, absl::string_view pattern);
};

// Returns true if URI path matches the specified pattern. Pattern is anchored
// at the beginning of path. '$' is special only at the end of pattern.
//
// Since 'path' and 'pattern' are both externally determined (by the webmaster),
// we make sure to have acceptable worst-case performance.
/* static */ bool RobotsMatchStrategy::Matches(
    absl::string_view path, absl::string_view pattern) {
  const size_t pathlen = path.length();
  absl::FixedArray<size_t> pos(pathlen + 1);
  int numpos;

  // The pos[] array holds a sorted list of indexes of 'path', with length
  // 'numpos'. At the start and end of each iteration of the main loop below,
  // the pos[] array will hold a list of the prefixes of the 'path' which can
  // match the current prefix of 'pattern'. If this list is ever empty,
  // return false. If we reach the end of 'pattern' with at least one element
  // in pos[], return true.

  pos[0] = 0;
  numpos = 1;

  for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
    if (*pat == '$' && pat + 1 == pattern.end()) {
      return (pos[numpos - 1] == pathlen);
    }
    if (*pat == '*') {
      numpos = pathlen - pos[0] + 1;
      for (int i = 1; i < numpos; i++) {
        pos[i] = pos[i-1] + 1;
      }
    } else {
      // Includes '$' when not at end of pattern.
      int newnumpos = 0;
      for (int i = 0; i < numpos; i++) {
        if (pos[i] < pathlen && path[pos[i]] == *pat) {
          pos[newnumpos++] = pos[i] + 1;
        }
      }
      numpos = newnumpos;
      if (numpos == 0) return false;
    }
  }

  return true;
}
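
// As a worked example of the loop above: for path "/foo/bar", the pattern
// "/foo/*r$" matches (the '*' step widens pos[] and the final '$' requires
// pos[] to reach pathlen), while "/fo$" does not, because the path does not
// end right after "/fo". (Examples are illustrative, derived from the loop.)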

static const char* kHexDigits = "0123456789ABCDEF";

// GetPathParamsQuery is not in anonymous namespace to allow testing.
//
// Extracts path (with params) and query part from URL. Removes scheme,
// authority, and fragment. Result always starts with "/".
// Returns "/" if the url doesn't have a path or is not valid.
std::string GetPathParamsQuery(const std::string& url) {
  std::string path;

  // Initial two slashes are ignored.
  size_t search_start = 0;
  if (url.size() >= 2 && url[0] == '/' && url[1] == '/') search_start = 2;

  size_t early_path = url.find_first_of("/?;", search_start);
  size_t protocol_end = url.find("://", search_start);
  if (early_path < protocol_end) {
    // If path, param or query starts before ://, :// doesn't indicate protocol.
    protocol_end = std::string::npos;
  }
  if (protocol_end == std::string::npos) {
    protocol_end = search_start;
  } else {
    protocol_end += 3;
  }

  size_t path_start = url.find_first_of("/?;", protocol_end);
  if (path_start != std::string::npos) {
    size_t hash_pos = url.find('#', search_start);
    if (hash_pos < path_start) return "/";
    size_t path_end = (hash_pos == std::string::npos) ? url.size() : hash_pos;
    if (url[path_start] != '/') {
      // Prepend a slash if the result would start e.g. with '?'.
      return "/" + url.substr(path_start, path_end - path_start);
    }
    return url.substr(path_start, path_end - path_start);
  }

  return "/";
}
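
// For example, the function above should yield:
//   GetPathParamsQuery("https://example.com/search?q=x#frag") == "/search?q=x"
//   GetPathParamsQuery("example.com") == "/"
// (illustrative inputs; any host name behaves the same way).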

// MaybeEscapePattern is not in anonymous namespace to allow testing.
//
// Canonicalize the allowed/disallowed paths. For example:
//   /SanJoséSellers ==> /SanJos%C3%A9Sellers
//   %aa ==> %AA
// When the function returns, (*dst) either points to src, or is newly
// allocated.
// Returns true if dst was newly allocated.
bool MaybeEscapePattern(const char* src, char** dst) {
  int num_to_escape = 0;
  bool need_capitalize = false;

  // First, scan the buffer to see if changes are needed. Most don't.
  for (int i = 0; src[i] != 0; i++) {
    // (a) % escape sequence.
    if (src[i] == '%' &&
        absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
      if (absl::ascii_islower(src[i+1]) || absl::ascii_islower(src[i+2])) {
        need_capitalize = true;
      }
      i += 2;
    // (b) needs escaping.
    } else if (src[i] & 0x80) {
      num_to_escape++;
    }
    // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
  }
  // Return if no changes needed.
  if (!num_to_escape && !need_capitalize) {
    (*dst) = const_cast<char*>(src);
    return false;
  }
  (*dst) = new char[num_to_escape * 2 + strlen(src) + 1];
  int j = 0;
  for (int i = 0; src[i] != 0; i++) {
    // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
    if (src[i] == '%' &&
        absl::ascii_isxdigit(src[i+1]) && absl::ascii_isxdigit(src[i+2])) {
      (*dst)[j++] = src[i++];
      (*dst)[j++] = absl::ascii_toupper(src[i++]);
      (*dst)[j++] = absl::ascii_toupper(src[i]);
    // (b) %-escape octets whose highest bit is set. These are outside the
    // ASCII range.
    } else if (src[i] & 0x80) {
      (*dst)[j++] = '%';
      (*dst)[j++] = kHexDigits[(src[i] >> 4) & 0xf];
      (*dst)[j++] = kHexDigits[src[i] & 0xf];
    // (c) Normal character, no modification needed.
    } else {
      (*dst)[j++] = src[i];
    }
  }
  (*dst)[j] = 0;
  return true;
}

// Internal helper classes and functions.
namespace {

// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
// a key. This class can parse a text-representation (including common typos)
// and represent them as an enumeration which allows for faster processing
// afterwards.
// For unparsable keys, the original string representation is kept.
class ParsedRobotsKey {
 public:
  enum KeyType {
    // Generic highlevel fields.
    USER_AGENT,
    SITEMAP,

    // Fields within a user-agent.
    ALLOW,
    DISALLOW,

    // Unrecognized field; kept as-is. High number so that additions to the
    // enumeration above does not change the serialization.
    UNKNOWN = 128
  };

  ParsedRobotsKey() : type_(UNKNOWN) {}

  // Disallow copying and assignment.
  ParsedRobotsKey(const ParsedRobotsKey&) = delete;
  ParsedRobotsKey& operator=(const ParsedRobotsKey&) = delete;

  // Parse given key text. Does not copy the text, so the text_key must stay
  // valid for the object's life-time or the next Parse() call.
  void Parse(absl::string_view key);

  // Returns the type of key.
  KeyType type() const { return type_; }

  // If this is an unknown key, get the text.
  absl::string_view GetUnknownText() const;

 private:
  static bool KeyIsUserAgent(absl::string_view key);
  static bool KeyIsAllow(absl::string_view key);
  static bool KeyIsDisallow(absl::string_view key);
  static bool KeyIsSitemap(absl::string_view key);

  KeyType type_;
  absl::string_view key_text_;
};

void EmitKeyValueToHandler(int line, const ParsedRobotsKey& key,
                           absl::string_view value,
                           RobotsParseHandler* handler) {
  typedef ParsedRobotsKey Key;
  switch (key.type()) {
    case Key::USER_AGENT: handler->HandleUserAgent(line, value); break;
    case Key::ALLOW: handler->HandleAllow(line, value); break;
    case Key::DISALLOW: handler->HandleDisallow(line, value); break;
    case Key::SITEMAP: handler->HandleSitemap(line, value); break;
    case Key::UNKNOWN:
      handler->HandleUnknownAction(line, key.GetUnknownText(), value);
      break;
    // No default case Key:: to have the compiler warn about new values.
  }
}

class RobotsTxtParser {
 public:
  typedef ParsedRobotsKey Key;

  RobotsTxtParser(absl::string_view robots_body,
                  RobotsParseHandler* handler)
      : robots_body_(robots_body), handler_(handler) {
  }

  void Parse();

 private:
  static bool GetKeyAndValueFrom(char** key, char** value, char* line);
  static void StripWhitespaceSlowly(char** s);

  void ParseAndEmitLine(int current_line, char* line);
  bool NeedEscapeValueForKey(const Key& key);

  absl::string_view robots_body_;
  RobotsParseHandler* const handler_;
};

bool RobotsTxtParser::NeedEscapeValueForKey(const Key& key) {
  switch (key.type()) {
    case RobotsTxtParser::Key::USER_AGENT:
    case RobotsTxtParser::Key::SITEMAP:
      return false;
    default:
      return true;
  }
}

// Removes leading and trailing whitespace from null-terminated string s.
/* static */ void RobotsTxtParser::StripWhitespaceSlowly(char** s) {
  absl::string_view stripped = absl::StripAsciiWhitespace(*s);
  *s = const_cast<char*>(stripped.data());
  (*s)[stripped.size()] = '\0';
}

bool RobotsTxtParser::GetKeyAndValueFrom(char** key, char** value,
                                         char* line) {
  // Remove comments from the current robots.txt line.
  char* const comment = strchr(line, '#');
  if (nullptr != comment) {
    *comment = '\0';
  }
  StripWhitespaceSlowly(&line);

  // Rules must match the following pattern:
  //   <key>[ \t]*:[ \t]*<value>
  char* sep = strchr(line, ':');
  if (nullptr == sep) {
    // Google-specific optimization: some people forget the colon, so we need to
    // accept whitespace in its stead.
    static const char* const kWhite = " \t";
    sep = strpbrk(line, kWhite);
    if (nullptr != sep) {
      const char* const val = sep + strspn(sep, kWhite);
      assert(*val);  // since we dropped trailing whitespace above.
      if (nullptr != strpbrk(val, kWhite)) {
        // We only accept whitespace as a separator if there are exactly two
        // sequences of non-whitespace characters. If we get here, there were
        // more than 2 such sequences since we stripped trailing whitespace
        // above.
        return false;
      }
    }
  }
  if (nullptr == sep) {
    return false;  // Couldn't find a separator.
  }

  *key = line;                 // Key starts at beginning of line.
  *sep = '\0';                 // And stops at the separator.
  StripWhitespaceSlowly(key);  // Get rid of any trailing whitespace.

  if (strlen(*key) > 0) {
    *value = 1 + sep;              // Value starts after the separator.
    StripWhitespaceSlowly(value);  // Get rid of any leading whitespace.
    return true;
  }
  return false;
}
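
// For instance, the line "Disallow: /private/ # staging" should parse to
// key "Disallow" and value "/private/", and the colon-less line
// "User-agent FooBot" is still accepted through the whitespace fallback
// above ("FooBot" is only a placeholder agent name).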

void RobotsTxtParser::ParseAndEmitLine(int current_line, char* line) {
  char* string_key;
  char* value;
  if (!GetKeyAndValueFrom(&string_key, &value, line)) {
    return;
  }

  Key key;
  key.Parse(string_key);
  if (NeedEscapeValueForKey(key)) {
    char* escaped_value = nullptr;
    const bool is_escaped = MaybeEscapePattern(value, &escaped_value);
    EmitKeyValueToHandler(current_line, key, escaped_value, handler_);
    if (is_escaped) delete[] escaped_value;
  } else {
    EmitKeyValueToHandler(current_line, key, value, handler_);
  }
}

void RobotsTxtParser::Parse() {
  // UTF-8 byte order marks.
  static const unsigned char utf_bom[3] = {0xEF, 0xBB, 0xBF};

  // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
  // fairly safe to assume any valid line isn't going to be more than many times
  // that max url length of 2KB. We want some padding for
  // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
  // If so, we can ignore the chars on a line past that.
  const int kMaxLineLen = 2083 * 8;
  // Allocate a buffer used to process the current line.
  char* const line_buffer = new char[kMaxLineLen];
  // line_buffer_end is the last writable position within the line buffer
  // (only a final '\0' may go here).
  const char* const line_buffer_end = line_buffer + kMaxLineLen - 1;
  char* line_pos = line_buffer;
  int line_num = 0;
  size_t bom_pos = 0;
  bool last_was_carriage_return = false;
  handler_->HandleRobotsStart();

  {
    for (const unsigned char ch : robots_body_) {
      ABSL_ASSERT(line_pos <= line_buffer_end);
      // Google-specific optimization: UTF-8 byte order marks should never
      // appear in a robots.txt file, but they do nevertheless. Skipping
      // possible BOM-prefix in the first bytes of the input.
      if (bom_pos < sizeof(utf_bom) && ch == utf_bom[bom_pos++]) {
        continue;
      }
      bom_pos = sizeof(utf_bom);
      if (ch != 0x0A && ch != 0x0D) {  // Non-line-ending char case.
        // Put in next spot on current line, as long as there's room.
        if (line_pos < line_buffer_end) {
          *(line_pos++) = ch;
        }
      } else {                         // Line-ending char case.
        *line_pos = '\0';
        // Only emit an empty line if this was not due to the second character
        // of the DOS line-ending \r\n .
        const bool is_CRLF_continuation =
            (line_pos == line_buffer) && last_was_carriage_return && ch == 0x0A;
        if (!is_CRLF_continuation) {
          ParseAndEmitLine(++line_num, line_buffer);
        }
        line_pos = line_buffer;
        last_was_carriage_return = (ch == 0x0D);
      }
    }
  }
  *line_pos = '\0';
  ParseAndEmitLine(++line_num, line_buffer);
  handler_->HandleRobotsEnd();
  delete [] line_buffer;
}
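
// Note that the final buffered line is flushed unconditionally once the input
// is exhausted; if it is empty, GetKeyAndValueFrom() rejects it and nothing is
// emitted for it.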

// Implements the default robots.txt matching strategy. The maximum number of
// characters matched by a pattern is returned as its match priority.
class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
 public:
  LongestMatchRobotsMatchStrategy() { }

  // Disallow copying and assignment.
  LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
      delete;
  LongestMatchRobotsMatchStrategy& operator=(
      const LongestMatchRobotsMatchStrategy&) = delete;

  int MatchAllow(absl::string_view path, absl::string_view pattern) override;
  int MatchDisallow(absl::string_view path, absl::string_view pattern) override;
};
}  // end anonymous namespace

void ParseRobotsTxt(absl::string_view robots_body,
                    RobotsParseHandler* parse_callback) {
  RobotsTxtParser parser(robots_body, parse_callback);
  parser.Parse();
}

RobotsMatcher::RobotsMatcher()
    : seen_global_agent_(false),
      seen_specific_agent_(false),
      ever_seen_specific_agent_(false),
      seen_separator_(false),
      path_(nullptr),
      user_agents_(nullptr) {
  match_strategy_ = new LongestMatchRobotsMatchStrategy();
}

RobotsMatcher::~RobotsMatcher() {
  delete match_strategy_;
}

bool RobotsMatcher::ever_seen_specific_agent() const {
  return ever_seen_specific_agent_;
}

void RobotsMatcher::InitUserAgentsAndPath(
    const std::vector<std::string>* user_agents, const char* path) {
  // The RobotsParser object doesn't own path_ or user_agents_, so overwriting
  // these pointers doesn't cause a memory leak.
  path_ = path;
  ABSL_ASSERT('/' == *path_);
  user_agents_ = user_agents;
}

bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
                                    const std::vector<std::string>* user_agents,
                                    const std::string& url) {
  // The url is not normalized (escaped, percent encoded) here because the user
  // is asked to provide it in escaped form already.
  std::string path = GetPathParamsQuery(url);
  InitUserAgentsAndPath(user_agents, path.c_str());
  ParseRobotsTxt(robots_body, this);
  return !disallow();
}

bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
                                            const std::string& user_agent,
                                            const std::string& url) {
  std::vector<std::string> v;
  v.push_back(user_agent);
  return AllowedByRobots(robots_txt, &v, url);
}
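
// A minimal usage sketch (illustrative only; the agent name and URL below are
// made up, and RobotsMatcher is assumed to be declared in robots.h as it is
// used in this file):
//
//   googlebot::RobotsMatcher matcher;
//   bool allowed = matcher.OneAgentAllowedByRobots(
//       "user-agent: FooBot\ndisallow: /private/\n", "FooBot",
//       "https://example.com/private/page.html");  // expected: false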

bool RobotsMatcher::disallow() const {
  if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
    return (disallow_.specific.priority() > allow_.specific.priority());
  }

  if (ever_seen_specific_agent_) {
    // Matching group for user-agent but either without disallow or empty one,
    // i.e. priority == 0.
    return false;
  }

  if (disallow_.global.priority() > 0 || allow_.global.priority() > 0) {
    return disallow_.global.priority() > allow_.global.priority();
  }
  return false;
}

bool RobotsMatcher::disallow_ignore_global() const {
  if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0) {
    return disallow_.specific.priority() > allow_.specific.priority();
  }
  return false;
}

const int RobotsMatcher::matching_line() const {
  if (ever_seen_specific_agent_) {
    return Match::HigherPriorityMatch(disallow_.specific, allow_.specific)
        .line();
  }
  return Match::HigherPriorityMatch(disallow_.global, allow_.global).line();
}

void RobotsMatcher::HandleRobotsStart() {
  // This is a new robots.txt file, so we need to reset all the instance member
  // variables. We do it in the same order the instance member variables are
  // declared, so it's easier to keep track of which ones we have (or maybe
  // haven't!) done.
  allow_.Clear();
  disallow_.Clear();

  seen_global_agent_ = false;
  seen_specific_agent_ = false;
  ever_seen_specific_agent_ = false;
  seen_separator_ = false;
}

/*static*/ absl::string_view RobotsMatcher::ExtractUserAgent(
    absl::string_view user_agent) {
  // Allowed characters in user-agent are [a-zA-Z_-].
  const char* end = user_agent.data();
  while (absl::ascii_isalpha(*end) || *end == '-' || *end == '_') {
    ++end;
  }
  return user_agent.substr(0, end - user_agent.data());
}

/*static*/ bool RobotsMatcher::IsValidUserAgentToObey(
    absl::string_view user_agent) {
  return user_agent.length() > 0 && ExtractUserAgent(user_agent) == user_agent;
}
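
// For example, ExtractUserAgent("Googlebot/2.1") should return "Googlebot"
// (it stops at the '/'), so IsValidUserAgentToObey("Googlebot/2.1") is false
// while IsValidUserAgentToObey("Googlebot") is true. (Illustrative values
// derived from the character set above.)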

void RobotsMatcher::HandleUserAgent(int line_num,
                                    absl::string_view user_agent) {
  if (seen_separator_) {
    seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
  }

  // Google-specific optimization: a '*' followed by space and more characters
  // in a user-agent record is still regarded a global rule.
  if (user_agent.length() >= 1 && user_agent[0] == '*' &&
      (user_agent.length() == 1 || isspace(user_agent[1]))) {
    seen_global_agent_ = true;
  } else {
    user_agent = ExtractUserAgent(user_agent);
    for (const auto& agent : *user_agents_) {
      if (absl::EqualsIgnoreCase(user_agent, agent)) {
        ever_seen_specific_agent_ = seen_specific_agent_ = true;
        break;
      }
    }
  }
}

void RobotsMatcher::HandleAllow(int line_num, absl::string_view value) {
  if (!seen_any_agent()) return;
  seen_separator_ = true;
  const int priority = match_strategy_->MatchAllow(path_, value);
  if (priority >= 0) {
    if (seen_specific_agent_) {
      if (allow_.specific.priority() < priority) {
        allow_.specific.Set(priority, line_num);
      }
    } else {
      assert(seen_global_agent_);
      if (allow_.global.priority() < priority) {
        allow_.global.Set(priority, line_num);
      }
    }
  } else {
    // Google-specific optimization: 'index.htm' and 'index.html' are normalized
    // to '/'.
    const size_t slash_pos = value.find_last_of('/');

    if (slash_pos != absl::string_view::npos &&
        absl::StartsWith(absl::ClippedSubstr(value, slash_pos),
                         "/index.htm")) {
      const int len = slash_pos + 1;
      absl::FixedArray<char> newpattern(len + 1);
      strncpy(newpattern.data(), value.data(), len);
      newpattern[len] = '$';
      HandleAllow(line_num,
                  absl::string_view(newpattern.data(), newpattern.size()));
    }
  }
}
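
// Concretely, the normalization above means that a rule such as
// "Allow: /path/index.html" that did not match the current path is retried
// as the pattern "/path/$", so the bare directory URL "/path/" is also
// allowed ("/path/" is only an illustrative value).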

void RobotsMatcher::HandleDisallow(int line_num, absl::string_view value) {
  if (!seen_any_agent()) return;
  seen_separator_ = true;
  const int priority = match_strategy_->MatchDisallow(path_, value);
  if (priority >= 0) {
    if (seen_specific_agent_) {
      if (disallow_.specific.priority() < priority) {
        disallow_.specific.Set(priority, line_num);
      }
    } else {
      assert(seen_global_agent_);
      if (disallow_.global.priority() < priority) {
        disallow_.global.Set(priority, line_num);
      }
    }
  }
}

int LongestMatchRobotsMatchStrategy::MatchAllow(absl::string_view path,
                                                absl::string_view pattern) {
  return Matches(path, pattern) ? pattern.length() : -1;
}

int LongestMatchRobotsMatchStrategy::MatchDisallow(absl::string_view path,
                                                   absl::string_view pattern) {
  return Matches(path, pattern) ? pattern.length() : -1;
}
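
// Under this strategy the longer matching pattern wins: for path
// "/x/page.html", "allow: /x/page" (priority 7) overrides "disallow: /x/"
// (priority 3), so the path stays crawlable (paths shown are illustrative).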

void RobotsMatcher::HandleSitemap(int line_num, absl::string_view value) {
  seen_separator_ = true;
}

void RobotsMatcher::HandleUnknownAction(int line_num, absl::string_view action,
                                        absl::string_view value) {
  seen_separator_ = true;
}

void ParsedRobotsKey::Parse(absl::string_view key) {
  key_text_ = absl::string_view();
  if (KeyIsUserAgent(key)) {
    type_ = USER_AGENT;
  } else if (KeyIsAllow(key)) {
    type_ = ALLOW;
  } else if (KeyIsDisallow(key)) {
    type_ = DISALLOW;
  } else if (KeyIsSitemap(key)) {
    type_ = SITEMAP;
  } else {
    type_ = UNKNOWN;
    key_text_ = key;
  }
}

absl::string_view ParsedRobotsKey::GetUnknownText() const {
  ABSL_ASSERT(type_ == UNKNOWN && !key_text_.empty());
  return key_text_;
}

bool ParsedRobotsKey::KeyIsUserAgent(absl::string_view key) {
  return (
      absl::StartsWithIgnoreCase(key, "user-agent") ||
      (kAllowFrequentTypos && (absl::StartsWithIgnoreCase(key, "useragent") ||
                               absl::StartsWithIgnoreCase(key, "user agent"))));
}

bool ParsedRobotsKey::KeyIsAllow(absl::string_view key) {
  return absl::StartsWithIgnoreCase(key, "allow");
}

bool ParsedRobotsKey::KeyIsDisallow(absl::string_view key) {
  return (
      absl::StartsWithIgnoreCase(key, "disallow") ||
      (kAllowFrequentTypos && ((absl::StartsWithIgnoreCase(key, "dissallow")) ||
                               (absl::StartsWithIgnoreCase(key, "dissalow")) ||
                               (absl::StartsWithIgnoreCase(key, "disalow")) ||
                               (absl::StartsWithIgnoreCase(key, "diasllow")) ||
                               (absl::StartsWithIgnoreCase(key, "disallaw")))));
}

bool ParsedRobotsKey::KeyIsSitemap(absl::string_view key) {
  return ((absl::StartsWithIgnoreCase(key, "sitemap")) ||
          (absl::StartsWithIgnoreCase(key, "site-map")));
}

}  // namespace googlebot