google_robotstxt_parser 0.0.3

// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep
#include "robots.h"

#include <string>

#include "gtest/gtest.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"

namespace {

using ::googlebot::RobotsMatcher;

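// Convenience wrapper around the matcher under test: reports whether
// `useragent` may fetch `url` under the rules in `robotstxt`.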
bool IsUserAgentAllowed(const absl::string_view robotstxt,
                        const std::string& useragent, const std::string& url) {
  RobotsMatcher matcher;
  return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
}

// Google-specific: system test.
TEST(RobotsUnittest, GoogleOnly_SystemTest) {
  const absl::string_view robotstxt =
      "user-agent: FooBot\n"
      "disallow: /\n";
  // Empty robots.txt: everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", ""));

  // Empty user-agent to be matched: everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", ""));

  // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
  // in robots.cc.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", ""));

  // All params empty: same as robots.txt empty, everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed("", "", ""));
}
// Rules are colon separated name-value pairs. The following names are
// provisioned:
//  user-agent: <value>
//  allow: <value>
//  disallow: <value>
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
//
// Google specific: webmasters sometimes miss the colon separator, but it's
// obvious what they mean by "disallow /", so we assume the colon if it's
// missing.
TEST(RobotsUnittest, ID_LineSyntax_Line) {
  const absl::string_view robotstxt_correct =
      "user-agent: FooBot\n"
      "disallow: /\n";
  const absl::string_view robotstxt_incorrect =
      "foo: FooBot\n"
      "bar: /\n";
  const absl::string_view robotstxt_incorrect_accepted =
      "user-agent FooBot\n"
      "disallow /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
}

// A group is one or more user-agent lines followed by rules, and terminated
// by another user-agent line. Rules for the same user-agent are combined
// opaquely into one group. Rules outside groups are ignored.
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_LineSyntax_Groups) {
  const absl::string_view robotstxt =
      "allow: /foo/bar/\n"
      "\n"
      "user-agent: FooBot\n"
      "disallow: /\n"
      "allow: /x/\n"
      "user-agent: BarBot\n"
      "disallow: /\n"
      "allow: /y/\n"
      "\n"
      "\n"
      "allow: /w/\n"
      "user-agent: BazBot\n"
      "\n"
      "user-agent: FooBot\n"
      "allow: /z/\n"
      "disallow: /\n";

  const std::string url_w = "http://foo.bar/w/a";
  const std::string url_x = "http://foo.bar/x/b";
  const std::string url_y = "http://foo.bar/y/c";
  const std::string url_z = "http://foo.bar/z/d";
  const std::string url_foo = "http://foo.bar/foo/bar/";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z));

  // Lines with rules outside groups are ignored.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
  const absl::string_view robotstxt_upper =
      "USER-AGENT: FooBot\n"
      "ALLOW: /x/\n"
      "DISALLOW: /\n";
  const absl::string_view robotstxt_lower =
      "user-agent: FooBot\n"
      "allow: /x/\n"
      "disallow: /\n";
  const absl::string_view robotstxt_camel =
      "uSeR-aGeNt: FooBot\n"
      "AlLoW: /x/\n"
      "dIsAlLoW: /\n";
  const std::string url_allowed = "http://foo.bar/x/y";
  const std::string url_disallowed = "http://foo.bar/a/b";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
}

// A user-agent line is expected to contain only [a-zA-Z_-] characters and must
// not be empty. See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) {
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot"));
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar"));
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view()));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(""));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*"));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot "));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
}

// User-agent line values are case insensitive. See REP I-D section "The
// user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) {
  const absl::string_view robotstxt_upper =
      "User-Agent: FOO BAR\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const absl::string_view robotstxt_lower =
      "User-Agent: foo bar\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const absl::string_view robotstxt_camel =
      "User-Agent: FoO bAr\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const std::string url_allowed = "http://foo.bar/x/y";
  const std::string url_disallowed = "http://foo.bar/a/b";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
}

// Google specific: accept user-agent value up to the first space. Spaces are
// not allowed in user-agent values, but that doesn't stop webmasters from
// using them. This is more restrictive than the I-D, since in case of the bad
// value "Googlebot Images" we'd still obey the rules with "Googlebot".
// Extends REP I-D section "The user-agent line"
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) {
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
  const absl::string_view robotstxt =
      "User-Agent: *\n"
      "Disallow: /\n"
      "User-Agent: Foo Bar\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url));
}

// If no group matches the user-agent, crawlers must obey the first group with a
// user-agent line with a "*" value, if present. If no group satisfies either
// condition, or no groups are present at all, no rules apply.
// See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_GlobalGroups_Secondary) {
  const absl::string_view robotstxt_empty = "";
  const absl::string_view robotstxt_global =
      "user-agent: *\n"
      "allow: /\n"
      "user-agent: FooBot\n"
      "disallow: /\n";
  const absl::string_view robotstxt_only_specific =
      "user-agent: FooBot\n"
      "allow: /\n"
      "user-agent: BarBot\n"
      "disallow: /\n"
      "user-agent: BazBot\n"
      "disallow: /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
}

// Matching rules against URIs is case sensitive.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) {
  const absl::string_view robotstxt_lowercase_url =
      "user-agent: FooBot\n"
      "disallow: /x/\n";
  const absl::string_view robotstxt_uppercase_url =
      "user-agent: FooBot\n"
      "disallow: /X/\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
}

// The most specific match found MUST be used. The most specific match is the
// match that has the most octets. In case of multiple rules with the same
// length, the least strict rule must be used.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
TEST(RobotsUnittest, ID_LongestMatch) {
  const std::string url = "http://foo.bar/x/page.html";
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x/page.html\n"
        "allow: /x/\n";

    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /x/page.html\n"
        "disallow: /x/\n";

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: \n"
        "allow: \n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /\n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string url_a = "http://foo.bar/x";
    std::string url_b = "http://foo.bar/x/";
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x\n"
        "allow: /x/\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b));
  }

  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x/page.html\n"
        "allow: /x/page.html\n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /page\n"
        "disallow: /*.html\n";
    // Longest match wins.
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /x/page.\n"
        "disallow: /*.html\n";
    // Longest match wins.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: *\n"
        "Disallow: /x/\n"
        "User-agent: FooBot\n"
        "Disallow: /y/\n";
    // Most specific group for FooBot allows implicitly /x/page.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
  }
}

// Octets in the URI and robots.txt paths outside the range of the US-ASCII
// coded character set, and those in the reserved range defined by RFC3986,
// MUST be percent-encoded as defined by RFC3986 prior to comparison.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
//
// NOTE: It's up to the caller to percent encode a URL before passing it to the
// parser. Percent encoding URIs in the rules is unnecessary.
TEST(RobotsUnittest, ID_Encoding) {
  // /foo/bar?baz=http://foo.bar stays unencoded.
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
    EXPECT_TRUE(IsUserAgentAllowed(
        robotstxt, "FooBot",
        "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
  }

  // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/ツ\n";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%E3%83%84"));
    // The parser encodes the 3-byte character, but the URL is not %-encoded.
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
  }
  // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/%E3%83%84\n";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%E3%83%84"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
  }
  // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
  // This is illegal according to RFC3986 and while it may work here due to
  // simple string matching, it should not be relied on.
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/%62%61%7A\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%62%61%7A"));
  }
}

// The REP I-D defines the following characters that have special meaning in
// robots.txt:
// # - inline comment.
// $ - end of pattern.
// * - any number of characters.
// See REP I-D section "Special Characters".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.3
TEST(RobotsUnittest, ID_SpecialCharacters) {
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /foo/bar/quz\n"
        "Allow: /foo/*/qux\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /foo/bar$\n"
        "Allow: /foo/bar/qux\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "# Disallow: /\n"
        "Disallow: /foo/quz#qux\n"
        "Allow: /\n";
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
  }
}

// Google-specific: "index.html" (and only that) at the end of a pattern is
// equivalent to "/".
TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) {
  const absl::string_view robotstxt =
      "User-Agent: *\n"
      "Allow: /allowed-slash/index.html\n"
      "Disallow: /\n";
  // If index.html is allowed, we interpret this as / being allowed too.
  EXPECT_TRUE(
      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
  // Does not exactly match.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot",
                                  "http://foo.com/allowed-slash/index.htm"));
  // Exact match.
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot",
                                 "http://foo.com/allowed-slash/index.html"));
  EXPECT_FALSE(
      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
}

// Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
// RobotsTxtParser::Parse().
TEST(RobotsUnittest, GoogleOnly_LineTooLong) {
  size_t kEOLLen = std::string("\n").length();
  int kMaxLineLen = 2083 * 8;
  std::string allow = "allow: ";
  std::string disallow = "disallow: ";

  // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
  {
    std::string robotstxt = "user-agent: FooBot\n";
    std::string longline = "/x/";
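    // Grow the pattern until "disallow: " + pattern + newline reaches
    // kMaxLineLen, so the trailing "/qux" falls past the cutoff and is
    // dropped by the parser.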
    size_t max_length =
        kMaxLineLen - longline.length() - disallow.length() + kEOLLen;
    while (longline.size() < max_length) {
      absl::StrAppend(&longline, "a");
    }
    absl::StrAppend(&robotstxt, disallow, longline, "/qux\n");

    // Matches nothing, so URL is allowed.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
    // Matches cut off disallow rule.
    EXPECT_FALSE(IsUserAgentAllowed(
        robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux")));
  }

  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n";
    std::string longline_a = "/x/";
    std::string longline_b = "/x/";
    size_t max_length =
        kMaxLineLen - longline_a.length() - allow.length() + kEOLLen;
    while (longline_a.size() < max_length) {
      absl::StrAppend(&longline_a, "a");
      absl::StrAppend(&longline_b, "b");
    }
    absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n");
    absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n");

    // URL matches the disallow rule.
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
    // Matches the allow rule exactly.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           absl::StrCat("http://foo.bar", longline_a, "/qux")));
    // Matches cut off allow rule.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           absl::StrCat("http://foo.bar", longline_b, "/fux")));
  }
}

TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) {
  // Test documentation from
  // https://developers.google.com/search/reference/robots_txt
  // Section "URL matching based on path values".
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fishheads/yummy.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish.html?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
  }
  // "/fish*" equals "/fish"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish*\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fishheads/yummy.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish.html?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
  }
  // "/fish/" does not equal "/fish"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish/\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/Fish/Salmon.html"));
  }
  // "/*.php"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /*.php\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/folder/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(
        robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar//folder/any.php.file.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/filename.php/"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/index?f=filename.php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/index?php"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
  }
  // "/*.php$"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /*.php$\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/folder/filename.php"));

    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php?parameters"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php5"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename?php"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/aaaphpaaa"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
  }
  // "/fish*.php"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish*.php\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           "http://foo.bar/fishheads/catfish.php?parameters"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
  }
  // Section "Order of precedence for group-member records".
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /p\n"
        "disallow: /\n";
    std::string url = "http://example.com/page";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /folder\n"
        "disallow: /folder\n";
    std::string url = "http://example.com/folder/page";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /page\n"
        "disallow: /*.htm\n";
    std::string url = "http://example.com/page.htm";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /$\n"
        "disallow: /\n";
    std::string url = "http://example.com/";
    std::string url_page = "http://example.com/page.html";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page));
  }
}

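// Records what the parser reports through the RobotsParseHandler callbacks:
// counts of valid and unknown directives, the last line number seen, and any
// sitemap value, so tests can assert on them.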
class RobotsStatsReporter : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {
    last_line_seen_ = 0;
    valid_directives_ = 0;
    unknown_directives_ = 0;
    sitemap_.clear();
  }
  void HandleRobotsEnd() override {}

  void HandleUserAgent(int line_num, absl::string_view value) override {
    Digest(line_num);
  }
  void HandleAllow(int line_num, absl::string_view value) override {
    Digest(line_num);
  }
  void HandleDisallow(int line_num, absl::string_view value) override {
    Digest(line_num);
  }

  void HandleSitemap(int line_num, absl::string_view value) override {
    Digest(line_num);
    sitemap_.append(value.data(), value.length());
  }

  // Any other unrecognized name/value pairs.
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {
    last_line_seen_ = line_num;
    unknown_directives_++;
  }

  int last_line_seen() const { return last_line_seen_; }

  // Number of valid (recognized) directives found.
  int valid_directives() const { return valid_directives_; }

  // Number of unknown directives.
  int unknown_directives() const { return unknown_directives_; }

  // Parsed sitemap line.
  std::string sitemap() const { return sitemap_; }

 private:
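  // Records a recognized directive. Line numbers reported by the parser are
  // expected to be non-decreasing, which the ASSERT_GE below verifies.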
  void Digest(int line_num) {
    ASSERT_GE(line_num, last_line_seen_);
    last_line_seen_ = line_num;
    valid_directives_++;
  }

  int last_line_seen_ = 0;
  int valid_directives_ = 0;
  int unknown_directives_ = 0;
  std::string sitemap_;
};

// Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) {
  RobotsStatsReporter report;
  static const char kUnixFile[] =
      "User-Agent: foo\n"
      "Allow: /some/path\n"
      "User-Agent: bar\n"
      "\n"
      "\n"
      "Disallow: /\n";
  googlebot::ParseRobotsTxt(kUnixFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kDosFile[] =
      "User-Agent: foo\r\n"
      "Allow: /some/path\r\n"
      "User-Agent: bar\r\n"
      "\r\n"
      "\r\n"
      "Disallow: /\r\n";
  googlebot::ParseRobotsTxt(kDosFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kMacFile[] =
      "User-Agent: foo\r"
      "Allow: /some/path\r"
      "User-Agent: bar\r"
      "\r"
      "\r"
      "Disallow: /\r";
  googlebot::ParseRobotsTxt(kMacFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

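  // A final line without a trailing newline still counts as a line.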
  static const char kNoFinalNewline[] =
      "User-Agent: foo\n"
      "Allow: /some/path\n"
      "User-Agent: bar\n"
      "\n"
      "\n"
      "Disallow: /";
  googlebot::ParseRobotsTxt(kNoFinalNewline, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kMixedFile[] =
      "User-Agent: foo\n"
      "Allow: /some/path\r\n"
      "User-Agent: bar\n"
      "\r\n"
      "\n"
      "Disallow: /";
  googlebot::ParseRobotsTxt(kMixedFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());
}

// A BOM is unparseable and thus skipped; the rules on the lines that follow
// are still used.
TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) {
  RobotsStatsReporter report;
  static const char kUtf8FileFullBOM[] =
      "\xEF\xBB\xBF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  // We also allow partial byte order marks.
  static const char kUtf8FilePartial2BOM[] =
      "\xEF\xBB"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  static const char kUtf8FilePartial1BOM[] =
      "\xEF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  // If the BOM is not the right sequence, the first line looks like garbage
  // that is skipped (we essentially see "\x11\xBFUser-Agent").
  static const char kUtf8FileBrokenBOM[] =
      "\xEF\x11\xBF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report);
  EXPECT_EQ(1, report.valid_directives());
  EXPECT_EQ(1, report.unknown_directives());  // We get one broken line.

  // Some other messed-up file: BOMs are only valid at the beginning of the file.
  static const char kUtf8BOMSomewhereInMiddleOfFile[] =
      "User-Agent: foo\n"
      "\xEF\xBB\xBF"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report);
  EXPECT_EQ(1, report.valid_directives());
  EXPECT_EQ(1, report.unknown_directives());
}

// Google specific: the I-D allows any line that crawlers might need, such as
// sitemaps, which Google supports.
// See REP I-D section "Other records".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) {
  RobotsStatsReporter report;
  {
    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
    std::string robotstxt =
        "User-Agent: foo\n"
        "Allow: /some/path\n"
        "User-Agent: bar\n"
        "\n"
        "\n";
    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n");

    googlebot::ParseRobotsTxt(robotstxt, &report);
    EXPECT_EQ(sitemap_loc, report.sitemap());
  }
  // A sitemap line may appear anywhere in the file.
  {
    std::string robotstxt;
    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
    std::string robotstxt_temp =
        "User-Agent: foo\n"
        "Allow: /some/path\n"
        "User-Agent: bar\n"
        "\n"
        "\n";
    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp);

    googlebot::ParseRobotsTxt(robotstxt, &report);
    EXPECT_EQ(sitemap_loc, report.sitemap());
  }
}

}  // namespace

// Integrity tests. These functions are available to the linker, but not in the
// header, because they should only be used for testing.
namespace googlebot {
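// Returns the path, params, and query portion of `url` as used for rule
// matching (exercised by TestGetPathParamsQuery below).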
std::string GetPathParamsQuery(const std::string& url);
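// Percent-escapes a pattern where needed; returns true when a newly allocated,
// escaped copy was written to *dst (exercised by TestMaybeEscapePattern below).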
bool MaybeEscapePattern(const char* src, char** dst);
}  // namespace googlebot

void TestPath(const std::string& url, const std::string& expected_path) {
  EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url));
}

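// Runs MaybeEscapePattern on `url` and compares the result with `expected`;
// the output buffer is freed only when a new buffer was actually allocated.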
void TestEscape(const std::string& url, const std::string& expected) {
  char* escaped_value = nullptr;
  const bool is_escaped =
      googlebot::MaybeEscapePattern(url.c_str(), &escaped_value);
  const std::string escaped = escaped_value;
  if (is_escaped) delete[] escaped_value;

  EXPECT_EQ(expected, escaped);
}

TEST(RobotsUnittest, TestGetPathParamsQuery) {
  // Only testing URLs that are already correctly escaped here.
  TestPath("", "/");
  TestPath("http://www.example.com", "/");
  TestPath("http://www.example.com/", "/");
  TestPath("http://www.example.com/a", "/a");
  TestPath("http://www.example.com/a/", "/a/");
  TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
  TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
  TestPath("example.com", "/");
  TestPath("example.com/", "/");
  TestPath("example.com/a", "/a");
  TestPath("example.com/a/", "/a/");
  TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
  TestPath("a", "/");
  TestPath("a/", "/");
  TestPath("/a", "/a");
  TestPath("a/b", "/b");
  TestPath("example.com?a", "/?a");
  TestPath("example.com/a;b#c", "/a;b");
  TestPath("//a/b/c", "/b/c");
}

TEST(RobotsUnittest, TestMaybeEscapePattern) {
  TestEscape("http://www.example.com", "http://www.example.com");
  TestEscape("/a/b/c", "/a/b/c");
  TestEscape("á", "%C3%A1");
  TestEscape("%aa", "%AA");
}