google_robotstxt_parser 0.0.3
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.gitmodules +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +6 -0
- data/Guardfile +16 -0
- data/LICENSE +22 -0
- data/README.md +57 -0
- data/Rakefile +6 -0
- data/ext/robotstxt/.DS_Store +0 -0
- data/ext/robotstxt/extconf.rb +83 -0
- data/ext/robotstxt/robotstxt/.gitignore +1 -0
- data/ext/robotstxt/robotstxt/BUILD +40 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt +174 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt.in +30 -0
- data/ext/robotstxt/robotstxt/CONTRIBUTING.md +30 -0
- data/ext/robotstxt/robotstxt/LICENSE +203 -0
- data/ext/robotstxt/robotstxt/README.md +134 -0
- data/ext/robotstxt/robotstxt/WORKSPACE +28 -0
- data/ext/robotstxt/robotstxt/protocol-draft/README.md +9 -0
- data/ext/robotstxt/robotstxt/protocol-draft/draft-koster-rep-00.txt +529 -0
- data/ext/robotstxt/robotstxt/robots.cc +706 -0
- data/ext/robotstxt/robotstxt/robots.h +241 -0
- data/ext/robotstxt/robotstxt/robots_main.cc +101 -0
- data/ext/robotstxt/robotstxt/robots_test.cc +990 -0
- data/ext/robotstxt/robotstxt.cc +32 -0
- data/google_robotstxt_parser.gemspec +45 -0
- data/lib/google_robotstxt_parser/version.rb +6 -0
- data/lib/google_robotstxt_parser.rb +4 -0
- data/spec/google_robotstxt_parser_spec.rb +33 -0
- data/spec/spec_helper.rb +19 -0
- metadata +146 -0
data/ext/robotstxt/robotstxt/robots_test.cc
@@ -0,0 +1,990 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// This file tests the robots.txt parsing and matching code found in robots.cc
+// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
+// https://tools.ietf.org/html/draft-koster-rep
+#include "robots.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+
+namespace {
+
+using ::googlebot::RobotsMatcher;
+
+bool IsUserAgentAllowed(const absl::string_view robotstxt,
+                        const std::string& useragent, const std::string& url) {
+  RobotsMatcher matcher;
+  return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
+}
+
+// Google-specific: system test.
+TEST(RobotsUnittest, GoogleOnly_SystemTest) {
+  const absl::string_view robotstxt =
+      "user-agent: FooBot\n"
+      "disallow: /\n";
+  // Empty robots.txt: everything allowed.
+  EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", ""));
+
+  // Empty user-agent to be matched: everything allowed.
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", ""));
+
+  // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
+  // in robots.cc.
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", ""));
+
+  // All params empty: same as robots.txt empty, everything allowed.
+  EXPECT_TRUE(IsUserAgentAllowed("", "", ""));
+}
+// Rules are colon separated name-value pairs. The following names are
+// provisioned:
+//     user-agent: <value>
+//     allow: <value>
+//     disallow: <value>
+// See REP I-D section "Protocol Definition".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.1
+//
+// Google specific: webmasters sometimes miss the colon separator, but it's
+// obvious what they mean by "disallow /", so we assume the colon if it's
+// missing.
+TEST(RobotsUnittest, ID_LineSyntax_Line) {
+  const absl::string_view robotstxt_correct =
+      "user-agent: FooBot\n"
+      "disallow: /\n";
+  const absl::string_view robotstxt_incorrect =
+      "foo: FooBot\n"
+      "bar: /\n";
+  const absl::string_view robotstxt_incorrect_accepted =
+      "user-agent FooBot\n"
+      "disallow /\n";
+  const std::string url = "http://foo.bar/x/y";
+
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
+}
+
+// A group is one or more user-agent line followed by rules, and terminated
+// by a another user-agent line. Rules for same user-agents are combined
+// opaquely into one group. Rules outside groups are ignored.
+// See REP I-D section "Protocol Definition".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.1
+TEST(RobotsUnittest, ID_LineSyntax_Groups) {
+  const absl::string_view robotstxt =
+      "allow: /foo/bar/\n"
+      "\n"
+      "user-agent: FooBot\n"
+      "disallow: /\n"
+      "allow: /x/\n"
+      "user-agent: BarBot\n"
+      "disallow: /\n"
+      "allow: /y/\n"
+      "\n"
+      "\n"
+      "allow: /w/\n"
+      "user-agent: BazBot\n"
+      "\n"
+      "user-agent: FooBot\n"
+      "allow: /z/\n"
+      "disallow: /\n";
+
+  const std::string url_w = "http://foo.bar/w/a";
+  const std::string url_x = "http://foo.bar/x/b";
+  const std::string url_y = "http://foo.bar/y/c";
+  const std::string url_z = "http://foo.bar/z/d";
+  const std::string url_foo = "http://foo.bar/foo/bar/";
+
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z));
+
+  // Lines with rules outside groups are ignored.
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
+}
+
+// REP lines are case insensitive. See REP I-D section "Protocol Definition".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.1
+TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
+  const absl::string_view robotstxt_upper =
+      "USER-AGENT: FooBot\n"
+      "ALLOW: /x/\n"
+      "DISALLOW: /\n";
+  const absl::string_view robotstxt_lower =
+      "user-agent: FooBot\n"
+      "allow: /x/\n"
+      "disallow: /\n";
+  const absl::string_view robotstxt_camel =
+      "uSeR-aGeNt: FooBot\n"
+      "AlLoW: /x/\n"
+      "dIsAlLoW: /\n";
+  const std::string url_allowed = "http://foo.bar/x/y";
+  const std::string url_disallowed = "http://foo.bar/a/b";
+
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
+}
+
+// A user-agent line is expected to contain only [a-zA-Z_-] characters and must
+// not be empty. See REP I-D section "The user-agent line".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
+TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) {
+  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot"));
+  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar"));
+  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar"));
+
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view()));
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(""));
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ"));
+
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*"));
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot "));
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1"));
+
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
+}
+
+// User-agent line values are case insensitive. See REP I-D section "The
+// user-agent line".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
+TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) {
+  const absl::string_view robotstxt_upper =
+      "User-Agent: FOO BAR\n"
+      "Allow: /x/\n"
+      "Disallow: /\n";
+  const absl::string_view robotstxt_lower =
+      "User-Agent: foo bar\n"
+      "Allow: /x/\n"
+      "Disallow: /\n";
+  const absl::string_view robotstxt_camel =
+      "User-Agent: FoO bAr\n"
+      "Allow: /x/\n"
+      "Disallow: /\n";
+  const std::string url_allowed = "http://foo.bar/x/y";
+  const std::string url_disallowed = "http://foo.bar/a/b";
+
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
+}
+
+// Google specific: accept user-agent value up to the first space. Space is not
+// allowed in user-agent values, but that doesn't stop webmasters from using
+// them. This is more restrictive than the I-D, since in case of the bad value
+// "Googlebot Images" we'd still obey the rules with "Googlebot".
+// Extends REP I-D section "The user-agent line"
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
+TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) {
+  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
+  const absl::string_view robotstxt =
+      "User-Agent: *\n"
+      "Disallow: /\n"
+      "User-Agent: Foo Bar\n"
+      "Allow: /x/\n"
+      "Disallow: /\n";
+  const std::string url = "http://foo.bar/x/y";
+
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url));
+}
+
+// If no group matches the user-agent, crawlers must obey the first group with a
+// user-agent line with a "*" value, if present. If no group satisfies either
+// condition, or no groups are present at all, no rules apply.
+// See REP I-D section "The user-agent line".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
+TEST(RobotsUnittest, ID_GlobalGroups_Secondary) {
+  const absl::string_view robotstxt_empty = "";
+  const absl::string_view robotstxt_global =
+      "user-agent: *\n"
+      "allow: /\n"
+      "user-agent: FooBot\n"
+      "disallow: /\n";
+  const absl::string_view robotstxt_only_specific =
+      "user-agent: FooBot\n"
+      "allow: /\n"
+      "user-agent: BarBot\n"
+      "disallow: /\n"
+      "user-agent: BazBot\n"
+      "disallow: /\n";
+  const std::string url = "http://foo.bar/x/y";
+
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
+}
+
+// Matching rules against URIs is case sensitive.
+// See REP I-D section "The Allow and Disallow lines".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
+TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) {
+  const absl::string_view robotstxt_lowercase_url =
+      "user-agent: FooBot\n"
+      "disallow: /x/\n";
+  const absl::string_view robotstxt_uppercase_url =
+      "user-agent: FooBot\n"
+      "disallow: /X/\n";
+  const std::string url = "http://foo.bar/x/y";
+
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
+}
+
+// The most specific match found MUST be used. The most specific match is the
+// match that has the most octets. In case of multiple rules with the same
+// length, the least strict rule must be used.
+// See REP I-D section "The Allow and Disallow lines".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
+TEST(RobotsUnittest, ID_LongestMatch) {
+  const std::string url = "http://foo.bar/x/page.html";
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /x/page.html\n"
+        "allow: /x/\n";
+
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /x/page.html\n"
+        "disallow: /x/\n";
+
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
+  }
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: \n"
+        "allow: \n";
+    // In case of equivalent disallow and allow patterns for the same
+    // user-agent, allow is used.
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /\n";
+    // In case of equivalent disallow and allow patterns for the same
+    // user-agent, allow is used.
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    std::string url_a = "http://foo.bar/x";
+    std::string url_b = "http://foo.bar/x/";
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /x\n"
+        "allow: /x/\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b));
+  }
+
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /x/page.html\n"
+        "allow: /x/page.html\n";
+    // In case of equivalent disallow and allow patterns for the same
+    // user-agent, allow is used.
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /page\n"
+        "disallow: /*.html\n";
+    // Longest match wins.
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
+  }
+  {
+    const absl::string_view robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /x/page.\n"
+        "disallow: /*.html\n";
+    // Longest match wins.
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
+  }
+  {
+    const absl::string_view robotstxt =
+        "User-agent: *\n"
+        "Disallow: /x/\n"
+        "User-agent: FooBot\n"
+        "Disallow: /y/\n";
+    // Most specific group for FooBot allows implicitly /x/page.
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
+  }
+}
+
+// Octets in the URI and robots.txt paths outside the range of the US-ASCII
+// coded character set, and those in the reserved range defined by RFC3986,
+// MUST be percent-encoded as defined by RFC3986 prior to comparison.
+// See REP I-D section "The Allow and Disallow lines".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
+//
+// NOTE: It's up to the caller to percent encode a URL before passing it to the
+// parser. Percent encoding URIs in the rules is unnecessary.
+TEST(RobotsUnittest, ID_Encoding) {
+  // /foo/bar?baz=http://foo.bar stays unencoded.
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /\n"
+        "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
+    EXPECT_TRUE(IsUserAgentAllowed(
+        robotstxt, "FooBot",
+        "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
+  }
+
+  // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /\n"
+        "Allow: /foo/bar/ツ\n";
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/foo/bar/%E3%83%84"));
+    // The parser encodes the 3-byte character, but the URL is not %-encoded.
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
+  }
+  // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /\n"
+        "Allow: /foo/bar/%E3%83%84\n";
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/foo/bar/%E3%83%84"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
+  }
+  // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
+  // This is illegal according to RFC3986 and while it may work here due to
+  // simple string matching, it should not be relied on.
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /\n"
+        "Allow: /foo/bar/%62%61%7A\n";
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/foo/bar/%62%61%7A"));
+  }
+}
+
+// The REP I-D defines the following characters that have special meaning in
+// robots.txt:
+// # - inline comment.
+// $ - end of pattern.
+// * - any number of characters.
+// See REP I-D section "Special Characters".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.3
+TEST(RobotsUnittest, ID_SpecialCharacters) {
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /foo/bar/quz\n"
+        "Allow: /foo/*/qux\n";
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
+  }
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "Disallow: /foo/bar$\n"
+        "Allow: /foo/bar/qux\n";
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
+  }
+  {
+    const absl::string_view robotstxt =
+        "User-agent: FooBot\n"
+        "# Disallow: /\n"
+        "Disallow: /foo/quz#qux\n"
+        "Allow: /\n";
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
+  }
+}
+
+// Google-specific: "index.html" (and only that) at the end of a pattern is
+// equivalent to "/".
+TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) {
+  const absl::string_view robotstxt =
+      "User-Agent: *\n"
+      "Allow: /allowed-slash/index.html\n"
+      "Disallow: /\n";
+  // If index.html is allowed, we interpret this as / being allowed too.
+  EXPECT_TRUE(
+      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
+  // Does not exatly match.
+  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot",
+                                  "http://foo.com/allowed-slash/index.htm"));
+  // Exact match.
+  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot",
+                                 "http://foo.com/allowed-slash/index.html"));
+  EXPECT_FALSE(
+      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
+}
+
+// Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
+// RobotsTxtParser::Parse().
+TEST(RobotsUnittest, GoogleOnly_LineTooLong) {
+  size_t kEOLLen = std::string("\n").length();
+  int kMaxLineLen = 2083 * 8;
+  std::string allow = "allow: ";
+  std::string disallow = "disallow: ";
+
+  // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
+  {
+    std::string robotstxt = "user-agent: FooBot\n";
+    std::string longline = "/x/";
+    size_t max_length =
+        kMaxLineLen - longline.length() - disallow.length() + kEOLLen;
+    while (longline.size() < max_length) {
+      absl::StrAppend(&longline, "a");
+    }
+    absl::StrAppend(&robotstxt, disallow, longline, "/qux\n");
+
+    // Matches nothing, so URL is allowed.
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
+    // Matches cut off disallow rule.
+    EXPECT_FALSE(IsUserAgentAllowed(
+        robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux")));
+  }
+
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n";
+    std::string longline_a = "/x/";
+    std::string longline_b = "/x/";
+    size_t max_length =
+        kMaxLineLen - longline_a.length() - allow.length() + kEOLLen;
+    while (longline_a.size() < max_length) {
+      absl::StrAppend(&longline_a, "a");
+      absl::StrAppend(&longline_b, "b");
+    }
+    absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n");
+    absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n");
+
+    // URL matches the disallow rule.
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
+    // Matches the allow rule exactly.
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot",
+                           absl::StrCat("http://foo.bar", longline_a, "/qux")));
+    // Matches cut off allow rule.
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot",
+                           absl::StrCat("http://foo.bar", longline_b, "/fux")));
+  }
+}
+
+TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) {
+  // Test documentation from
+  // https://developers.google.com/search/reference/robots_txt
+  // Section "URL matching based on path values".
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /fish\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish/salmon.html"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fishheads/yummy.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish.html?id=anything"));
+
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
+  }
+  // "/fish*" equals "/fish"
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /fish*\n";
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish/salmon.html"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fishheads/yummy.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish.html?id=anything"));
+
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
+  }
+  // "/fish/" does not equal "/fish"
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /fish/\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish/salmon.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/fish/?id=anything"));
+
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/Fish/Salmon.html"));
+  }
+  // "/*.php"
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /*.php\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/folder/filename.php"));
+    EXPECT_TRUE(IsUserAgentAllowed(
+        robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar//folder/any.php.file.html"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/filename.php/"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/index?f=filename.php/"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/php/"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/index?php"));
+
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
+  }
+  // "/*.php$"
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /*.php$\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                   "http://foo.bar/folder/filename.php"));
+
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/filename.php?parameters"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/filename.php/"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/filename.php5"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/php/"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/filename?php"));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
+                                    "http://foo.bar/aaaphpaaa"));
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
+  }
+  // "/fish*.php"
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "disallow: /\n"
+        "allow: /fish*.php\n";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));
+
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
+    EXPECT_TRUE(
+        IsUserAgentAllowed(robotstxt, "FooBot",
+                           "http://foo.bar/fishheads/catfish.php?parameters"));
+
+    EXPECT_FALSE(
+        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
+  }
+  // Section "Order of precedence for group-member records".
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /p\n"
+        "disallow: /\n";
+    std::string url = "http://example.com/page";
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /folder\n"
+        "disallow: /folder\n";
+    std::string url = "http://example.com/folder/page";
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /page\n"
+        "disallow: /*.htm\n";
+    std::string url = "http://example.com/page.htm";
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+  }
+  {
+    std::string robotstxt =
+        "user-agent: FooBot\n"
+        "allow: /$\n"
+        "disallow: /\n";
+    std::string url = "http://example.com/";
+    std::string url_page = "http://example.com/page.html";
+    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
+    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page));
+  }
+}
+
+class RobotsStatsReporter : public googlebot::RobotsParseHandler {
+ public:
+  void HandleRobotsStart() override {
+    last_line_seen_ = 0;
+    valid_directives_ = 0;
+    unknown_directives_ = 0;
+    sitemap_.clear();
+  }
+  void HandleRobotsEnd() override {}
+
+  void HandleUserAgent(int line_num, absl::string_view value) override {
+    Digest(line_num);
+  }
+  void HandleAllow(int line_num, absl::string_view value) override {
+    Digest(line_num);
+  }
+  void HandleDisallow(int line_num, absl::string_view value) override {
+    Digest(line_num);
+  }
+
+  void HandleSitemap(int line_num, absl::string_view value) override {
+    Digest(line_num);
+    sitemap_.append(value.data(), value.length());
+  }
+
+  // Any other unrecognized name/v pairs.
+  void HandleUnknownAction(int line_num, absl::string_view action,
+                           absl::string_view value) override {
+    last_line_seen_ = line_num;
+    unknown_directives_++;
+  }
+
+  int last_line_seen() const { return last_line_seen_; }
+
+  // All directives found, including unknown.
+  int valid_directives() const { return valid_directives_; }
+
+  // Number of unknown directives.
+  int unknown_directives() const { return unknown_directives_; }
+
+  // Parsed sitemap line.
+  std::string sitemap() const { return sitemap_; }
+
+ private:
+  void Digest(int line_num) {
+    ASSERT_GE(line_num, last_line_seen_);
+    last_line_seen_ = line_num;
+    valid_directives_++;
+  }
+
+  int last_line_seen_ = 0;
+  int valid_directives_ = 0;
+  int unknown_directives_ = 0;
+  std::string sitemap_;
+};
+
+// Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
+TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) {
+  RobotsStatsReporter report;
+  static const char kUnixFile[] =
+      "User-Agent: foo\n"
+      "Allow: /some/path\n"
+      "User-Agent: bar\n"
+      "\n"
+      "\n"
+      "Disallow: /\n";
+  googlebot::ParseRobotsTxt(kUnixFile, &report);
+  EXPECT_EQ(4, report.valid_directives());
+  EXPECT_EQ(6, report.last_line_seen());
+
+  static const char kDosFile[] =
+      "User-Agent: foo\r\n"
+      "Allow: /some/path\r\n"
+      "User-Agent: bar\r\n"
+      "\r\n"
+      "\r\n"
+      "Disallow: /\r\n";
+  googlebot::ParseRobotsTxt(kDosFile, &report);
+  EXPECT_EQ(4, report.valid_directives());
+  EXPECT_EQ(6, report.last_line_seen());
+
+  static const char kMacFile[] =
+      "User-Agent: foo\r"
+      "Allow: /some/path\r"
+      "User-Agent: bar\r"
+      "\r"
+      "\r"
+      "Disallow: /\r";
+  googlebot::ParseRobotsTxt(kMacFile, &report);
+  EXPECT_EQ(4, report.valid_directives());
+  EXPECT_EQ(6, report.last_line_seen());
+
+  static const char kNoFinalNewline[] =
+      "User-Agent: foo\n"
+      "Allow: /some/path\n"
+      "User-Agent: bar\n"
+      "\n"
+      "\n"
+      "Disallow: /";
+  googlebot::ParseRobotsTxt(kNoFinalNewline, &report);
+  EXPECT_EQ(4, report.valid_directives());
+  EXPECT_EQ(6, report.last_line_seen());
+
+  static const char kMixedFile[] =
+      "User-Agent: foo\n"
+      "Allow: /some/path\r\n"
+      "User-Agent: bar\n"
+      "\r\n"
+      "\n"
+      "Disallow: /";
+  googlebot::ParseRobotsTxt(kMixedFile, &report);
+  EXPECT_EQ(4, report.valid_directives());
+  EXPECT_EQ(6, report.last_line_seen());
+}
+
+// BOM characters are unparseable and thus skipped. The rules following the line
+// are used.
+TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) {
+  RobotsStatsReporter report;
+  static const char kUtf8FileFullBOM[] =
+      "\xEF\xBB\xBF"
+      "User-Agent: foo\n"
+      "Allow: /AnyValue\n";
+  googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report);
+  EXPECT_EQ(2, report.valid_directives());
+  EXPECT_EQ(0, report.unknown_directives());
+
+  // We allow as well partial ByteOrderMarks.
+  static const char kUtf8FilePartial2BOM[] =
+      "\xEF\xBB"
+      "User-Agent: foo\n"
+      "Allow: /AnyValue\n";
+  googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report);
+  EXPECT_EQ(2, report.valid_directives());
+  EXPECT_EQ(0, report.unknown_directives());
+
+  static const char kUtf8FilePartial1BOM[] =
+      "\xEF"
+      "User-Agent: foo\n"
+      "Allow: /AnyValue\n";
+  googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report);
+  EXPECT_EQ(2, report.valid_directives());
+  EXPECT_EQ(0, report.unknown_directives());
+
+  // If the BOM is not the right sequence, the first line looks like garbage
+  // that is skipped (we essentially see "\x11\xBFUser-Agent").
+  static const char kUtf8FileBrokenBOM[] =
+      "\xEF\x11\xBF"
+      "User-Agent: foo\n"
+      "Allow: /AnyValue\n";
+  googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report);
+  EXPECT_EQ(1, report.valid_directives());
+  EXPECT_EQ(1, report.unknown_directives());  // We get one broken line.
+
+  // Some other messed up file: BOMs only valid in the beginning of the file.
+  static const char kUtf8BOMSomewhereInMiddleOfFile[] =
+      "User-Agent: foo\n"
+      "\xEF\xBB\xBF"
+      "Allow: /AnyValue\n";
+  googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report);
+  EXPECT_EQ(1, report.valid_directives());
+  EXPECT_EQ(1, report.unknown_directives());
+}
+
+// Google specific: the I-D allows any line that crawlers might need, such as
+// sitemaps, which Google supports.
+// See REP I-D section "Other records".
+// https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
+TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) {
+  RobotsStatsReporter report;
+  {
+    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
+    std::string robotstxt =
+        "User-Agent: foo\n"
+        "Allow: /some/path\n"
+        "User-Agent: bar\n"
+        "\n"
+        "\n";
+    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n");
+
+    googlebot::ParseRobotsTxt(robotstxt, &report);
+    EXPECT_EQ(sitemap_loc, report.sitemap());
+  }
+  // A sitemap line may appear anywhere in the file.
+  {
+    std::string robotstxt;
+    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
+    std::string robotstxt_temp =
+        "User-Agent: foo\n"
+        "Allow: /some/path\n"
+        "User-Agent: bar\n"
+        "\n"
+        "\n";
+    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp);
+
+    googlebot::ParseRobotsTxt(robotstxt, &report);
+    EXPECT_EQ(sitemap_loc, report.sitemap());
+  }
+}
+
+}  // namespace
+
+// Integrity tests. These functions are available to the linker, but not in the
+// header, because they should only be used for testing.
+namespace googlebot {
+std::string GetPathParamsQuery(const std::string& url);
+bool MaybeEscapePattern(const char* src, char** dst);
+}  // namespace googlebot
+
+void TestPath(const std::string& url, const std::string& expected_path) {
+  EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url));
+}
+
+void TestEscape(const std::string& url, const std::string& expected) {
+  char* escaped_value = nullptr;
+  const bool is_escaped =
+      googlebot::MaybeEscapePattern(url.c_str(), &escaped_value);
+  const std::string escaped = escaped_value;
+  if (is_escaped) delete[] escaped_value;
+
+  EXPECT_EQ(expected, escaped);
+}
+
+TEST(RobotsUnittest, TestGetPathParamsQuery) {
+  // Only testing URLs that are already correctly escaped here.
+  TestPath("", "/");
+  TestPath("http://www.example.com", "/");
+  TestPath("http://www.example.com/", "/");
+  TestPath("http://www.example.com/a", "/a");
+  TestPath("http://www.example.com/a/", "/a/");
+  TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
+  TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
+  TestPath("example.com", "/");
+  TestPath("example.com/", "/");
+  TestPath("example.com/a", "/a");
+  TestPath("example.com/a/", "/a/");
+  TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
+  TestPath("a", "/");
+  TestPath("a/", "/");
+  TestPath("/a", "/a");
+  TestPath("a/b", "/b");
+  TestPath("example.com?a", "/?a");
+  TestPath("example.com/a;b#c", "/a;b");
+  TestPath("//a/b/c", "/b/c");
+}
+
+TEST(RobotsUnittest, TestMaybeEscapePattern) {
+  TestEscape("http://www.example.com", "http://www.example.com");
+  TestEscape("/a/b/c", "/a/b/c");
+  TestEscape("á", "%C3%A1");
+  TestEscape("%aa", "%AA");
+}