google_robotstxt_parser 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.gitmodules +3 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +6 -0
- data/Guardfile +16 -0
- data/LICENSE +22 -0
- data/README.md +57 -0
- data/Rakefile +6 -0
- data/ext/robotstxt/.DS_Store +0 -0
- data/ext/robotstxt/extconf.rb +83 -0
- data/ext/robotstxt/robotstxt/.gitignore +1 -0
- data/ext/robotstxt/robotstxt/BUILD +40 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt +174 -0
- data/ext/robotstxt/robotstxt/CMakeLists.txt.in +30 -0
- data/ext/robotstxt/robotstxt/CONTRIBUTING.md +30 -0
- data/ext/robotstxt/robotstxt/LICENSE +203 -0
- data/ext/robotstxt/robotstxt/README.md +134 -0
- data/ext/robotstxt/robotstxt/WORKSPACE +28 -0
- data/ext/robotstxt/robotstxt/protocol-draft/README.md +9 -0
- data/ext/robotstxt/robotstxt/protocol-draft/draft-koster-rep-00.txt +529 -0
- data/ext/robotstxt/robotstxt/robots.cc +706 -0
- data/ext/robotstxt/robotstxt/robots.h +241 -0
- data/ext/robotstxt/robotstxt/robots_main.cc +101 -0
- data/ext/robotstxt/robotstxt/robots_test.cc +990 -0
- data/ext/robotstxt/robotstxt.cc +32 -0
- data/google_robotstxt_parser.gemspec +45 -0
- data/lib/google_robotstxt_parser/version.rb +6 -0
- data/lib/google_robotstxt_parser.rb +4 -0
- data/spec/google_robotstxt_parser_spec.rb +33 -0
- data/spec/spec_helper.rb +19 -0
- metadata +146 -0
data/ext/robotstxt/robotstxt/robots_test.cc
@@ -0,0 +1,990 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep
#include "robots.h"

#include <string>

#include "gtest/gtest.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"

namespace {

using ::googlebot::RobotsMatcher;

bool IsUserAgentAllowed(const absl::string_view robotstxt,
                        const std::string& useragent, const std::string& url) {
  RobotsMatcher matcher;
  return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
}

// Google-specific: system test.
TEST(RobotsUnittest, GoogleOnly_SystemTest) {
  const absl::string_view robotstxt =
      "user-agent: FooBot\n"
      "disallow: /\n";
  // Empty robots.txt: everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed("", "FooBot", ""));

  // Empty user-agent to be matched: everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "", ""));

  // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
  // in robots.cc.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", ""));

  // All params empty: same as robots.txt empty, everything allowed.
  EXPECT_TRUE(IsUserAgentAllowed("", "", ""));
}
// Rules are colon separated name-value pairs. The following names are
// provisioned:
//     user-agent: <value>
//     allow: <value>
//     disallow: <value>
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
//
// Google specific: webmasters sometimes miss the colon separator, but it's
// obvious what they mean by "disallow /", so we assume the colon if it's
// missing.
TEST(RobotsUnittest, ID_LineSyntax_Line) {
  const absl::string_view robotstxt_correct =
      "user-agent: FooBot\n"
      "disallow: /\n";
  const absl::string_view robotstxt_incorrect =
      "foo: FooBot\n"
      "bar: /\n";
  const absl::string_view robotstxt_incorrect_accepted =
      "user-agent FooBot\n"
      "disallow /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
}

// A group is one or more user-agent line followed by rules, and terminated
// by a another user-agent line. Rules for same user-agents are combined
// opaquely into one group. Rules outside groups are ignored.
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_LineSyntax_Groups) {
  const absl::string_view robotstxt =
      "allow: /foo/bar/\n"
      "\n"
      "user-agent: FooBot\n"
      "disallow: /\n"
      "allow: /x/\n"
      "user-agent: BarBot\n"
      "disallow: /\n"
      "allow: /y/\n"
      "\n"
      "\n"
      "allow: /w/\n"
      "user-agent: BazBot\n"
      "\n"
      "user-agent: FooBot\n"
      "allow: /z/\n"
      "disallow: /\n";

  const std::string url_w = "http://foo.bar/w/a";
  const std::string url_x = "http://foo.bar/x/b";
  const std::string url_y = "http://foo.bar/y/c";
  const std::string url_z = "http://foo.bar/z/d";
  const std::string url_foo = "http://foo.bar/foo/bar/";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_x));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_z));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_y));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_y));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BarBot", url_w));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_z));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "BazBot", url_z));

  // Lines with rules outside groups are ignored.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
  const absl::string_view robotstxt_upper =
      "USER-AGENT: FooBot\n"
      "ALLOW: /x/\n"
      "DISALLOW: /\n";
  const absl::string_view robotstxt_lower =
      "user-agent: FooBot\n"
      "allow: /x/\n"
      "disallow: /\n";
  const absl::string_view robotstxt_camel =
      "uSeR-aGeNt: FooBot\n"
      "AlLoW: /x/\n"
      "dIsAlLoW: /\n";
  const std::string url_allowed = "http://foo.bar/x/y";
  const std::string url_disallowed = "http://foo.bar/a/b";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
}

// A user-agent line is expected to contain only [a-zA-Z_-] characters and must
// not be empty. See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_VerifyValidUserAgentsToObey) {
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot"));
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foobot-Bar"));
  EXPECT_TRUE(RobotsMatcher::IsValidUserAgentToObey("Foo_Bar"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(absl::string_view()));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(""));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("ツ"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot*"));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey(" Foobot "));
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot/2.1"));

  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
}

// User-agent line values are case insensitive. See REP I-D section "The
// user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_UserAgentValueCaseInsensitive) {
  const absl::string_view robotstxt_upper =
      "User-Agent: FOO BAR\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const absl::string_view robotstxt_lower =
      "User-Agent: foo bar\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const absl::string_view robotstxt_camel =
      "User-Agent: FoO bAr\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const std::string url_allowed = "http://foo.bar/x/y";
  const std::string url_disallowed = "http://foo.bar/a/b";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
}

// Google specific: accept user-agent value up to the first space. Space is not
// allowed in user-agent values, but that doesn't stop webmasters from using
// them. This is more restrictive than the I-D, since in case of the bad value
// "Googlebot Images" we'd still obey the rules with "Googlebot".
// Extends REP I-D section "The user-agent line"
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, GoogleOnly_AcceptUserAgentUpToFirstSpace) {
  EXPECT_FALSE(RobotsMatcher::IsValidUserAgentToObey("Foobot Bar"));
  const absl::string_view robotstxt =
      "User-Agent: *\n"
      "Disallow: /\n"
      "User-Agent: Foo Bar\n"
      "Allow: /x/\n"
      "Disallow: /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "Foo", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "Foo Bar", url));
}

// If no group matches the user-agent, crawlers must obey the first group with a
// user-agent line with a "*" value, if present. If no group satisfies either
// condition, or no groups are present at all, no rules apply.
// See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
TEST(RobotsUnittest, ID_GlobalGroups_Secondary) {
  const absl::string_view robotstxt_empty = "";
  const absl::string_view robotstxt_global =
      "user-agent: *\n"
      "allow: /\n"
      "user-agent: FooBot\n"
      "disallow: /\n";
  const absl::string_view robotstxt_only_specific =
      "user-agent: FooBot\n"
      "allow: /\n"
      "user-agent: BarBot\n"
      "disallow: /\n"
      "user-agent: BazBot\n"
      "disallow: /\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_global, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_global, "BarBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
}

// Matching rules against URIs is case sensitive.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
TEST(RobotsUnittest, ID_AllowDisallow_Value_CaseSensitive) {
  const absl::string_view robotstxt_lowercase_url =
      "user-agent: FooBot\n"
      "disallow: /x/\n";
  const absl::string_view robotstxt_uppercase_url =
      "user-agent: FooBot\n"
      "disallow: /X/\n";
  const std::string url = "http://foo.bar/x/y";

  EXPECT_FALSE(IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
}

// The most specific match found MUST be used. The most specific match is the
// match that has the most octets. In case of multiple rules with the same
// length, the least strict rule must be used.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
TEST(RobotsUnittest, ID_LongestMatch) {
  const std::string url = "http://foo.bar/x/page.html";
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x/page.html\n"
        "allow: /x/\n";

    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /x/page.html\n"
        "disallow: /x/\n";

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: \n"
        "allow: \n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /\n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string url_a = "http://foo.bar/x";
    std::string url_b = "http://foo.bar/x/";
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x\n"
        "allow: /x/\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_a));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url_b));
  }

  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "disallow: /x/page.html\n"
        "allow: /x/page.html\n";
    // In case of equivalent disallow and allow patterns for the same
    // user-agent, allow is used.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /page\n"
        "disallow: /*.html\n";
    // Longest match wins.
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
  }
  {
    const absl::string_view robotstxt =
        "user-agent: FooBot\n"
        "allow: /x/page.\n"
        "disallow: /*.html\n";
    // Longest match wins.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: *\n"
        "Disallow: /x/\n"
        "User-agent: FooBot\n"
        "Disallow: /y/\n";
    // Most specific group for FooBot allows implicitly /x/page.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
  }
}

// Octets in the URI and robots.txt paths outside the range of the US-ASCII
// coded character set, and those in the reserved range defined by RFC3986,
// MUST be percent-encoded as defined by RFC3986 prior to comparison.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
//
// NOTE: It's up to the caller to percent encode a URL before passing it to the
// parser. Percent encoding URIs in the rules is unnecessary.
TEST(RobotsUnittest, ID_Encoding) {
  // /foo/bar?baz=http://foo.bar stays unencoded.
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
    EXPECT_TRUE(IsUserAgentAllowed(
        robotstxt, "FooBot",
        "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
  }

  // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/ツ\n";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%E3%83%84"));
    // The parser encodes the 3-byte character, but the URL is not %-encoded.
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
  }
  // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/%E3%83%84\n";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%E3%83%84"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
  }
  // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
  // This is illegal according to RFC3986 and while it may work here due to
  // simple string matching, it should not be relied on.
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /\n"
        "Allow: /foo/bar/%62%61%7A\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/foo/bar/%62%61%7A"));
  }
}

// The REP I-D defines the following characters that have special meaning in
// robots.txt:
// # - inline comment.
// $ - end of pattern.
// * - any number of characters.
// See REP I-D section "Special Characters".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.3
TEST(RobotsUnittest, ID_SpecialCharacters) {
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /foo/bar/quz\n"
        "Allow: /foo/*/qux\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "Disallow: /foo/bar$\n"
        "Allow: /foo/bar/qux\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
  }
  {
    const absl::string_view robotstxt =
        "User-agent: FooBot\n"
        "# Disallow: /\n"
        "Disallow: /foo/quz#qux\n"
        "Allow: /\n";
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
  }
}

// Google-specific: "index.html" (and only that) at the end of a pattern is
// equivalent to "/".
TEST(RobotsUnittest, GoogleOnly_IndexHTMLisDirectory) {
  const absl::string_view robotstxt =
      "User-Agent: *\n"
      "Allow: /allowed-slash/index.html\n"
      "Disallow: /\n";
  // If index.html is allowed, we interpret this as / being allowed too.
  EXPECT_TRUE(
      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
  // Does not exatly match.
  EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "foobot",
                                  "http://foo.com/allowed-slash/index.htm"));
  // Exact match.
  EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "foobot",
                                 "http://foo.com/allowed-slash/index.html"));
  EXPECT_FALSE(
      IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
}

// Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
// RobotsTxtParser::Parse().
TEST(RobotsUnittest, GoogleOnly_LineTooLong) {
  size_t kEOLLen = std::string("\n").length();
  int kMaxLineLen = 2083 * 8;
  std::string allow = "allow: ";
  std::string disallow = "disallow: ";

  // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
  {
    std::string robotstxt = "user-agent: FooBot\n";
    std::string longline = "/x/";
    size_t max_length =
        kMaxLineLen - longline.length() - disallow.length() + kEOLLen;
    while (longline.size() < max_length) {
      absl::StrAppend(&longline, "a");
    }
    absl::StrAppend(&robotstxt, disallow, longline, "/qux\n");

    // Matches nothing, so URL is allowed.
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
    // Matches cut off disallow rule.
    EXPECT_FALSE(IsUserAgentAllowed(
        robotstxt, "FooBot", absl::StrCat("http://foo.bar", longline, "/fux")));
  }

  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n";
    std::string longline_a = "/x/";
    std::string longline_b = "/x/";
    size_t max_length =
        kMaxLineLen - longline_a.length() - allow.length() + kEOLLen;
    while (longline_a.size() < max_length) {
      absl::StrAppend(&longline_a, "a");
      absl::StrAppend(&longline_b, "b");
    }
    absl::StrAppend(&robotstxt, allow, longline_a, "/qux\n");
    absl::StrAppend(&robotstxt, allow, longline_b, "/qux\n");

    // URL matches the disallow rule.
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
    // Matches the allow rule exactly.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           absl::StrCat("http://foo.bar", longline_a, "/qux")));
    // Matches cut off allow rule.
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           absl::StrCat("http://foo.bar", longline_b, "/fux")));
  }
}

TEST(RobotsUnittest, GoogleOnly_DocumentationChecks) {
  // Test documentation from
  // https://developers.google.com/search/reference/robots_txt
  // Section "URL matching based on path values".
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fishheads/yummy.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish.html?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
  }
  // "/fish*" equals "/fish"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish*\n";
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fishheads/yummy.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish.html?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
  }
  // "/fish/" does not equal "/fish"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish/\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/salmon.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/fish/?id=anything"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/Fish/Salmon.html"));
  }
  // "/*.php"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /*.php\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/folder/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(
        robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar//folder/any.php.file.html"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/filename.php/"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/index?f=filename.php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/index?php"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
  }
  // "/*.php$"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /*.php$\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot",
                                   "http://foo.bar/folder/filename.php"));

    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php?parameters"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename.php5"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/php/"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/filename?php"));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot",
                                    "http://foo.bar/aaaphpaaa"));
    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
  }
  // "/fish*.php"
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /fish*.php\n";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
    EXPECT_TRUE(
        IsUserAgentAllowed(robotstxt, "FooBot",
                           "http://foo.bar/fishheads/catfish.php?parameters"));

    EXPECT_FALSE(
        IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
  }
  // Section "Order of precedence for group-member records".
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /p\n"
        "disallow: /\n";
    std::string url = "http://example.com/page";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /folder\n"
        "disallow: /folder\n";
    std::string url = "http://example.com/folder/page";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /page\n"
        "disallow: /*.htm\n";
    std::string url = "http://example.com/page.htm";
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url));
  }
  {
    std::string robotstxt =
        "user-agent: FooBot\n"
        "allow: /$\n"
        "disallow: /\n";
    std::string url = "http://example.com/";
    std::string url_page = "http://example.com/page.html";
    EXPECT_TRUE(IsUserAgentAllowed(robotstxt, "FooBot", url));
    EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "FooBot", url_page));
  }
}

class RobotsStatsReporter : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {
    last_line_seen_ = 0;
    valid_directives_ = 0;
    unknown_directives_ = 0;
    sitemap_.clear();
  }
  void HandleRobotsEnd() override {}

  void HandleUserAgent(int line_num, absl::string_view value) override {
    Digest(line_num);
  }
  void HandleAllow(int line_num, absl::string_view value) override {
    Digest(line_num);
  }
  void HandleDisallow(int line_num, absl::string_view value) override {
    Digest(line_num);
  }

  void HandleSitemap(int line_num, absl::string_view value) override {
    Digest(line_num);
    sitemap_.append(value.data(), value.length());
  }

  // Any other unrecognized name/v pairs.
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {
    last_line_seen_ = line_num;
    unknown_directives_++;
  }

  int last_line_seen() const { return last_line_seen_; }

  // All directives found, including unknown.
  int valid_directives() const { return valid_directives_; }

  // Number of unknown directives.
  int unknown_directives() const { return unknown_directives_; }

  // Parsed sitemap line.
  std::string sitemap() const { return sitemap_; }

 private:
  void Digest(int line_num) {
    ASSERT_GE(line_num, last_line_seen_);
    last_line_seen_ = line_num;
    valid_directives_++;
  }

  int last_line_seen_ = 0;
  int valid_directives_ = 0;
  int unknown_directives_ = 0;
  std::string sitemap_;
};

// Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
TEST(RobotsUnittest, ID_LinesNumbersAreCountedCorrectly) {
  RobotsStatsReporter report;
  static const char kUnixFile[] =
      "User-Agent: foo\n"
      "Allow: /some/path\n"
      "User-Agent: bar\n"
      "\n"
      "\n"
      "Disallow: /\n";
  googlebot::ParseRobotsTxt(kUnixFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kDosFile[] =
      "User-Agent: foo\r\n"
      "Allow: /some/path\r\n"
      "User-Agent: bar\r\n"
      "\r\n"
      "\r\n"
      "Disallow: /\r\n";
  googlebot::ParseRobotsTxt(kDosFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kMacFile[] =
      "User-Agent: foo\r"
      "Allow: /some/path\r"
      "User-Agent: bar\r"
      "\r"
      "\r"
      "Disallow: /\r";
  googlebot::ParseRobotsTxt(kMacFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kNoFinalNewline[] =
      "User-Agent: foo\n"
      "Allow: /some/path\n"
      "User-Agent: bar\n"
      "\n"
      "\n"
      "Disallow: /";
  googlebot::ParseRobotsTxt(kNoFinalNewline, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());

  static const char kMixedFile[] =
      "User-Agent: foo\n"
      "Allow: /some/path\r\n"
      "User-Agent: bar\n"
      "\r\n"
      "\n"
      "Disallow: /";
  googlebot::ParseRobotsTxt(kMixedFile, &report);
  EXPECT_EQ(4, report.valid_directives());
  EXPECT_EQ(6, report.last_line_seen());
}

// BOM characters are unparseable and thus skipped. The rules following the line
// are used.
TEST(RobotsUnittest, ID_UTF8ByteOrderMarkIsSkipped) {
  RobotsStatsReporter report;
  static const char kUtf8FileFullBOM[] =
      "\xEF\xBB\xBF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FileFullBOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  // We allow as well partial ByteOrderMarks.
  static const char kUtf8FilePartial2BOM[] =
      "\xEF\xBB"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FilePartial2BOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  static const char kUtf8FilePartial1BOM[] =
      "\xEF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FilePartial1BOM, &report);
  EXPECT_EQ(2, report.valid_directives());
  EXPECT_EQ(0, report.unknown_directives());

  // If the BOM is not the right sequence, the first line looks like garbage
  // that is skipped (we essentially see "\x11\xBFUser-Agent").
  static const char kUtf8FileBrokenBOM[] =
      "\xEF\x11\xBF"
      "User-Agent: foo\n"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8FileBrokenBOM, &report);
  EXPECT_EQ(1, report.valid_directives());
  EXPECT_EQ(1, report.unknown_directives());  // We get one broken line.

  // Some other messed up file: BOMs only valid in the beginning of the file.
  static const char kUtf8BOMSomewhereInMiddleOfFile[] =
      "User-Agent: foo\n"
      "\xEF\xBB\xBF"
      "Allow: /AnyValue\n";
  googlebot::ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, &report);
  EXPECT_EQ(1, report.valid_directives());
  EXPECT_EQ(1, report.unknown_directives());
}

// Google specific: the I-D allows any line that crawlers might need, such as
// sitemaps, which Google supports.
// See REP I-D section "Other records".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
TEST(RobotsUnittest, ID_NonStandardLineExample_Sitemap) {
  RobotsStatsReporter report;
  {
    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
    std::string robotstxt =
        "User-Agent: foo\n"
        "Allow: /some/path\n"
        "User-Agent: bar\n"
        "\n"
        "\n";
    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n");

    googlebot::ParseRobotsTxt(robotstxt, &report);
    EXPECT_EQ(sitemap_loc, report.sitemap());
  }
  // A sitemap line may appear anywhere in the file.
  {
    std::string robotstxt;
    std::string sitemap_loc = "http://foo.bar/sitemap.xml";
    std::string robotstxt_temp =
        "User-Agent: foo\n"
        "Allow: /some/path\n"
        "User-Agent: bar\n"
        "\n"
        "\n";
    absl::StrAppend(&robotstxt, "Sitemap: ", sitemap_loc, "\n", robotstxt_temp);

    googlebot::ParseRobotsTxt(robotstxt, &report);
    EXPECT_EQ(sitemap_loc, report.sitemap());
  }
}

}  // namespace

// Integrity tests. These functions are available to the linker, but not in the
// header, because they should only be used for testing.
namespace googlebot {
std::string GetPathParamsQuery(const std::string& url);
bool MaybeEscapePattern(const char* src, char** dst);
}  // namespace googlebot

void TestPath(const std::string& url, const std::string& expected_path) {
  EXPECT_EQ(expected_path, googlebot::GetPathParamsQuery(url));
}

void TestEscape(const std::string& url, const std::string& expected) {
  char* escaped_value = nullptr;
  const bool is_escaped =
      googlebot::MaybeEscapePattern(url.c_str(), &escaped_value);
  const std::string escaped = escaped_value;
  if (is_escaped) delete[] escaped_value;

  EXPECT_EQ(expected, escaped);
}

TEST(RobotsUnittest, TestGetPathParamsQuery) {
  // Only testing URLs that are already correctly escaped here.
  TestPath("", "/");
  TestPath("http://www.example.com", "/");
  TestPath("http://www.example.com/", "/");
  TestPath("http://www.example.com/a", "/a");
  TestPath("http://www.example.com/a/", "/a/");
  TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
  TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
  TestPath("example.com", "/");
  TestPath("example.com/", "/");
  TestPath("example.com/a", "/a");
  TestPath("example.com/a/", "/a/");
  TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
  TestPath("a", "/");
  TestPath("a/", "/");
  TestPath("/a", "/a");
  TestPath("a/b", "/b");
  TestPath("example.com?a", "/?a");
  TestPath("example.com/a;b#c", "/a;b");
  TestPath("//a/b/c", "/b/c");
}

TEST(RobotsUnittest, TestMaybeEscapePattern) {
  TestEscape("http://www.example.com", "http://www.example.com");
  TestEscape("/a/b/c", "/a/b/c");
  TestEscape("á", "%C3%A1");
  TestEscape("%aa", "%AA");
}
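Note: the tests above drive the matcher through the small IsUserAgentAllowed helper defined near the top of the file. For readers who want to call the library directly, a minimal standalone sketch of a caller is shown below. It uses only the entry point exercised in this file (googlebot::RobotsMatcher::OneAgentAllowedByRobots); the robots.txt body, user-agent, and URL are illustrative values and are not part of the packaged sources.

#include <iostream>
#include <string>

#include "robots.h"

int main() {
  // Illustrative robots.txt body; in practice this is the fetched file content.
  const std::string robotstxt =
      "user-agent: FooBot\n"
      "disallow: /private/\n";
  googlebot::RobotsMatcher matcher;
  // True if the rules above allow "FooBot" to fetch the given URL.
  const bool allowed = matcher.OneAgentAllowedByRobots(
      robotstxt, "FooBot", "http://example.com/private/page.html");
  std::cout << (allowed ? "allowed" : "disallowed") << std::endl;
  return 0;
}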