robots 0.8.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +3 -0
- data/VERSION +1 -1
- data/lib/robots.rb +12 -9
- data/robots.gemspec +8 -4
- data/test/fixtures/emptyish.txt +2 -0
- data/test/fixtures/eventbrite.txt +435 -0
- data/test/fixtures/google.txt +215 -0
- data/test/fixtures/reddit.txt +14 -0
- data/test/fixtures/yelp.txt +68 -0
- data/test/test_robots.rb +46 -15
- metadata +23 -10
- data/test/fixtures/robots1.txt +0 -0
data/CHANGELOG
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.0
|
1
|
+
0.9.0
|
data/lib/robots.rb
CHANGED
@@ -12,15 +12,7 @@ class Robots
|
|
12
12
|
def initialize(uri, user_agent)
|
13
13
|
@last_accessed = Time.at(1)
|
14
14
|
|
15
|
-
io =
|
16
|
-
begin
|
17
|
-
Timeout::timeout(Robots.timeout) do
|
18
|
-
io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
19
|
-
end
|
20
|
-
rescue Timeout::Error
|
21
|
-
STDERR.puts "robots.txt request timed out"
|
22
|
-
end
|
23
|
-
|
15
|
+
io = Robots.get_robots_txt(uri)
|
24
16
|
|
25
17
|
if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
|
26
18
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
@@ -99,12 +91,23 @@ class Robots
|
|
99
91
|
protected
|
100
92
|
|
101
93
|
def to_regex(pattern)
|
94
|
+
return /should-not-match-anything-123456789/ if pattern.strip.empty?
|
102
95
|
pattern = Regexp.escape(pattern)
|
103
96
|
pattern.gsub!(Regexp.escape("*"), ".*")
|
104
97
|
Regexp.compile("^#{pattern}")
|
105
98
|
end
|
106
99
|
end
|
107
100
|
|
101
|
+
def self.get_robots_txt(uri)
|
102
|
+
begin
|
103
|
+
Timeout::timeout(Robots.timeout) do
|
104
|
+
io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
105
|
+
end
|
106
|
+
rescue Timeout::Error
|
107
|
+
STDERR.puts "robots.txt request timed out"
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
108
111
|
def self.timeout=(t)
|
109
112
|
@timeout = t
|
110
113
|
end
|
data/robots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{robots}
|
8
|
-
s.version = "0.8.0"
|
8
|
+
s.version = "0.9.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Kyle Maxwell"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-05-29}
|
13
13
|
s.description = %q{It parses robots.txt files}
|
14
14
|
s.email = %q{kyle@kylemaxwell.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -23,13 +23,17 @@ Gem::Specification.new do |s|
|
|
23
23
|
"VERSION",
|
24
24
|
"lib/robots.rb",
|
25
25
|
"robots.gemspec",
|
26
|
-
"test/fixtures/robots1.txt",
|
26
|
+
"test/fixtures/emptyish.txt",
|
27
|
+
"test/fixtures/eventbrite.txt",
|
28
|
+
"test/fixtures/google.txt",
|
29
|
+
"test/fixtures/reddit.txt",
|
30
|
+
"test/fixtures/yelp.txt",
|
27
31
|
"test/test_robots.rb"
|
28
32
|
]
|
29
33
|
s.homepage = %q{http://github.com/fizx/robots}
|
30
34
|
s.rdoc_options = ["--charset=UTF-8"]
|
31
35
|
s.require_paths = ["lib"]
|
32
|
-
s.rubygems_version = %q{1.3.
|
36
|
+
s.rubygems_version = %q{1.3.6}
|
33
37
|
s.summary = %q{Simple robots.txt parser}
|
34
38
|
s.test_files = [
|
35
39
|
"test/test_robots.rb"
|
@@ -0,0 +1,435 @@
|
|
1
|
+
# These entries assist in minimizing bandwidth usage caused
|
2
|
+
# by questionable robots spidering your site. Some of these
|
3
|
+
# robots or agents are used by web-stripping sofware.
|
4
|
+
# Please do not remove these entries, but feel free to add
|
5
|
+
# your own at the end of the list.
|
6
|
+
# If you have any questions regarding this file, please
|
7
|
+
# contact support@thinkhost.com
|
8
|
+
|
9
|
+
User-agent: *
|
10
|
+
Disallow: /rest/
|
11
|
+
Disallow: /xml/
|
12
|
+
Disallow: /json/
|
13
|
+
Disallow: /atom/
|
14
|
+
Disallow: /opml/
|
15
|
+
Disallow: /widget/
|
16
|
+
Disallow: /register
|
17
|
+
Disallow: /review
|
18
|
+
Disallow: /orderconfirmation
|
19
|
+
Disallow: /venues/
|
20
|
+
Disallow: /*?
|
21
|
+
|
22
|
+
Sitemap: http://www.eventbrite.com/sitemap_index.xml
|
23
|
+
|
24
|
+
User-agent: msnbot
|
25
|
+
Crawl-delay: 4
|
26
|
+
|
27
|
+
User-agent: Slurp
|
28
|
+
Crawl-delay: 4
|
29
|
+
|
30
|
+
User-agent: Balihoo
|
31
|
+
Disallow: /
|
32
|
+
|
33
|
+
User-agent: BotRightHere
|
34
|
+
Disallow: /
|
35
|
+
|
36
|
+
User-agent: WebZip
|
37
|
+
Disallow: /
|
38
|
+
|
39
|
+
User-agent: larbin
|
40
|
+
Disallow: /
|
41
|
+
|
42
|
+
User-agent: b2w/0.1
|
43
|
+
Disallow: /
|
44
|
+
|
45
|
+
User-agent: Copernic
|
46
|
+
Disallow: /
|
47
|
+
|
48
|
+
User-agent: psbot
|
49
|
+
Disallow: /
|
50
|
+
|
51
|
+
User-agent: Python-urllib
|
52
|
+
Disallow: /
|
53
|
+
|
54
|
+
User-agent: NetMechanic
|
55
|
+
Disallow: /
|
56
|
+
|
57
|
+
User-agent: URL_Spider_Pro
|
58
|
+
Disallow: /
|
59
|
+
|
60
|
+
User-agent: CherryPicker
|
61
|
+
Disallow: /
|
62
|
+
|
63
|
+
User-agent: EmailCollector
|
64
|
+
Disallow: /
|
65
|
+
|
66
|
+
User-agent: EmailSiphon
|
67
|
+
Disallow: /
|
68
|
+
|
69
|
+
User-agent: WebBandit
|
70
|
+
Disallow: /
|
71
|
+
|
72
|
+
User-agent: EmailWolf
|
73
|
+
Disallow: /
|
74
|
+
|
75
|
+
User-agent: ExtractorPro
|
76
|
+
Disallow: /
|
77
|
+
|
78
|
+
User-agent: CopyRightCheck
|
79
|
+
Disallow: /
|
80
|
+
|
81
|
+
User-agent: Crescent
|
82
|
+
Disallow: /
|
83
|
+
|
84
|
+
User-agent: SiteSnagger
|
85
|
+
Disallow: /
|
86
|
+
|
87
|
+
User-agent: ProWebWalker
|
88
|
+
Disallow: /
|
89
|
+
|
90
|
+
User-agent: CheeseBot
|
91
|
+
Disallow: /
|
92
|
+
|
93
|
+
User-agent: LNSpiderguy
|
94
|
+
Disallow: /
|
95
|
+
|
96
|
+
User-agent: Alexibot
|
97
|
+
Disallow: /
|
98
|
+
|
99
|
+
User-agent: Teleport
|
100
|
+
Disallow: /
|
101
|
+
|
102
|
+
User-agent: TeleportPro
|
103
|
+
Disallow: /
|
104
|
+
|
105
|
+
User-agent: MIIxpc
|
106
|
+
Disallow: /
|
107
|
+
|
108
|
+
User-agent: Telesoft
|
109
|
+
Disallow: /
|
110
|
+
|
111
|
+
User-agent: Website Quester
|
112
|
+
Disallow: /
|
113
|
+
|
114
|
+
User-agent: WebZip
|
115
|
+
Disallow: /
|
116
|
+
|
117
|
+
User-agent: moget/2.1
|
118
|
+
Disallow: /
|
119
|
+
|
120
|
+
User-agent: WebZip/4.0
|
121
|
+
Disallow: /
|
122
|
+
|
123
|
+
User-agent: WebStripper
|
124
|
+
Disallow: /
|
125
|
+
|
126
|
+
User-agent: WebSauger
|
127
|
+
Disallow: /
|
128
|
+
|
129
|
+
User-agent: WebCopier
|
130
|
+
Disallow: /
|
131
|
+
|
132
|
+
User-agent: NetAnts
|
133
|
+
Disallow: /
|
134
|
+
|
135
|
+
User-agent: Mister PiX
|
136
|
+
Disallow: /
|
137
|
+
|
138
|
+
User-agent: WebAuto
|
139
|
+
Disallow: /
|
140
|
+
|
141
|
+
User-agent: TheNomad
|
142
|
+
Disallow: /
|
143
|
+
|
144
|
+
User-agent: WWW-Collector-E
|
145
|
+
Disallow: /
|
146
|
+
|
147
|
+
User-agent: RMA
|
148
|
+
Disallow: /
|
149
|
+
|
150
|
+
User-agent: libWeb/clsHTTP
|
151
|
+
Disallow: /
|
152
|
+
|
153
|
+
User-agent: asterias
|
154
|
+
Disallow: /
|
155
|
+
|
156
|
+
User-agent: httplib
|
157
|
+
Disallow: /
|
158
|
+
|
159
|
+
User-agent: turingos
|
160
|
+
Disallow: /
|
161
|
+
|
162
|
+
User-agent: spanner
|
163
|
+
Disallow: /
|
164
|
+
|
165
|
+
User-agent: InfoNaviRobot
|
166
|
+
Disallow: /
|
167
|
+
|
168
|
+
User-agent: Harvest/1.5
|
169
|
+
Disallow: /
|
170
|
+
|
171
|
+
User-agent: Bullseye/1.0
|
172
|
+
Disallow: /
|
173
|
+
|
174
|
+
User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)
|
175
|
+
Disallow: /
|
176
|
+
|
177
|
+
User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
|
178
|
+
Disallow: /
|
179
|
+
|
180
|
+
User-agent: CherryPickerSE/1.0
|
181
|
+
Disallow: /
|
182
|
+
|
183
|
+
User-agent: CherryPickerElite/1.0
|
184
|
+
Disallow: /
|
185
|
+
|
186
|
+
User-agent: WebBandit/3.50
|
187
|
+
Disallow: /
|
188
|
+
|
189
|
+
User-agent: NICErsPRO
|
190
|
+
Disallow: /
|
191
|
+
|
192
|
+
User-agent: Microsoft URL Control - 5.01.4511
|
193
|
+
Disallow: /
|
194
|
+
|
195
|
+
User-agent: DittoSpyder
|
196
|
+
Disallow: /
|
197
|
+
|
198
|
+
User-agent: Foobot
|
199
|
+
Disallow: /
|
200
|
+
|
201
|
+
User-agent: SpankBot
|
202
|
+
Disallow: /
|
203
|
+
|
204
|
+
User-agent: BotALot
|
205
|
+
Disallow: /
|
206
|
+
|
207
|
+
User-agent: lwp-trivial/1.34
|
208
|
+
Disallow: /
|
209
|
+
|
210
|
+
User-agent: lwp-trivial
|
211
|
+
Disallow: /
|
212
|
+
|
213
|
+
User-agent: BunnySlippers
|
214
|
+
Disallow: /
|
215
|
+
|
216
|
+
User-agent: Microsoft URL Control - 6.00.8169
|
217
|
+
Disallow: /
|
218
|
+
|
219
|
+
User-agent: URLy Warning
|
220
|
+
Disallow: /
|
221
|
+
|
222
|
+
User-agent: Wget/1.6
|
223
|
+
Disallow: /
|
224
|
+
|
225
|
+
User-agent: Wget/1.5.3
|
226
|
+
Disallow: /
|
227
|
+
|
228
|
+
User-agent: Wget
|
229
|
+
Disallow: /
|
230
|
+
|
231
|
+
User-agent: LinkWalker
|
232
|
+
Disallow: /
|
233
|
+
|
234
|
+
User-agent: cosmos
|
235
|
+
Disallow: /
|
236
|
+
|
237
|
+
User-agent: moget
|
238
|
+
Disallow: /
|
239
|
+
|
240
|
+
User-agent: hloader
|
241
|
+
Disallow: /
|
242
|
+
|
243
|
+
User-agent: humanlinks
|
244
|
+
Disallow: /
|
245
|
+
|
246
|
+
User-agent: LinkextractorPro
|
247
|
+
Disallow: /
|
248
|
+
|
249
|
+
User-agent: Offline Explorer
|
250
|
+
Disallow: /
|
251
|
+
|
252
|
+
User-agent: Mata Hari
|
253
|
+
Disallow: /
|
254
|
+
|
255
|
+
User-agent: LexiBot
|
256
|
+
Disallow: /
|
257
|
+
|
258
|
+
User-agent: Web Image Collector
|
259
|
+
Disallow: /
|
260
|
+
|
261
|
+
User-agent: The Intraformant
|
262
|
+
Disallow: /
|
263
|
+
|
264
|
+
User-agent: True_Robot/1.0
|
265
|
+
Disallow: /
|
266
|
+
|
267
|
+
User-agent: True_Robot
|
268
|
+
Disallow: /
|
269
|
+
|
270
|
+
User-agent: BlowFish/1.0
|
271
|
+
Disallow: /
|
272
|
+
|
273
|
+
User-agent: JennyBot
|
274
|
+
Disallow: /
|
275
|
+
|
276
|
+
User-agent: MIIxpc/4.2
|
277
|
+
Disallow: /
|
278
|
+
|
279
|
+
User-agent: BuiltBotTough
|
280
|
+
Disallow: /
|
281
|
+
|
282
|
+
User-agent: ProPowerBot/2.14
|
283
|
+
Disallow: /
|
284
|
+
|
285
|
+
User-agent: BackDoorBot/1.0
|
286
|
+
Disallow: /
|
287
|
+
|
288
|
+
User-agent: toCrawl/UrlDispatcher
|
289
|
+
Disallow: /
|
290
|
+
|
291
|
+
User-agent: WebEnhancer
|
292
|
+
Disallow: /
|
293
|
+
|
294
|
+
User-agent: suzuran
|
295
|
+
Disallow: /
|
296
|
+
|
297
|
+
User-agent: TightTwatBot
|
298
|
+
Disallow: /
|
299
|
+
|
300
|
+
User-agent: VCI WebViewer VCI WebViewer Win32
|
301
|
+
Disallow: /
|
302
|
+
|
303
|
+
User-agent: VCI
|
304
|
+
Disallow: /
|
305
|
+
|
306
|
+
User-agent: Szukacz/1.4
|
307
|
+
Disallow: /
|
308
|
+
|
309
|
+
User-agent: QueryN Metasearch
|
310
|
+
Disallow: /
|
311
|
+
|
312
|
+
User-agent: Openfind data gatherer
|
313
|
+
Disallow: /
|
314
|
+
|
315
|
+
User-agent: Openfind
|
316
|
+
Disallow: /
|
317
|
+
|
318
|
+
User-agent: Xenu's Link Sleuth 1.1c
|
319
|
+
Disallow: /
|
320
|
+
|
321
|
+
User-agent: Xenu's
|
322
|
+
Disallow: /
|
323
|
+
|
324
|
+
User-agent: Zeus
|
325
|
+
Disallow: /
|
326
|
+
|
327
|
+
User-agent: RepoMonkey Bait & Tackle/v1.01
|
328
|
+
Disallow: /
|
329
|
+
|
330
|
+
User-agent: RepoMonkey
|
331
|
+
Disallow: /
|
332
|
+
|
333
|
+
User-agent: Microsoft URL Control
|
334
|
+
Disallow: /
|
335
|
+
|
336
|
+
User-agent: Openbot
|
337
|
+
Disallow: /
|
338
|
+
|
339
|
+
User-agent: URL Control
|
340
|
+
Disallow: /
|
341
|
+
|
342
|
+
User-agent: Zeus Link Scout
|
343
|
+
Disallow: /
|
344
|
+
|
345
|
+
User-agent: Zeus 32297 Webster Pro V2.9 Win32
|
346
|
+
Disallow: /
|
347
|
+
|
348
|
+
User-agent: Webster Pro
|
349
|
+
Disallow: /
|
350
|
+
|
351
|
+
User-agent: EroCrawler
|
352
|
+
Disallow: /
|
353
|
+
|
354
|
+
User-agent: LinkScan/8.1a Unix
|
355
|
+
Disallow: /
|
356
|
+
|
357
|
+
User-agent: Keyword Density/0.9
|
358
|
+
Disallow: /
|
359
|
+
|
360
|
+
User-agent: Kenjin Spider
|
361
|
+
Disallow: /
|
362
|
+
|
363
|
+
User-agent: Iron33/1.0.2
|
364
|
+
Disallow: /
|
365
|
+
|
366
|
+
User-agent: Bookmark search tool
|
367
|
+
Disallow: /
|
368
|
+
|
369
|
+
User-agent: GetRight/4.2
|
370
|
+
Disallow: /
|
371
|
+
|
372
|
+
User-agent: FairAd Client
|
373
|
+
Disallow: /
|
374
|
+
|
375
|
+
User-agent: Gaisbot
|
376
|
+
Disallow: /
|
377
|
+
|
378
|
+
User-agent: Aqua_Products
|
379
|
+
Disallow: /
|
380
|
+
|
381
|
+
User-agent: Radiation Retriever 1.1
|
382
|
+
Disallow: /
|
383
|
+
|
384
|
+
User-agent: Flaming AttackBot
|
385
|
+
Disallow: /
|
386
|
+
|
387
|
+
User-agent: Oracle Ultra Search
|
388
|
+
Disallow: /
|
389
|
+
|
390
|
+
User-agent: MSIECrawler
|
391
|
+
Disallow: /
|
392
|
+
|
393
|
+
User-agent: PerMan
|
394
|
+
Disallow: /
|
395
|
+
|
396
|
+
User-agent: searchpreview
|
397
|
+
Disallow: /
|
398
|
+
|
399
|
+
User-agent: TurnitinBot
|
400
|
+
Disallow: /
|
401
|
+
|
402
|
+
User-agent: wget
|
403
|
+
Disallow: /
|
404
|
+
|
405
|
+
User-agent: ExtractorPro
|
406
|
+
Disallow: /
|
407
|
+
|
408
|
+
User-agent: WebZIP/4.21
|
409
|
+
Disallow: /
|
410
|
+
|
411
|
+
User-agent: WebZIP/5.0
|
412
|
+
Disallow: /
|
413
|
+
|
414
|
+
User-agent: HTTrack 3.0
|
415
|
+
Disallow: /
|
416
|
+
|
417
|
+
User-agent: TurnitinBot/1.5
|
418
|
+
Disallow: /
|
419
|
+
|
420
|
+
User-agent: WebCopier v3.2a
|
421
|
+
Disallow: /
|
422
|
+
|
423
|
+
User-agent: WebCapture 2.0
|
424
|
+
Disallow: /
|
425
|
+
|
426
|
+
User-agent: WebCopier v.2.2
|
427
|
+
Disallow: /
|
428
|
+
|
429
|
+
User-agent: Spinn3r
|
430
|
+
Disallow: /
|
431
|
+
|
432
|
+
User-agent: Tailrank
|
433
|
+
Disallow: /
|
434
|
+
|
435
|
+
Sitemap: http://www.eventbrite.com/sitemap_index.xml
|
@@ -0,0 +1,215 @@
|
|
1
|
+
User-agent: *
|
2
|
+
Disallow: /search
|
3
|
+
Disallow: /groups
|
4
|
+
Disallow: /images
|
5
|
+
Disallow: /catalogs
|
6
|
+
Disallow: /catalogues
|
7
|
+
Disallow: /news
|
8
|
+
Allow: /news/directory
|
9
|
+
Disallow: /nwshp
|
10
|
+
Disallow: /setnewsprefs?
|
11
|
+
Disallow: /index.html?
|
12
|
+
Disallow: /?
|
13
|
+
Disallow: /addurl/image?
|
14
|
+
Disallow: /pagead/
|
15
|
+
Disallow: /relpage/
|
16
|
+
Disallow: /relcontent
|
17
|
+
Disallow: /imgres
|
18
|
+
Disallow: /imglanding
|
19
|
+
Disallow: /keyword/
|
20
|
+
Disallow: /u/
|
21
|
+
Disallow: /univ/
|
22
|
+
Disallow: /cobrand
|
23
|
+
Disallow: /custom
|
24
|
+
Disallow: /advanced_group_search
|
25
|
+
Disallow: /googlesite
|
26
|
+
Disallow: /preferences
|
27
|
+
Disallow: /setprefs
|
28
|
+
Disallow: /swr
|
29
|
+
Disallow: /url
|
30
|
+
Disallow: /default
|
31
|
+
Disallow: /m?
|
32
|
+
Disallow: /m/?
|
33
|
+
Disallow: /m/blogs?
|
34
|
+
Disallow: /m/ig
|
35
|
+
Disallow: /m/images?
|
36
|
+
Disallow: /m/local?
|
37
|
+
Disallow: /m/movies?
|
38
|
+
Disallow: /m/news?
|
39
|
+
Disallow: /m/news/i?
|
40
|
+
Disallow: /m/place?
|
41
|
+
Disallow: /m/setnewsprefs?
|
42
|
+
Disallow: /m/search?
|
43
|
+
Disallow: /m/swmloptin?
|
44
|
+
Disallow: /m/trends
|
45
|
+
Disallow: /wml?
|
46
|
+
Disallow: /wml/?
|
47
|
+
Disallow: /wml/search?
|
48
|
+
Disallow: /xhtml?
|
49
|
+
Disallow: /xhtml/?
|
50
|
+
Disallow: /xhtml/search?
|
51
|
+
Disallow: /xml?
|
52
|
+
Disallow: /imode?
|
53
|
+
Disallow: /imode/?
|
54
|
+
Disallow: /imode/search?
|
55
|
+
Disallow: /jsky?
|
56
|
+
Disallow: /jsky/?
|
57
|
+
Disallow: /jsky/search?
|
58
|
+
Disallow: /pda?
|
59
|
+
Disallow: /pda/?
|
60
|
+
Disallow: /pda/search?
|
61
|
+
Disallow: /sprint_xhtml
|
62
|
+
Disallow: /sprint_wml
|
63
|
+
Disallow: /pqa
|
64
|
+
Disallow: /palm
|
65
|
+
Disallow: /gwt/
|
66
|
+
Disallow: /purchases
|
67
|
+
Disallow: /hws
|
68
|
+
Disallow: /bsd?
|
69
|
+
Disallow: /linux?
|
70
|
+
Disallow: /mac?
|
71
|
+
Disallow: /microsoft?
|
72
|
+
Disallow: /unclesam?
|
73
|
+
Disallow: /answers/search?q=
|
74
|
+
Disallow: /local?
|
75
|
+
Disallow: /local_url
|
76
|
+
Disallow: /froogle?
|
77
|
+
Disallow: /products?
|
78
|
+
Disallow: /products/
|
79
|
+
Disallow: /froogle_
|
80
|
+
Disallow: /product_
|
81
|
+
Disallow: /products_
|
82
|
+
Disallow: /print
|
83
|
+
Disallow: /books
|
84
|
+
Disallow: /bkshp?q=
|
85
|
+
Allow: /booksrightsholders
|
86
|
+
Disallow: /patents?
|
87
|
+
Disallow: /patents/
|
88
|
+
Allow: /patents/about
|
89
|
+
Disallow: /scholar
|
90
|
+
Disallow: /complete
|
91
|
+
Disallow: /sponsoredlinks
|
92
|
+
Disallow: /videosearch?
|
93
|
+
Disallow: /videopreview?
|
94
|
+
Disallow: /videoprograminfo?
|
95
|
+
Disallow: /maps?
|
96
|
+
Disallow: /mapstt?
|
97
|
+
Disallow: /mapslt?
|
98
|
+
Disallow: /maps/stk/
|
99
|
+
Disallow: /maps/br?
|
100
|
+
Disallow: /mapabcpoi?
|
101
|
+
Disallow: /maphp?
|
102
|
+
Disallow: /places/
|
103
|
+
Disallow: /maps/place
|
104
|
+
Disallow: /help/maps/streetview/partners/welcome/
|
105
|
+
Disallow: /lochp?
|
106
|
+
Disallow: /center
|
107
|
+
Disallow: /ie?
|
108
|
+
Disallow: /sms/demo?
|
109
|
+
Disallow: /katrina?
|
110
|
+
Disallow: /blogsearch?
|
111
|
+
Disallow: /blogsearch/
|
112
|
+
Disallow: /blogsearch_feeds
|
113
|
+
Disallow: /advanced_blog_search
|
114
|
+
Disallow: /reader/
|
115
|
+
Allow: /reader/play
|
116
|
+
Disallow: /uds/
|
117
|
+
Disallow: /chart?
|
118
|
+
Disallow: /transit?
|
119
|
+
Disallow: /mbd?
|
120
|
+
Disallow: /extern_js/
|
121
|
+
Disallow: /calendar/feeds/
|
122
|
+
Disallow: /calendar/ical/
|
123
|
+
Disallow: /cl2/feeds/
|
124
|
+
Disallow: /cl2/ical/
|
125
|
+
Disallow: /coop/directory
|
126
|
+
Disallow: /coop/manage
|
127
|
+
Disallow: /trends?
|
128
|
+
Disallow: /trends/music?
|
129
|
+
Disallow: /trends/hottrends?
|
130
|
+
Disallow: /trends/viz?
|
131
|
+
Disallow: /notebook/search?
|
132
|
+
Disallow: /musica
|
133
|
+
Disallow: /musicad
|
134
|
+
Disallow: /musicas
|
135
|
+
Disallow: /musicl
|
136
|
+
Disallow: /musics
|
137
|
+
Disallow: /musicsearch
|
138
|
+
Disallow: /musicsp
|
139
|
+
Disallow: /musiclp
|
140
|
+
Disallow: /browsersync
|
141
|
+
Disallow: /call
|
142
|
+
Disallow: /archivesearch?
|
143
|
+
Disallow: /archivesearch/url
|
144
|
+
Disallow: /archivesearch/advanced_search
|
145
|
+
Disallow: /base/reportbadoffer
|
146
|
+
Disallow: /urchin_test/
|
147
|
+
Disallow: /movies?
|
148
|
+
Disallow: /codesearch?
|
149
|
+
Disallow: /codesearch/feeds/search?
|
150
|
+
Disallow: /wapsearch?
|
151
|
+
Disallow: /safebrowsing
|
152
|
+
Allow: /safebrowsing/diagnostic
|
153
|
+
Allow: /safebrowsing/report_error/
|
154
|
+
Allow: /safebrowsing/report_phish/
|
155
|
+
Disallow: /reviews/search?
|
156
|
+
Disallow: /orkut/albums
|
157
|
+
Disallow: /jsapi
|
158
|
+
Disallow: /views?
|
159
|
+
Disallow: /c/
|
160
|
+
Disallow: /cbk
|
161
|
+
Disallow: /recharge/dashboard/car
|
162
|
+
Disallow: /recharge/dashboard/static/
|
163
|
+
Disallow: /translate_a/
|
164
|
+
Disallow: /translate_c
|
165
|
+
Disallow: /translate_f
|
166
|
+
Disallow: /translate_static/
|
167
|
+
Disallow: /translate_suggestion
|
168
|
+
Disallow: /profiles/me
|
169
|
+
Allow: /profiles
|
170
|
+
Disallow: /s2/profiles/me
|
171
|
+
Allow: /s2/profiles
|
172
|
+
Allow: /s2/photos
|
173
|
+
Allow: /s2/static
|
174
|
+
Disallow: /s2
|
175
|
+
Disallow: /transconsole/portal/
|
176
|
+
Disallow: /gcc/
|
177
|
+
Disallow: /aclk
|
178
|
+
Disallow: /cse?
|
179
|
+
Disallow: /cse/home
|
180
|
+
Disallow: /cse/panel
|
181
|
+
Disallow: /cse/manage
|
182
|
+
Disallow: /tbproxy/
|
183
|
+
Disallow: /comparisonads/
|
184
|
+
Disallow: /imesync/
|
185
|
+
Disallow: /shenghuo/search?
|
186
|
+
Disallow: /support/forum/search?
|
187
|
+
Disallow: /reviews/polls/
|
188
|
+
Disallow: /hosted/images/
|
189
|
+
Disallow: /ppob/?
|
190
|
+
Disallow: /ppob?
|
191
|
+
Disallow: /ig/add?
|
192
|
+
Disallow: /adwordsresellers
|
193
|
+
Disallow: /accounts/o8
|
194
|
+
Allow: /accounts/o8/id
|
195
|
+
Disallow: /topicsearch?q=
|
196
|
+
Disallow: /xfx7/
|
197
|
+
Disallow: /squared/api
|
198
|
+
Disallow: /squared/search
|
199
|
+
Disallow: /squared/table
|
200
|
+
Disallow: /toolkit/
|
201
|
+
Allow: /toolkit/*.html
|
202
|
+
Disallow: /qnasearch?
|
203
|
+
Disallow: /errors/
|
204
|
+
Disallow: /app/updates
|
205
|
+
Disallow: /sidewiki/entry/
|
206
|
+
Disallow: /quality_form?
|
207
|
+
Disallow: /labs/popgadget/search
|
208
|
+
Disallow: /buzz/post
|
209
|
+
Disallow: /compressiontest/
|
210
|
+
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
|
211
|
+
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
|
212
|
+
Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
|
213
|
+
Sitemap: http://www.google.com/sitemaps_webmasters.xml
|
214
|
+
Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
|
215
|
+
Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#
|
2
|
+
# 1. A robot may not injure a human being or, through inaction, allow a
|
3
|
+
# human being to come to harm.
|
4
|
+
#
|
5
|
+
# 2. A robot must obey orders given it by human beings except where such
|
6
|
+
# orders would conflict with the First Law.
|
7
|
+
#
|
8
|
+
# 3. A robot must protect its own existence as long as such protection
|
9
|
+
# does not conflict with the First or Second Law.
|
10
|
+
|
11
|
+
User-agent: *
|
12
|
+
Disallow: /advertise?
|
13
|
+
Disallow: /biz_share?
|
14
|
+
Disallow: /biz_attribute
|
15
|
+
Disallow: /biz_link
|
16
|
+
Disallow: /biz_update
|
17
|
+
Disallow: /bookmark?
|
18
|
+
Disallow: /flag_content?
|
19
|
+
Disallow: /invite_friends_service?
|
20
|
+
Disallow: /login?
|
21
|
+
Disallow: /mail?
|
22
|
+
Disallow: /map?
|
23
|
+
Disallow: /redir?
|
24
|
+
Disallow: /writeareview
|
25
|
+
Disallow: /signup?
|
26
|
+
Disallow: /talk/new_topic
|
27
|
+
Disallow: /thanx?
|
28
|
+
Disallow: /user_favorites?
|
29
|
+
Disallow: /weekly/signup
|
30
|
+
Disallow: /elite?
|
31
|
+
Disallow: /member_search_results
|
32
|
+
Disallow: /advertise?
|
33
|
+
Disallow: /syndicate/
|
34
|
+
Disallow: /filtered_reviews
|
35
|
+
Disallow: /language/update?
|
36
|
+
|
37
|
+
User-agent: Fasterfox
|
38
|
+
Disallow: /
|
39
|
+
|
40
|
+
User-agent: Nutch
|
41
|
+
Disallow: /
|
42
|
+
|
43
|
+
User-agent: spock
|
44
|
+
Disallow: /
|
45
|
+
|
46
|
+
User-agent: OmniExplorer_Bot
|
47
|
+
Disallow: /
|
48
|
+
|
49
|
+
User-agent: MJ12bot
|
50
|
+
Disallow: /
|
51
|
+
|
52
|
+
User-agent: TurnitinBot
|
53
|
+
Disallow: /
|
54
|
+
|
55
|
+
User-agent: BecomeBot
|
56
|
+
Disallow: /
|
57
|
+
|
58
|
+
User-agent: genieBot
|
59
|
+
Disallow: /
|
60
|
+
|
61
|
+
User-agent: dotbot
|
62
|
+
Disallow: /
|
63
|
+
|
64
|
+
User-agent: MLBot
|
65
|
+
Disallow: /
|
66
|
+
|
67
|
+
User-agent: 80bot
|
68
|
+
Disallow: /
|
data/test/test_robots.rb
CHANGED
@@ -2,44 +2,75 @@
|
|
2
2
|
require "test/unit"
|
3
3
|
require File.dirname(__FILE__) + "/../lib/robots"
|
4
4
|
|
5
|
-
module
|
6
|
-
|
7
|
-
|
8
|
-
def set_open(key, value)
|
9
|
-
@fake_open_values ||= {}
|
10
|
-
@fake_open_values[key] = value
|
5
|
+
module FakeHttp
|
6
|
+
def content_type
|
7
|
+
"text/plain"
|
11
8
|
end
|
12
9
|
|
13
|
-
def
|
14
|
-
|
15
|
-
@fake_open_values[args.first] || open_old(*args)
|
10
|
+
def status
|
11
|
+
["200", "OK"]
|
16
12
|
end
|
17
13
|
end
|
18
14
|
|
19
15
|
class TestRobots < Test::Unit::TestCase
|
20
16
|
def setup
|
17
|
+
def Robots.get_robots_txt(uri)
|
18
|
+
fixture_file = File.dirname(__FILE__) + "/fixtures/" + uri.host.split(".")[-2] + ".txt"
|
19
|
+
File.open(fixture_file).extend(FakeHttp)
|
20
|
+
end
|
21
|
+
|
21
22
|
@robots = Robots.new "Ruby-Robot.txt Parser Test Script"
|
22
23
|
end
|
23
24
|
|
24
25
|
def test_allowed_if_no_robots
|
25
|
-
|
26
|
+
def Robots.get_robots_txt(uri)
|
27
|
+
return nil
|
28
|
+
end
|
29
|
+
|
30
|
+
assert_allowed("somesite", "/")
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_disallow_nothing
|
34
|
+
assert_allowed("emptyish", "/")
|
35
|
+
assert_allowed("emptyish", "/foo")
|
26
36
|
end
|
27
37
|
|
28
38
|
def test_reddit
|
29
|
-
|
39
|
+
assert_allowed("reddit", "/")
|
30
40
|
end
|
31
41
|
|
32
42
|
def test_other
|
33
|
-
|
34
|
-
|
43
|
+
assert_allowed("yelp", "/foo")
|
44
|
+
assert_disallowed("yelp", "/mail?foo=bar")
|
35
45
|
end
|
36
46
|
|
37
47
|
def test_site_with_disallowed
|
38
|
-
|
48
|
+
assert_allowed("google", "/")
|
39
49
|
end
|
40
50
|
|
41
51
|
def test_other_values
|
42
52
|
sitemap = {"Sitemap" => ["http://www.eventbrite.com/sitemap_index.xml", "http://www.eventbrite.com/sitemap_index.xml"]}
|
43
|
-
|
53
|
+
assert_other_equals("eventbrite", sitemap)
|
54
|
+
end
|
55
|
+
|
56
|
+
def assert_other_equals(name, value)
|
57
|
+
assert_equal(value, @robots.other_values(uri_for_name(name, "/")))
|
58
|
+
end
|
59
|
+
|
60
|
+
def assert_allowed(name, path)
|
61
|
+
assert_allowed_equals(name, path, true)
|
62
|
+
end
|
63
|
+
|
64
|
+
def assert_disallowed(name, path)
|
65
|
+
assert_allowed_equals(name, path, false)
|
66
|
+
end
|
67
|
+
|
68
|
+
def assert_allowed_equals(name, path, value)
|
69
|
+
assert_equal(value, @robots.allowed?(uri_for_name(name, path)), @robots.inspect)
|
70
|
+
end
|
71
|
+
|
72
|
+
def uri_for_name(name, path)
|
73
|
+
uri = name.nil? ? nil : "http://www.#{name}.com#{path}"
|
44
74
|
end
|
75
|
+
|
45
76
|
end
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: robots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 9
|
8
|
+
- 0
|
9
|
+
version: 0.9.0
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Kyle Maxwell
|
@@ -9,19 +14,21 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date: 2010-
|
17
|
+
date: 2010-05-29 00:00:00 -07:00
|
13
18
|
default_executable:
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: thoughtbot-shoulda
|
17
|
-
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
24
|
requirements:
|
21
25
|
- - ">="
|
22
26
|
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
23
29
|
version: "0"
|
24
|
-
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
25
32
|
description: It parses robots.txt files
|
26
33
|
email: kyle@kylemaxwell.com
|
27
34
|
executables: []
|
@@ -38,7 +45,11 @@ files:
|
|
38
45
|
- VERSION
|
39
46
|
- lib/robots.rb
|
40
47
|
- robots.gemspec
|
41
|
-
- test/fixtures/robots1.txt
|
48
|
+
- test/fixtures/emptyish.txt
|
49
|
+
- test/fixtures/eventbrite.txt
|
50
|
+
- test/fixtures/google.txt
|
51
|
+
- test/fixtures/reddit.txt
|
52
|
+
- test/fixtures/yelp.txt
|
42
53
|
- test/test_robots.rb
|
43
54
|
has_rdoc: true
|
44
55
|
homepage: http://github.com/fizx/robots
|
@@ -53,18 +64,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
53
64
|
requirements:
|
54
65
|
- - ">="
|
55
66
|
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
56
69
|
version: "0"
|
57
|
-
version:
|
58
70
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
71
|
requirements:
|
60
72
|
- - ">="
|
61
73
|
- !ruby/object:Gem::Version
|
74
|
+
segments:
|
75
|
+
- 0
|
62
76
|
version: "0"
|
63
|
-
version:
|
64
77
|
requirements: []
|
65
78
|
|
66
79
|
rubyforge_project:
|
67
|
-
rubygems_version: 1.3.
|
80
|
+
rubygems_version: 1.3.6
|
68
81
|
signing_key:
|
69
82
|
specification_version: 3
|
70
83
|
summary: Simple robots.txt parser
|
data/test/fixtures/robots1.txt
DELETED
File without changes
|