robots 0.8.0 → 0.9.0

data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
+0.9.0
+- Fix http://github.com/fizx/robots/issues#issue/1
+- Tests don't rely on network.
 0.8.0
 - Add multiple values from robots.txt (via joost)
 0.7.3
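
For orientation, the public surface this gem exposes is small. A minimal usage sketch (the bot name and URL are hypothetical; Robots.new, allowed?, and other_values are the calls the test suite below exercises):

    require "robots"

    # The constructor takes the User-Agent string to send when fetching robots.txt.
    robots = Robots.new("MyBot/1.0")

    robots.allowed?("http://www.example.com/search")  # => true or false, per the site's rules
    robots.other_values("http://www.example.com/")    # => non-allow/disallow keys, e.g. {"Sitemap" => [...]}
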
data/VERSION CHANGED
@@ -1 +1 @@
-0.8.0
+0.9.0
data/lib/robots.rb CHANGED
@@ -12,15 +12,7 @@ class Robots
     def initialize(uri, user_agent)
       @last_accessed = Time.at(1)

-      io = nil
-      begin
-        Timeout::timeout(Robots.timeout) do
-          io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
-        end
-      rescue Timeout::Error
-        STDERR.puts "robots.txt request timed out"
-      end
-
+      io = Robots.get_robots_txt(uri, user_agent)

       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
         io = StringIO.new("User-agent: *\nAllow: /\n")
@@ -99,12 +91,23 @@ class Robots
     protected

     def to_regex(pattern)
+      return /should-not-match-anything-123456789/ if pattern.strip.empty?
       pattern = Regexp.escape(pattern)
       pattern.gsub!(Regexp.escape("*"), ".*")
       Regexp.compile("^#{pattern}")
     end
   end

+  def self.get_robots_txt(uri, user_agent)
+    begin
+      Timeout::timeout(Robots.timeout) do
+        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+      end
+    rescue Timeout::Error
+      STDERR.puts "robots.txt request timed out"
+    end
+  end
+
   def self.timeout=(t)
     @timeout = t
   end
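
The new to_regex guard is the issue #1 fix: a bare "Disallow:" line (empty value, as in the emptyish fixture added below) reaches to_regex as an empty pattern, and under 0.8.0 that compiled to a regex matching every path. A plain-Ruby trace of the old behavior:

    pattern = Regexp.escape("")              # => ""
    regex   = Regexp.compile("^#{pattern}")  # => /^/
    regex =~ "/any/path"                     # => 0 -- matched everything, so every URL looked disallowed

With the guard, an empty pattern instead yields a regex that should never match, so an empty Disallow correctly allows everything.
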
data/robots.gemspec CHANGED
@@ -5,11 +5,11 @@

 Gem::Specification.new do |s|
   s.name = %q{robots}
-  s.version = "0.8.0"
+  s.version = "0.9.0"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Kyle Maxwell"]
-  s.date = %q{2010-02-08}
+  s.date = %q{2010-05-29}
   s.description = %q{It parses robots.txt files}
   s.email = %q{kyle@kylemaxwell.com}
   s.extra_rdoc_files = [
@@ -23,13 +23,17 @@ Gem::Specification.new do |s|
     "VERSION",
     "lib/robots.rb",
     "robots.gemspec",
-    "test/fixtures/robots1.txt",
+    "test/fixtures/emptyish.txt",
+    "test/fixtures/eventbrite.txt",
+    "test/fixtures/google.txt",
+    "test/fixtures/reddit.txt",
+    "test/fixtures/yelp.txt",
     "test/test_robots.rb"
   ]
   s.homepage = %q{http://github.com/fizx/robots}
   s.rdoc_options = ["--charset=UTF-8"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.3.5}
+  s.rubygems_version = %q{1.3.6}
   s.summary = %q{Simple robots.txt parser}
   s.test_files = [
     "test/test_robots.rb"
data/test/fixtures/emptyish.txt ADDED
@@ -0,0 +1,2 @@
+User-agent: *
+Disallow:
data/test/fixtures/eventbrite.txt ADDED
@@ -0,0 +1,435 @@
+# These entries assist in minimizing bandwidth usage caused
+# by questionable robots spidering your site. Some of these
+# robots or agents are used by web-stripping software.
+# Please do not remove these entries, but feel free to add
+# your own at the end of the list.
+# If you have any questions regarding this file, please
+# contact support@thinkhost.com
+
+User-agent: *
+Disallow: /rest/
+Disallow: /xml/
+Disallow: /json/
+Disallow: /atom/
+Disallow: /opml/
+Disallow: /widget/
+Disallow: /register
+Disallow: /review
+Disallow: /orderconfirmation
+Disallow: /venues/
+Disallow: /*?
+
+Sitemap: http://www.eventbrite.com/sitemap_index.xml
+
+User-agent: msnbot
+Crawl-delay: 4
+
+User-agent: Slurp
+Crawl-delay: 4
+
+User-agent: Balihoo
+Disallow: /
+
+User-agent: BotRightHere
+Disallow: /
+
+User-agent: WebZip
+Disallow: /
+
+User-agent: larbin
+Disallow: /
+
+User-agent: b2w/0.1
+Disallow: /
+
+User-agent: Copernic
+Disallow: /
+
+User-agent: psbot
+Disallow: /
+
+User-agent: Python-urllib
+Disallow: /
+
+User-agent: NetMechanic
+Disallow: /
+
+User-agent: URL_Spider_Pro
+Disallow: /
+
+User-agent: CherryPicker
+Disallow: /
+
+User-agent: EmailCollector
+Disallow: /
+
+User-agent: EmailSiphon
+Disallow: /
+
+User-agent: WebBandit
+Disallow: /
+
+User-agent: EmailWolf
+Disallow: /
+
+User-agent: ExtractorPro
+Disallow: /
+
+User-agent: CopyRightCheck
+Disallow: /
+
+User-agent: Crescent
+Disallow: /
+
+User-agent: SiteSnagger
+Disallow: /
+
+User-agent: ProWebWalker
+Disallow: /
+
+User-agent: CheeseBot
+Disallow: /
+
+User-agent: LNSpiderguy
+Disallow: /
+
+User-agent: Alexibot
+Disallow: /
+
+User-agent: Teleport
+Disallow: /
+
+User-agent: TeleportPro
+Disallow: /
+
+User-agent: MIIxpc
+Disallow: /
+
+User-agent: Telesoft
+Disallow: /
+
+User-agent: Website Quester
+Disallow: /
+
+User-agent: WebZip
+Disallow: /
+
+User-agent: moget/2.1
+Disallow: /
+
+User-agent: WebZip/4.0
+Disallow: /
+
+User-agent: WebStripper
+Disallow: /
+
+User-agent: WebSauger
+Disallow: /
+
+User-agent: WebCopier
+Disallow: /
+
+User-agent: NetAnts
+Disallow: /
+
+User-agent: Mister PiX
+Disallow: /
+
+User-agent: WebAuto
+Disallow: /
+
+User-agent: TheNomad
+Disallow: /
+
+User-agent: WWW-Collector-E
+Disallow: /
+
+User-agent: RMA
+Disallow: /
+
+User-agent: libWeb/clsHTTP
+Disallow: /
+
+User-agent: asterias
+Disallow: /
+
+User-agent: httplib
+Disallow: /
+
+User-agent: turingos
+Disallow: /
+
+User-agent: spanner
+Disallow: /
+
+User-agent: InfoNaviRobot
+Disallow: /
+
+User-agent: Harvest/1.5
+Disallow: /
+
+User-agent: Bullseye/1.0
+Disallow: /
+
+User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)
+Disallow: /
+
+User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
+Disallow: /
+
+User-agent: CherryPickerSE/1.0
+Disallow: /
+
+User-agent: CherryPickerElite/1.0
+Disallow: /
+
+User-agent: WebBandit/3.50
+Disallow: /
+
+User-agent: NICErsPRO
+Disallow: /
+
+User-agent: Microsoft URL Control - 5.01.4511
+Disallow: /
+
+User-agent: DittoSpyder
+Disallow: /
+
+User-agent: Foobot
+Disallow: /
+
+User-agent: SpankBot
+Disallow: /
+
+User-agent: BotALot
+Disallow: /
+
+User-agent: lwp-trivial/1.34
+Disallow: /
+
+User-agent: lwp-trivial
+Disallow: /
+
+User-agent: BunnySlippers
+Disallow: /
+
+User-agent: Microsoft URL Control - 6.00.8169
+Disallow: /
+
+User-agent: URLy Warning
+Disallow: /
+
+User-agent: Wget/1.6
+Disallow: /
+
+User-agent: Wget/1.5.3
+Disallow: /
+
+User-agent: Wget
+Disallow: /
+
+User-agent: LinkWalker
+Disallow: /
+
+User-agent: cosmos
+Disallow: /
+
+User-agent: moget
+Disallow: /
+
+User-agent: hloader
+Disallow: /
+
+User-agent: humanlinks
+Disallow: /
+
+User-agent: LinkextractorPro
+Disallow: /
+
+User-agent: Offline Explorer
+Disallow: /
+
+User-agent: Mata Hari
+Disallow: /
+
+User-agent: LexiBot
+Disallow: /
+
+User-agent: Web Image Collector
+Disallow: /
+
+User-agent: The Intraformant
+Disallow: /
+
+User-agent: True_Robot/1.0
+Disallow: /
+
+User-agent: True_Robot
+Disallow: /
+
+User-agent: BlowFish/1.0
+Disallow: /
+
+User-agent: JennyBot
+Disallow: /
+
+User-agent: MIIxpc/4.2
+Disallow: /
+
+User-agent: BuiltBotTough
+Disallow: /
+
+User-agent: ProPowerBot/2.14
+Disallow: /
+
+User-agent: BackDoorBot/1.0
+Disallow: /
+
+User-agent: toCrawl/UrlDispatcher
+Disallow: /
+
+User-agent: WebEnhancer
+Disallow: /
+
+User-agent: suzuran
+Disallow: /
+
+User-agent: TightTwatBot
+Disallow: /
+
+User-agent: VCI WebViewer VCI WebViewer Win32
+Disallow: /
+
+User-agent: VCI
+Disallow: /
+
+User-agent: Szukacz/1.4
+Disallow: /
+
+User-agent: QueryN Metasearch
+Disallow: /
+
+User-agent: Openfind data gatherer
+Disallow: /
+
+User-agent: Openfind
+Disallow: /
+
+User-agent: Xenu's Link Sleuth 1.1c
+Disallow: /
+
+User-agent: Xenu's
+Disallow: /
+
+User-agent: Zeus
+Disallow: /
+
+User-agent: RepoMonkey Bait & Tackle/v1.01
+Disallow: /
+
+User-agent: RepoMonkey
+Disallow: /
+
+User-agent: Microsoft URL Control
+Disallow: /
+
+User-agent: Openbot
+Disallow: /
+
+User-agent: URL Control
+Disallow: /
+
+User-agent: Zeus Link Scout
+Disallow: /
+
+User-agent: Zeus 32297 Webster Pro V2.9 Win32
+Disallow: /
+
+User-agent: Webster Pro
+Disallow: /
+
+User-agent: EroCrawler
+Disallow: /
+
+User-agent: LinkScan/8.1a Unix
+Disallow: /
+
+User-agent: Keyword Density/0.9
+Disallow: /
+
+User-agent: Kenjin Spider
+Disallow: /
+
+User-agent: Iron33/1.0.2
+Disallow: /
+
+User-agent: Bookmark search tool
+Disallow: /
+
+User-agent: GetRight/4.2
+Disallow: /
+
+User-agent: FairAd Client
+Disallow: /
+
+User-agent: Gaisbot
+Disallow: /
+
+User-agent: Aqua_Products
+Disallow: /
+
+User-agent: Radiation Retriever 1.1
+Disallow: /
+
+User-agent: Flaming AttackBot
+Disallow: /
+
+User-agent: Oracle Ultra Search
+Disallow: /
+
+User-agent: MSIECrawler
+Disallow: /
+
+User-agent: PerMan
+Disallow: /
+
+User-agent: searchpreview
+Disallow: /
+
+User-agent: TurnitinBot
+Disallow: /
+
+User-agent: wget
+Disallow: /
+
+User-agent: ExtractorPro
+Disallow: /
+
+User-agent: WebZIP/4.21
+Disallow: /
+
+User-agent: WebZIP/5.0
+Disallow: /
+
+User-agent: HTTrack 3.0
+Disallow: /
+
+User-agent: TurnitinBot/1.5
+Disallow: /
+
+User-agent: WebCopier v3.2a
+Disallow: /
+
+User-agent: WebCapture 2.0
+Disallow: /
+
+User-agent: WebCopier v.2.2
+Disallow: /
+
+User-agent: Spinn3r
+Disallow: /
+
+User-agent: Tailrank
+Disallow: /
+
+Sitemap: http://www.eventbrite.com/sitemap_index.xml
data/test/fixtures/google.txt ADDED
@@ -0,0 +1,215 @@
+User-agent: *
+Disallow: /search
+Disallow: /groups
+Disallow: /images
+Disallow: /catalogs
+Disallow: /catalogues
+Disallow: /news
+Allow: /news/directory
+Disallow: /nwshp
+Disallow: /setnewsprefs?
+Disallow: /index.html?
+Disallow: /?
+Disallow: /addurl/image?
+Disallow: /pagead/
+Disallow: /relpage/
+Disallow: /relcontent
+Disallow: /imgres
+Disallow: /imglanding
+Disallow: /keyword/
+Disallow: /u/
+Disallow: /univ/
+Disallow: /cobrand
+Disallow: /custom
+Disallow: /advanced_group_search
+Disallow: /googlesite
+Disallow: /preferences
+Disallow: /setprefs
+Disallow: /swr
+Disallow: /url
+Disallow: /default
+Disallow: /m?
+Disallow: /m/?
+Disallow: /m/blogs?
+Disallow: /m/ig
+Disallow: /m/images?
+Disallow: /m/local?
+Disallow: /m/movies?
+Disallow: /m/news?
+Disallow: /m/news/i?
+Disallow: /m/place?
+Disallow: /m/setnewsprefs?
+Disallow: /m/search?
+Disallow: /m/swmloptin?
+Disallow: /m/trends
+Disallow: /wml?
+Disallow: /wml/?
+Disallow: /wml/search?
+Disallow: /xhtml?
+Disallow: /xhtml/?
+Disallow: /xhtml/search?
+Disallow: /xml?
+Disallow: /imode?
+Disallow: /imode/?
+Disallow: /imode/search?
+Disallow: /jsky?
+Disallow: /jsky/?
+Disallow: /jsky/search?
+Disallow: /pda?
+Disallow: /pda/?
+Disallow: /pda/search?
+Disallow: /sprint_xhtml
+Disallow: /sprint_wml
+Disallow: /pqa
+Disallow: /palm
+Disallow: /gwt/
+Disallow: /purchases
+Disallow: /hws
+Disallow: /bsd?
+Disallow: /linux?
+Disallow: /mac?
+Disallow: /microsoft?
+Disallow: /unclesam?
+Disallow: /answers/search?q=
+Disallow: /local?
+Disallow: /local_url
+Disallow: /froogle?
+Disallow: /products?
+Disallow: /products/
+Disallow: /froogle_
+Disallow: /product_
+Disallow: /products_
+Disallow: /print
+Disallow: /books
+Disallow: /bkshp?q=
+Allow: /booksrightsholders
+Disallow: /patents?
+Disallow: /patents/
+Allow: /patents/about
+Disallow: /scholar
+Disallow: /complete
+Disallow: /sponsoredlinks
+Disallow: /videosearch?
+Disallow: /videopreview?
+Disallow: /videoprograminfo?
+Disallow: /maps?
+Disallow: /mapstt?
+Disallow: /mapslt?
+Disallow: /maps/stk/
+Disallow: /maps/br?
+Disallow: /mapabcpoi?
+Disallow: /maphp?
+Disallow: /places/
+Disallow: /maps/place
+Disallow: /help/maps/streetview/partners/welcome/
+Disallow: /lochp?
+Disallow: /center
+Disallow: /ie?
+Disallow: /sms/demo?
+Disallow: /katrina?
+Disallow: /blogsearch?
+Disallow: /blogsearch/
+Disallow: /blogsearch_feeds
+Disallow: /advanced_blog_search
+Disallow: /reader/
+Allow: /reader/play
+Disallow: /uds/
+Disallow: /chart?
+Disallow: /transit?
+Disallow: /mbd?
+Disallow: /extern_js/
+Disallow: /calendar/feeds/
+Disallow: /calendar/ical/
+Disallow: /cl2/feeds/
+Disallow: /cl2/ical/
+Disallow: /coop/directory
+Disallow: /coop/manage
+Disallow: /trends?
+Disallow: /trends/music?
+Disallow: /trends/hottrends?
+Disallow: /trends/viz?
+Disallow: /notebook/search?
+Disallow: /musica
+Disallow: /musicad
+Disallow: /musicas
+Disallow: /musicl
+Disallow: /musics
+Disallow: /musicsearch
+Disallow: /musicsp
+Disallow: /musiclp
+Disallow: /browsersync
+Disallow: /call
+Disallow: /archivesearch?
+Disallow: /archivesearch/url
+Disallow: /archivesearch/advanced_search
+Disallow: /base/reportbadoffer
+Disallow: /urchin_test/
+Disallow: /movies?
+Disallow: /codesearch?
+Disallow: /codesearch/feeds/search?
+Disallow: /wapsearch?
+Disallow: /safebrowsing
+Allow: /safebrowsing/diagnostic
+Allow: /safebrowsing/report_error/
+Allow: /safebrowsing/report_phish/
+Disallow: /reviews/search?
+Disallow: /orkut/albums
+Disallow: /jsapi
+Disallow: /views?
+Disallow: /c/
+Disallow: /cbk
+Disallow: /recharge/dashboard/car
+Disallow: /recharge/dashboard/static/
+Disallow: /translate_a/
+Disallow: /translate_c
+Disallow: /translate_f
+Disallow: /translate_static/
+Disallow: /translate_suggestion
+Disallow: /profiles/me
+Allow: /profiles
+Disallow: /s2/profiles/me
+Allow: /s2/profiles
+Allow: /s2/photos
+Allow: /s2/static
+Disallow: /s2
+Disallow: /transconsole/portal/
+Disallow: /gcc/
+Disallow: /aclk
+Disallow: /cse?
+Disallow: /cse/home
+Disallow: /cse/panel
+Disallow: /cse/manage
+Disallow: /tbproxy/
+Disallow: /comparisonads/
+Disallow: /imesync/
+Disallow: /shenghuo/search?
+Disallow: /support/forum/search?
+Disallow: /reviews/polls/
+Disallow: /hosted/images/
+Disallow: /ppob/?
+Disallow: /ppob?
+Disallow: /ig/add?
+Disallow: /adwordsresellers
+Disallow: /accounts/o8
+Allow: /accounts/o8/id
+Disallow: /topicsearch?q=
+Disallow: /xfx7/
+Disallow: /squared/api
+Disallow: /squared/search
+Disallow: /squared/table
+Disallow: /toolkit/
+Allow: /toolkit/*.html
+Disallow: /qnasearch?
+Disallow: /errors/
+Disallow: /app/updates
+Disallow: /sidewiki/entry/
+Disallow: /quality_form?
+Disallow: /labs/popgadget/search
+Disallow: /buzz/post
+Disallow: /compressiontest/
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
+Sitemap: http://www.google.com/sitemaps_webmasters.xml
+Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
+Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
data/test/fixtures/reddit.txt ADDED
@@ -0,0 +1,14 @@
+# 80legs
+User-agent: 008
+Disallow: /
+
+User-Agent: *
+Disallow: /goto
+Disallow: /*after=
+Disallow: /*before=
+Disallow: /domain/*t=
+Disallow: /login
+Disallow: /reddits/search
+Disallow: /search
+Disallow: /r/*/search
+Allow: /
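
The reddit fixture exercises the wildcard path: to_regex (above) escapes the whole pattern, then rewrites the escaped "*" into ".*". An illustrative plain-Ruby trace for one of its rules:

    pattern = Regexp.escape("/*after=")      # => "/\\*after="
    pattern.gsub!(Regexp.escape("*"), ".*")  # => "/.*after="
    Regexp.compile("^#{pattern}")            # => /^\/.*after=/, which matches "/r/ruby?count=25&after=t3_abc"
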
data/test/fixtures/yelp.txt ADDED
@@ -0,0 +1,68 @@
+#
+# 1. A robot may not injure a human being or, through inaction, allow a
+# human being to come to harm.
+#
+# 2. A robot must obey orders given it by human beings except where such
+# orders would conflict with the First Law.
+#
+# 3. A robot must protect its own existence as long as such protection
+# does not conflict with the First or Second Law.
+
+User-agent: *
+Disallow: /advertise?
+Disallow: /biz_share?
+Disallow: /biz_attribute
+Disallow: /biz_link
+Disallow: /biz_update
+Disallow: /bookmark?
+Disallow: /flag_content?
+Disallow: /invite_friends_service?
+Disallow: /login?
+Disallow: /mail?
+Disallow: /map?
+Disallow: /redir?
+Disallow: /writeareview
+Disallow: /signup?
+Disallow: /talk/new_topic
+Disallow: /thanx?
+Disallow: /user_favorites?
+Disallow: /weekly/signup
+Disallow: /elite?
+Disallow: /member_search_results
+Disallow: /advertise?
+Disallow: /syndicate/
+Disallow: /filtered_reviews
+Disallow: /language/update?
+
+User-agent: Fasterfox
+Disallow: /
+
+User-agent: Nutch
+Disallow: /
+
+User-agent: spock
+Disallow: /
+
+User-agent: OmniExplorer_Bot
+Disallow: /
+
+User-agent: MJ12bot
+Disallow: /
+
+User-agent: TurnitinBot
+Disallow: /
+
+User-agent: BecomeBot
+Disallow: /
+
+User-agent: genieBot
+Disallow: /
+
+User-agent: dotbot
+Disallow: /
+
+User-agent: MLBot
+Disallow: /
+
+User-agent: 80bot
+Disallow: /
data/test/test_robots.rb CHANGED
@@ -2,44 +2,75 @@
 require "test/unit"
 require File.dirname(__FILE__) + "/../lib/robots"

-module Kernel
-  alias_method :open_old, :open
-
-  def set_open(key, value)
-    @fake_open_values ||= {}
-    @fake_open_values[key] = value
+module FakeHttp
+  def content_type
+    "text/plain"
   end

-  def open(*args)
-    @fake_open_values ||= {}
-    @fake_open_values[args.first] || open_old(*args)
+  def status
+    ["200", "OK"]
   end
 end

 class TestRobots < Test::Unit::TestCase
   def setup
+    def Robots.get_robots_txt(uri, user_agent)
+      fixture_file = File.dirname(__FILE__) + "/fixtures/" + uri.host.split(".")[-2] + ".txt"
+      File.open(fixture_file).extend(FakeHttp)
+    end
+
     @robots = Robots.new "Ruby-Robot.txt Parser Test Script"
   end

   def test_allowed_if_no_robots
-    assert @robots.allowed?("http://www.yahoo.com")
+    def Robots.get_robots_txt(uri, user_agent)
+      return nil
+    end
+
+    assert_allowed("somesite", "/")
+  end
+
+  def test_disallow_nothing
+    assert_allowed("emptyish", "/")
+    assert_allowed("emptyish", "/foo")
   end

   def test_reddit
-    assert @robots.allowed?("http://reddit.com")
+    assert_allowed("reddit", "/")
   end

   def test_other
-    assert @robots.allowed?("http://www.yelp.com/foo")
-    assert !@robots.allowed?("http://www.yelp.com/mail?foo=bar")
+    assert_allowed("yelp", "/foo")
+    assert_disallowed("yelp", "/mail?foo=bar")
   end

   def test_site_with_disallowed
-    assert @robots.allowed?("http://www.google.com/")
+    assert_allowed("google", "/")
   end

   def test_other_values
     sitemap = {"Sitemap" => ["http://www.eventbrite.com/sitemap_index.xml", "http://www.eventbrite.com/sitemap_index.xml"]}
-    assert_equal(sitemap, @robots.other_values("http://eventbrite.com"))
+    assert_other_equals("eventbrite", sitemap)
+  end
+
+  def assert_other_equals(name, value)
+    assert_equal(value, @robots.other_values(uri_for_name(name, "/")))
+  end
+
+  def assert_allowed(name, path)
+    assert_allowed_equals(name, path, true)
+  end
+
+  def assert_disallowed(name, path)
+    assert_allowed_equals(name, path, false)
+  end
+
+  def assert_allowed_equals(name, path, value)
+    assert_equal(value, @robots.allowed?(uri_for_name(name, path)), @robots.inspect)
+  end
+
+  def uri_for_name(name, path)
+    name.nil? ? nil : "http://www.#{name}.com#{path}"
   end
+
 end
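
The FakeHttp module works because the parser only duck-types the fetched object: it checks io.content_type and io.status (see the lib/robots.rb hunk above) before parsing. Any IO-like object extended with those two methods passes; a minimal sketch of the same trick outside the test suite:

    require "stringio"

    module FakeHttp
      def content_type; "text/plain"; end
      def status; ["200", "OK"]; end
    end

    # Looks like a successful text/plain HTTP response to the parser.
    io = StringIO.new("User-agent: *\nDisallow: /secret\n").extend(FakeHttp)
    io.content_type  # => "text/plain"
    io.status        # => ["200", "OK"]
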
metadata CHANGED
@@ -1,7 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: robots
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  prerelease: false
+  segments:
+  - 0
+  - 9
+  - 0
+  version: 0.9.0
 platform: ruby
 authors:
 - Kyle Maxwell
@@ -9,19 +14,21 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2010-02-08 00:00:00 -08:00
+date: 2010-05-29 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thoughtbot-shoulda
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :development
+  version_requirements: *id001
 description: It parses robots.txt files
 email: kyle@kylemaxwell.com
 executables: []
@@ -38,7 +45,11 @@ files:
 - VERSION
 - lib/robots.rb
 - robots.gemspec
-- test/fixtures/robots1.txt
+- test/fixtures/emptyish.txt
+- test/fixtures/eventbrite.txt
+- test/fixtures/google.txt
+- test/fixtures/reddit.txt
+- test/fixtures/yelp.txt
 - test/test_robots.rb
 has_rdoc: true
 homepage: http://github.com/fizx/robots
@@ -53,18 +64,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []

 rubyforge_project:
-rubygems_version: 1.3.5
+rubygems_version: 1.3.6
 signing_key:
 specification_version: 3
 summary: Simple robots.txt parser