robots 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 0.9.0
2
+ - Fix http://github.com/fizx/robots/issues#issue/1
3
+ - Tests don't rely on network.
1
4
  0.8.0
2
5
  - Add multiple values from robots.txt (via joost)
3
6
  0.7.3
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.0
1
+ 0.9.0
@@ -12,15 +12,7 @@ class Robots
12
12
  def initialize(uri, user_agent)
13
13
  @last_accessed = Time.at(1)
14
14
 
15
- io = nil
16
- begin
17
- Timeout::timeout(Robots.timeout) do
18
- io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
19
- end
20
- rescue Timeout::Error
21
- STDERR.puts "robots.txt request timed out"
22
- end
23
-
15
+ io = Robots.get_robots_txt(uri)
24
16
 
25
17
  if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
26
18
  io = StringIO.new("User-agent: *\nAllow: /\n")
@@ -99,12 +91,23 @@ class Robots
99
91
  protected
100
92
 
101
93
  def to_regex(pattern)
94
+ return /should-not-match-anything-123456789/ if pattern.strip.empty?
102
95
  pattern = Regexp.escape(pattern)
103
96
  pattern.gsub!(Regexp.escape("*"), ".*")
104
97
  Regexp.compile("^#{pattern}")
105
98
  end
106
99
  end
107
100
 
101
+ def self.get_robots_txt(uri)
102
+ begin
103
+ Timeout::timeout(Robots.timeout) do
104
+ io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
105
+ end
106
+ rescue Timeout::Error
107
+ STDERR.puts "robots.txt request timed out"
108
+ end
109
+ end
110
+
108
111
  def self.timeout=(t)
109
112
  @timeout = t
110
113
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{robots}
8
- s.version = "0.8.0"
8
+ s.version = "0.9.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Kyle Maxwell"]
12
- s.date = %q{2010-02-08}
12
+ s.date = %q{2010-05-29}
13
13
  s.description = %q{It parses robots.txt files}
14
14
  s.email = %q{kyle@kylemaxwell.com}
15
15
  s.extra_rdoc_files = [
@@ -23,13 +23,17 @@ Gem::Specification.new do |s|
23
23
  "VERSION",
24
24
  "lib/robots.rb",
25
25
  "robots.gemspec",
26
- "test/fixtures/robots1.txt",
26
+ "test/fixtures/emptyish.txt",
27
+ "test/fixtures/eventbrite.txt",
28
+ "test/fixtures/google.txt",
29
+ "test/fixtures/reddit.txt",
30
+ "test/fixtures/yelp.txt",
27
31
  "test/test_robots.rb"
28
32
  ]
29
33
  s.homepage = %q{http://github.com/fizx/robots}
30
34
  s.rdoc_options = ["--charset=UTF-8"]
31
35
  s.require_paths = ["lib"]
32
- s.rubygems_version = %q{1.3.5}
36
+ s.rubygems_version = %q{1.3.6}
33
37
  s.summary = %q{Simple robots.txt parser}
34
38
  s.test_files = [
35
39
  "test/test_robots.rb"
@@ -0,0 +1,2 @@
1
+ User-agent: *
2
+ Disallow:
@@ -0,0 +1,435 @@
1
+ # These entries assist in minimizing bandwidth usage caused
2
+ # by questionable robots spidering your site. Some of these
3
+ # robots or agents are used by web-stripping software.
4
+ # Please do not remove these entries, but feel free to add
5
+ # your own at the end of the list.
6
+ # If you have any questions regarding this file, please
7
+ # contact support@thinkhost.com
8
+
9
+ User-agent: *
10
+ Disallow: /rest/
11
+ Disallow: /xml/
12
+ Disallow: /json/
13
+ Disallow: /atom/
14
+ Disallow: /opml/
15
+ Disallow: /widget/
16
+ Disallow: /register
17
+ Disallow: /review
18
+ Disallow: /orderconfirmation
19
+ Disallow: /venues/
20
+ Disallow: /*?
21
+
22
+ Sitemap: http://www.eventbrite.com/sitemap_index.xml
23
+
24
+ User-agent: msnbot
25
+ Crawl-delay: 4
26
+
27
+ User-agent: Slurp
28
+ Crawl-delay: 4
29
+
30
+ User-agent: Balihoo
31
+ Disallow: /
32
+
33
+ User-agent: BotRightHere
34
+ Disallow: /
35
+
36
+ User-agent: WebZip
37
+ Disallow: /
38
+
39
+ User-agent: larbin
40
+ Disallow: /
41
+
42
+ User-agent: b2w/0.1
43
+ Disallow: /
44
+
45
+ User-agent: Copernic
46
+ Disallow: /
47
+
48
+ User-agent: psbot
49
+ Disallow: /
50
+
51
+ User-agent: Python-urllib
52
+ Disallow: /
53
+
54
+ User-agent: NetMechanic
55
+ Disallow: /
56
+
57
+ User-agent: URL_Spider_Pro
58
+ Disallow: /
59
+
60
+ User-agent: CherryPicker
61
+ Disallow: /
62
+
63
+ User-agent: EmailCollector
64
+ Disallow: /
65
+
66
+ User-agent: EmailSiphon
67
+ Disallow: /
68
+
69
+ User-agent: WebBandit
70
+ Disallow: /
71
+
72
+ User-agent: EmailWolf
73
+ Disallow: /
74
+
75
+ User-agent: ExtractorPro
76
+ Disallow: /
77
+
78
+ User-agent: CopyRightCheck
79
+ Disallow: /
80
+
81
+ User-agent: Crescent
82
+ Disallow: /
83
+
84
+ User-agent: SiteSnagger
85
+ Disallow: /
86
+
87
+ User-agent: ProWebWalker
88
+ Disallow: /
89
+
90
+ User-agent: CheeseBot
91
+ Disallow: /
92
+
93
+ User-agent: LNSpiderguy
94
+ Disallow: /
95
+
96
+ User-agent: Alexibot
97
+ Disallow: /
98
+
99
+ User-agent: Teleport
100
+ Disallow: /
101
+
102
+ User-agent: TeleportPro
103
+ Disallow: /
104
+
105
+ User-agent: MIIxpc
106
+ Disallow: /
107
+
108
+ User-agent: Telesoft
109
+ Disallow: /
110
+
111
+ User-agent: Website Quester
112
+ Disallow: /
113
+
114
+ User-agent: WebZip
115
+ Disallow: /
116
+
117
+ User-agent: moget/2.1
118
+ Disallow: /
119
+
120
+ User-agent: WebZip/4.0
121
+ Disallow: /
122
+
123
+ User-agent: WebStripper
124
+ Disallow: /
125
+
126
+ User-agent: WebSauger
127
+ Disallow: /
128
+
129
+ User-agent: WebCopier
130
+ Disallow: /
131
+
132
+ User-agent: NetAnts
133
+ Disallow: /
134
+
135
+ User-agent: Mister PiX
136
+ Disallow: /
137
+
138
+ User-agent: WebAuto
139
+ Disallow: /
140
+
141
+ User-agent: TheNomad
142
+ Disallow: /
143
+
144
+ User-agent: WWW-Collector-E
145
+ Disallow: /
146
+
147
+ User-agent: RMA
148
+ Disallow: /
149
+
150
+ User-agent: libWeb/clsHTTP
151
+ Disallow: /
152
+
153
+ User-agent: asterias
154
+ Disallow: /
155
+
156
+ User-agent: httplib
157
+ Disallow: /
158
+
159
+ User-agent: turingos
160
+ Disallow: /
161
+
162
+ User-agent: spanner
163
+ Disallow: /
164
+
165
+ User-agent: InfoNaviRobot
166
+ Disallow: /
167
+
168
+ User-agent: Harvest/1.5
169
+ Disallow: /
170
+
171
+ User-agent: Bullseye/1.0
172
+ Disallow: /
173
+
174
+ User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)
175
+ Disallow: /
176
+
177
+ User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
178
+ Disallow: /
179
+
180
+ User-agent: CherryPickerSE/1.0
181
+ Disallow: /
182
+
183
+ User-agent: CherryPickerElite/1.0
184
+ Disallow: /
185
+
186
+ User-agent: WebBandit/3.50
187
+ Disallow: /
188
+
189
+ User-agent: NICErsPRO
190
+ Disallow: /
191
+
192
+ User-agent: Microsoft URL Control - 5.01.4511
193
+ Disallow: /
194
+
195
+ User-agent: DittoSpyder
196
+ Disallow: /
197
+
198
+ User-agent: Foobot
199
+ Disallow: /
200
+
201
+ User-agent: SpankBot
202
+ Disallow: /
203
+
204
+ User-agent: BotALot
205
+ Disallow: /
206
+
207
+ User-agent: lwp-trivial/1.34
208
+ Disallow: /
209
+
210
+ User-agent: lwp-trivial
211
+ Disallow: /
212
+
213
+ User-agent: BunnySlippers
214
+ Disallow: /
215
+
216
+ User-agent: Microsoft URL Control - 6.00.8169
217
+ Disallow: /
218
+
219
+ User-agent: URLy Warning
220
+ Disallow: /
221
+
222
+ User-agent: Wget/1.6
223
+ Disallow: /
224
+
225
+ User-agent: Wget/1.5.3
226
+ Disallow: /
227
+
228
+ User-agent: Wget
229
+ Disallow: /
230
+
231
+ User-agent: LinkWalker
232
+ Disallow: /
233
+
234
+ User-agent: cosmos
235
+ Disallow: /
236
+
237
+ User-agent: moget
238
+ Disallow: /
239
+
240
+ User-agent: hloader
241
+ Disallow: /
242
+
243
+ User-agent: humanlinks
244
+ Disallow: /
245
+
246
+ User-agent: LinkextractorPro
247
+ Disallow: /
248
+
249
+ User-agent: Offline Explorer
250
+ Disallow: /
251
+
252
+ User-agent: Mata Hari
253
+ Disallow: /
254
+
255
+ User-agent: LexiBot
256
+ Disallow: /
257
+
258
+ User-agent: Web Image Collector
259
+ Disallow: /
260
+
261
+ User-agent: The Intraformant
262
+ Disallow: /
263
+
264
+ User-agent: True_Robot/1.0
265
+ Disallow: /
266
+
267
+ User-agent: True_Robot
268
+ Disallow: /
269
+
270
+ User-agent: BlowFish/1.0
271
+ Disallow: /
272
+
273
+ User-agent: JennyBot
274
+ Disallow: /
275
+
276
+ User-agent: MIIxpc/4.2
277
+ Disallow: /
278
+
279
+ User-agent: BuiltBotTough
280
+ Disallow: /
281
+
282
+ User-agent: ProPowerBot/2.14
283
+ Disallow: /
284
+
285
+ User-agent: BackDoorBot/1.0
286
+ Disallow: /
287
+
288
+ User-agent: toCrawl/UrlDispatcher
289
+ Disallow: /
290
+
291
+ User-agent: WebEnhancer
292
+ Disallow: /
293
+
294
+ User-agent: suzuran
295
+ Disallow: /
296
+
297
+ User-agent: TightTwatBot
298
+ Disallow: /
299
+
300
+ User-agent: VCI WebViewer VCI WebViewer Win32
301
+ Disallow: /
302
+
303
+ User-agent: VCI
304
+ Disallow: /
305
+
306
+ User-agent: Szukacz/1.4
307
+ Disallow: /
308
+
309
+ User-agent: QueryN Metasearch
310
+ Disallow: /
311
+
312
+ User-agent: Openfind data gatherer
313
+ Disallow: /
314
+
315
+ User-agent: Openfind
316
+ Disallow: /
317
+
318
+ User-agent: Xenu's Link Sleuth 1.1c
319
+ Disallow: /
320
+
321
+ User-agent: Xenu's
322
+ Disallow: /
323
+
324
+ User-agent: Zeus
325
+ Disallow: /
326
+
327
+ User-agent: RepoMonkey Bait & Tackle/v1.01
328
+ Disallow: /
329
+
330
+ User-agent: RepoMonkey
331
+ Disallow: /
332
+
333
+ User-agent: Microsoft URL Control
334
+ Disallow: /
335
+
336
+ User-agent: Openbot
337
+ Disallow: /
338
+
339
+ User-agent: URL Control
340
+ Disallow: /
341
+
342
+ User-agent: Zeus Link Scout
343
+ Disallow: /
344
+
345
+ User-agent: Zeus 32297 Webster Pro V2.9 Win32
346
+ Disallow: /
347
+
348
+ User-agent: Webster Pro
349
+ Disallow: /
350
+
351
+ User-agent: EroCrawler
352
+ Disallow: /
353
+
354
+ User-agent: LinkScan/8.1a Unix
355
+ Disallow: /
356
+
357
+ User-agent: Keyword Density/0.9
358
+ Disallow: /
359
+
360
+ User-agent: Kenjin Spider
361
+ Disallow: /
362
+
363
+ User-agent: Iron33/1.0.2
364
+ Disallow: /
365
+
366
+ User-agent: Bookmark search tool
367
+ Disallow: /
368
+
369
+ User-agent: GetRight/4.2
370
+ Disallow: /
371
+
372
+ User-agent: FairAd Client
373
+ Disallow: /
374
+
375
+ User-agent: Gaisbot
376
+ Disallow: /
377
+
378
+ User-agent: Aqua_Products
379
+ Disallow: /
380
+
381
+ User-agent: Radiation Retriever 1.1
382
+ Disallow: /
383
+
384
+ User-agent: Flaming AttackBot
385
+ Disallow: /
386
+
387
+ User-agent: Oracle Ultra Search
388
+ Disallow: /
389
+
390
+ User-agent: MSIECrawler
391
+ Disallow: /
392
+
393
+ User-agent: PerMan
394
+ Disallow: /
395
+
396
+ User-agent: searchpreview
397
+ Disallow: /
398
+
399
+ User-agent: TurnitinBot
400
+ Disallow: /
401
+
402
+ User-agent: wget
403
+ Disallow: /
404
+
405
+ User-agent: ExtractorPro
406
+ Disallow: /
407
+
408
+ User-agent: WebZIP/4.21
409
+ Disallow: /
410
+
411
+ User-agent: WebZIP/5.0
412
+ Disallow: /
413
+
414
+ User-agent: HTTrack 3.0
415
+ Disallow: /
416
+
417
+ User-agent: TurnitinBot/1.5
418
+ Disallow: /
419
+
420
+ User-agent: WebCopier v3.2a
421
+ Disallow: /
422
+
423
+ User-agent: WebCapture 2.0
424
+ Disallow: /
425
+
426
+ User-agent: WebCopier v.2.2
427
+ Disallow: /
428
+
429
+ User-agent: Spinn3r
430
+ Disallow: /
431
+
432
+ User-agent: Tailrank
433
+ Disallow: /
434
+
435
+ Sitemap: http://www.eventbrite.com/sitemap_index.xml
@@ -0,0 +1,215 @@
1
+ User-agent: *
2
+ Disallow: /search
3
+ Disallow: /groups
4
+ Disallow: /images
5
+ Disallow: /catalogs
6
+ Disallow: /catalogues
7
+ Disallow: /news
8
+ Allow: /news/directory
9
+ Disallow: /nwshp
10
+ Disallow: /setnewsprefs?
11
+ Disallow: /index.html?
12
+ Disallow: /?
13
+ Disallow: /addurl/image?
14
+ Disallow: /pagead/
15
+ Disallow: /relpage/
16
+ Disallow: /relcontent
17
+ Disallow: /imgres
18
+ Disallow: /imglanding
19
+ Disallow: /keyword/
20
+ Disallow: /u/
21
+ Disallow: /univ/
22
+ Disallow: /cobrand
23
+ Disallow: /custom
24
+ Disallow: /advanced_group_search
25
+ Disallow: /googlesite
26
+ Disallow: /preferences
27
+ Disallow: /setprefs
28
+ Disallow: /swr
29
+ Disallow: /url
30
+ Disallow: /default
31
+ Disallow: /m?
32
+ Disallow: /m/?
33
+ Disallow: /m/blogs?
34
+ Disallow: /m/ig
35
+ Disallow: /m/images?
36
+ Disallow: /m/local?
37
+ Disallow: /m/movies?
38
+ Disallow: /m/news?
39
+ Disallow: /m/news/i?
40
+ Disallow: /m/place?
41
+ Disallow: /m/setnewsprefs?
42
+ Disallow: /m/search?
43
+ Disallow: /m/swmloptin?
44
+ Disallow: /m/trends
45
+ Disallow: /wml?
46
+ Disallow: /wml/?
47
+ Disallow: /wml/search?
48
+ Disallow: /xhtml?
49
+ Disallow: /xhtml/?
50
+ Disallow: /xhtml/search?
51
+ Disallow: /xml?
52
+ Disallow: /imode?
53
+ Disallow: /imode/?
54
+ Disallow: /imode/search?
55
+ Disallow: /jsky?
56
+ Disallow: /jsky/?
57
+ Disallow: /jsky/search?
58
+ Disallow: /pda?
59
+ Disallow: /pda/?
60
+ Disallow: /pda/search?
61
+ Disallow: /sprint_xhtml
62
+ Disallow: /sprint_wml
63
+ Disallow: /pqa
64
+ Disallow: /palm
65
+ Disallow: /gwt/
66
+ Disallow: /purchases
67
+ Disallow: /hws
68
+ Disallow: /bsd?
69
+ Disallow: /linux?
70
+ Disallow: /mac?
71
+ Disallow: /microsoft?
72
+ Disallow: /unclesam?
73
+ Disallow: /answers/search?q=
74
+ Disallow: /local?
75
+ Disallow: /local_url
76
+ Disallow: /froogle?
77
+ Disallow: /products?
78
+ Disallow: /products/
79
+ Disallow: /froogle_
80
+ Disallow: /product_
81
+ Disallow: /products_
82
+ Disallow: /print
83
+ Disallow: /books
84
+ Disallow: /bkshp?q=
85
+ Allow: /booksrightsholders
86
+ Disallow: /patents?
87
+ Disallow: /patents/
88
+ Allow: /patents/about
89
+ Disallow: /scholar
90
+ Disallow: /complete
91
+ Disallow: /sponsoredlinks
92
+ Disallow: /videosearch?
93
+ Disallow: /videopreview?
94
+ Disallow: /videoprograminfo?
95
+ Disallow: /maps?
96
+ Disallow: /mapstt?
97
+ Disallow: /mapslt?
98
+ Disallow: /maps/stk/
99
+ Disallow: /maps/br?
100
+ Disallow: /mapabcpoi?
101
+ Disallow: /maphp?
102
+ Disallow: /places/
103
+ Disallow: /maps/place
104
+ Disallow: /help/maps/streetview/partners/welcome/
105
+ Disallow: /lochp?
106
+ Disallow: /center
107
+ Disallow: /ie?
108
+ Disallow: /sms/demo?
109
+ Disallow: /katrina?
110
+ Disallow: /blogsearch?
111
+ Disallow: /blogsearch/
112
+ Disallow: /blogsearch_feeds
113
+ Disallow: /advanced_blog_search
114
+ Disallow: /reader/
115
+ Allow: /reader/play
116
+ Disallow: /uds/
117
+ Disallow: /chart?
118
+ Disallow: /transit?
119
+ Disallow: /mbd?
120
+ Disallow: /extern_js/
121
+ Disallow: /calendar/feeds/
122
+ Disallow: /calendar/ical/
123
+ Disallow: /cl2/feeds/
124
+ Disallow: /cl2/ical/
125
+ Disallow: /coop/directory
126
+ Disallow: /coop/manage
127
+ Disallow: /trends?
128
+ Disallow: /trends/music?
129
+ Disallow: /trends/hottrends?
130
+ Disallow: /trends/viz?
131
+ Disallow: /notebook/search?
132
+ Disallow: /musica
133
+ Disallow: /musicad
134
+ Disallow: /musicas
135
+ Disallow: /musicl
136
+ Disallow: /musics
137
+ Disallow: /musicsearch
138
+ Disallow: /musicsp
139
+ Disallow: /musiclp
140
+ Disallow: /browsersync
141
+ Disallow: /call
142
+ Disallow: /archivesearch?
143
+ Disallow: /archivesearch/url
144
+ Disallow: /archivesearch/advanced_search
145
+ Disallow: /base/reportbadoffer
146
+ Disallow: /urchin_test/
147
+ Disallow: /movies?
148
+ Disallow: /codesearch?
149
+ Disallow: /codesearch/feeds/search?
150
+ Disallow: /wapsearch?
151
+ Disallow: /safebrowsing
152
+ Allow: /safebrowsing/diagnostic
153
+ Allow: /safebrowsing/report_error/
154
+ Allow: /safebrowsing/report_phish/
155
+ Disallow: /reviews/search?
156
+ Disallow: /orkut/albums
157
+ Disallow: /jsapi
158
+ Disallow: /views?
159
+ Disallow: /c/
160
+ Disallow: /cbk
161
+ Disallow: /recharge/dashboard/car
162
+ Disallow: /recharge/dashboard/static/
163
+ Disallow: /translate_a/
164
+ Disallow: /translate_c
165
+ Disallow: /translate_f
166
+ Disallow: /translate_static/
167
+ Disallow: /translate_suggestion
168
+ Disallow: /profiles/me
169
+ Allow: /profiles
170
+ Disallow: /s2/profiles/me
171
+ Allow: /s2/profiles
172
+ Allow: /s2/photos
173
+ Allow: /s2/static
174
+ Disallow: /s2
175
+ Disallow: /transconsole/portal/
176
+ Disallow: /gcc/
177
+ Disallow: /aclk
178
+ Disallow: /cse?
179
+ Disallow: /cse/home
180
+ Disallow: /cse/panel
181
+ Disallow: /cse/manage
182
+ Disallow: /tbproxy/
183
+ Disallow: /comparisonads/
184
+ Disallow: /imesync/
185
+ Disallow: /shenghuo/search?
186
+ Disallow: /support/forum/search?
187
+ Disallow: /reviews/polls/
188
+ Disallow: /hosted/images/
189
+ Disallow: /ppob/?
190
+ Disallow: /ppob?
191
+ Disallow: /ig/add?
192
+ Disallow: /adwordsresellers
193
+ Disallow: /accounts/o8
194
+ Allow: /accounts/o8/id
195
+ Disallow: /topicsearch?q=
196
+ Disallow: /xfx7/
197
+ Disallow: /squared/api
198
+ Disallow: /squared/search
199
+ Disallow: /squared/table
200
+ Disallow: /toolkit/
201
+ Allow: /toolkit/*.html
202
+ Disallow: /qnasearch?
203
+ Disallow: /errors/
204
+ Disallow: /app/updates
205
+ Disallow: /sidewiki/entry/
206
+ Disallow: /quality_form?
207
+ Disallow: /labs/popgadget/search
208
+ Disallow: /buzz/post
209
+ Disallow: /compressiontest/
210
+ Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
211
+ Sitemap: http://www.google.com/hostednews/sitemap_index.xml
212
+ Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
213
+ Sitemap: http://www.google.com/sitemaps_webmasters.xml
214
+ Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
215
+ Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
@@ -0,0 +1,14 @@
1
+ # 80legs
2
+ User-agent: 008
3
+ Disallow: /
4
+
5
+ User-Agent: *
6
+ Disallow: /goto
7
+ Disallow: /*after=
8
+ Disallow: /*before=
9
+ Disallow: /domain/*t=
10
+ Disallow: /login
11
+ Disallow: /reddits/search
12
+ Disallow: /search
13
+ Disallow: /r/*/search
14
+ Allow: /
@@ -0,0 +1,68 @@
1
+ #
2
+ # 1. A robot may not injure a human being or, through inaction, allow a
3
+ # human being to come to harm.
4
+ #
5
+ # 2. A robot must obey orders given it by human beings except where such
6
+ # orders would conflict with the First Law.
7
+ #
8
+ # 3. A robot must protect its own existence as long as such protection
9
+ # does not conflict with the First or Second Law.
10
+
11
+ User-agent: *
12
+ Disallow: /advertise?
13
+ Disallow: /biz_share?
14
+ Disallow: /biz_attribute
15
+ Disallow: /biz_link
16
+ Disallow: /biz_update
17
+ Disallow: /bookmark?
18
+ Disallow: /flag_content?
19
+ Disallow: /invite_friends_service?
20
+ Disallow: /login?
21
+ Disallow: /mail?
22
+ Disallow: /map?
23
+ Disallow: /redir?
24
+ Disallow: /writeareview
25
+ Disallow: /signup?
26
+ Disallow: /talk/new_topic
27
+ Disallow: /thanx?
28
+ Disallow: /user_favorites?
29
+ Disallow: /weekly/signup
30
+ Disallow: /elite?
31
+ Disallow: /member_search_results
32
+ Disallow: /advertise?
33
+ Disallow: /syndicate/
34
+ Disallow: /filtered_reviews
35
+ Disallow: /language/update?
36
+
37
+ User-agent: Fasterfox
38
+ Disallow: /
39
+
40
+ User-agent: Nutch
41
+ Disallow: /
42
+
43
+ User-agent: spock
44
+ Disallow: /
45
+
46
+ User-agent: OmniExplorer_Bot
47
+ Disallow: /
48
+
49
+ User-agent: MJ12bot
50
+ Disallow: /
51
+
52
+ User-agent: TurnitinBot
53
+ Disallow: /
54
+
55
+ User-agent: BecomeBot
56
+ Disallow: /
57
+
58
+ User-agent: genieBot
59
+ Disallow: /
60
+
61
+ User-agent: dotbot
62
+ Disallow: /
63
+
64
+ User-agent: MLBot
65
+ Disallow: /
66
+
67
+ User-agent: 80bot
68
+ Disallow: /
@@ -2,44 +2,75 @@
2
2
  require "test/unit"
3
3
  require File.dirname(__FILE__) + "/../lib/robots"
4
4
 
5
- module Kernel
6
- alias_method :open_old, :open
7
-
8
- def set_open(key, value)
9
- @fake_open_values ||= {}
10
- @fake_open_values[key] = value
5
+ module FakeHttp
6
+ def content_type
7
+ "text/plain"
11
8
  end
12
9
 
13
- def open(*args)
14
- @fake_open_values ||= {}
15
- @fake_open_values[args.first] || open_old(*args)
10
+ def status
11
+ ["200", "OK"]
16
12
  end
17
13
  end
18
14
 
19
15
  class TestRobots < Test::Unit::TestCase
20
16
  def setup
17
+ def Robots.get_robots_txt(uri)
18
+ fixture_file = File.dirname(__FILE__) + "/fixtures/" + uri.host.split(".")[-2] + ".txt"
19
+ File.open(fixture_file).extend(FakeHttp)
20
+ end
21
+
21
22
  @robots = Robots.new "Ruby-Robot.txt Parser Test Script"
22
23
  end
23
24
 
24
25
  def test_allowed_if_no_robots
25
- assert @robots.allowed?("http://www.yahoo.com")
26
+ def Robots.get_robots_txt(uri)
27
+ return nil
28
+ end
29
+
30
+ assert_allowed("somesite", "/")
31
+ end
32
+
33
+ def test_disallow_nothing
34
+ assert_allowed("emptyish", "/")
35
+ assert_allowed("emptyish", "/foo")
26
36
  end
27
37
 
28
38
  def test_reddit
29
- assert @robots.allowed?("http://reddit.com")
39
+ assert_allowed("reddit", "/")
30
40
  end
31
41
 
32
42
  def test_other
33
- assert @robots.allowed?("http://www.yelp.com/foo")
34
- assert !@robots.allowed?("http://www.yelp.com/mail?foo=bar")
43
+ assert_allowed("yelp", "/foo")
44
+ assert_disallowed("yelp", "/mail?foo=bar")
35
45
  end
36
46
 
37
47
  def test_site_with_disallowed
38
- assert @robots.allowed?("http://www.google.com/")
48
+ assert_allowed("google", "/")
39
49
  end
40
50
 
41
51
  def test_other_values
42
52
  sitemap = {"Sitemap" => ["http://www.eventbrite.com/sitemap_index.xml", "http://www.eventbrite.com/sitemap_index.xml"]}
43
- assert_equal(sitemap, @robots.other_values("http://eventbrite.com"))
53
+ assert_other_equals("eventbrite", sitemap)
54
+ end
55
+
56
+ def assert_other_equals(name, value)
57
+ assert_equal(value, @robots.other_values(uri_for_name(name, "/")))
58
+ end
59
+
60
+ def assert_allowed(name, path)
61
+ assert_allowed_equals(name, path, true)
62
+ end
63
+
64
+ def assert_disallowed(name, path)
65
+ assert_allowed_equals(name, path, false)
66
+ end
67
+
68
+ def assert_allowed_equals(name, path, value)
69
+ assert_equal(value, @robots.allowed?(uri_for_name(name, path)), @robots.inspect)
70
+ end
71
+
72
+ def uri_for_name(name, path)
73
+ uri = name.nil? ? nil : "http://www.#{name}.com#{path}"
44
74
  end
75
+
45
76
  end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: robots
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 9
8
+ - 0
9
+ version: 0.9.0
5
10
  platform: ruby
6
11
  authors:
7
12
  - Kyle Maxwell
@@ -9,19 +14,21 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2010-02-08 00:00:00 -08:00
17
+ date: 2010-05-29 00:00:00 -07:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: thoughtbot-shoulda
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
23
29
  version: "0"
24
- version:
30
+ type: :development
31
+ version_requirements: *id001
25
32
  description: It parses robots.txt files
26
33
  email: kyle@kylemaxwell.com
27
34
  executables: []
@@ -38,7 +45,11 @@ files:
38
45
  - VERSION
39
46
  - lib/robots.rb
40
47
  - robots.gemspec
41
- - test/fixtures/robots1.txt
48
+ - test/fixtures/emptyish.txt
49
+ - test/fixtures/eventbrite.txt
50
+ - test/fixtures/google.txt
51
+ - test/fixtures/reddit.txt
52
+ - test/fixtures/yelp.txt
42
53
  - test/test_robots.rb
43
54
  has_rdoc: true
44
55
  homepage: http://github.com/fizx/robots
@@ -53,18 +64,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
53
64
  requirements:
54
65
  - - ">="
55
66
  - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
56
69
  version: "0"
57
- version:
58
70
  required_rubygems_version: !ruby/object:Gem::Requirement
59
71
  requirements:
60
72
  - - ">="
61
73
  - !ruby/object:Gem::Version
74
+ segments:
75
+ - 0
62
76
  version: "0"
63
- version:
64
77
  requirements: []
65
78
 
66
79
  rubyforge_project:
67
- rubygems_version: 1.3.5
80
+ rubygems_version: 1.3.6
68
81
  signing_key:
69
82
  specification_version: 3
70
83
  summary: Simple robots.txt parser
File without changes