spidr 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,16 @@
1
+ === 0.1.6 / 2009-04-14
2
+
3
+ * Added Agent#failures, a list of URLs which could not be visited.
4
+ * Added Agent#failed?.
5
+ * Added Agent#every_failed_url.
6
+ * Added Agent#clear, which clears the history and failures URL lists.
7
+ * Improved fault tolerance in Agent#get_page.
8
+ * If a Network or HTTP error is encountered, the URL will be added to
9
+ the failures list and the next URL will be visited.
10
+ * Fixed a typo in Agent#ignore_exts_like.
11
+ * Updated the Web Spider Obstacle Course with links that always fail to be
12
+ visited.
13
+
1
14
  === 0.1.5 / 2009-03-22
2
15
 
3
16
  * Catch malformed URIs in Page#to_absolute and return +nil+.
data/README.txt CHANGED
@@ -21,6 +21,7 @@ and easy to use.
21
21
  * Every visited Page.
22
22
  * Every visited URL.
23
23
  * Every visited URL that matches a specified pattern.
24
+ * Every URL that failed to be visited.
24
25
  * Custom User-Agent strings.
25
26
  * Custom proxy settings.
26
27
 
data/lib/spidr/agent.rb CHANGED
@@ -23,7 +23,10 @@ module Spidr
23
23
  attr_accessor :delay
24
24
 
25
25
  # History containing visited URLs
26
- attr_accessor :history
26
+ attr_reader :history
27
+
28
+ # List of unreachable URLs
29
+ attr_reader :failures
27
30
 
28
31
  #
29
32
  # Creates a new Agent object with the given _options_ and _block_.
@@ -70,12 +73,14 @@ module Spidr
70
73
  )
71
74
 
72
75
  @every_url_blocks = []
76
+ @every_failed_url_blocks = []
73
77
  @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
74
78
 
75
79
  @every_page_blocks = []
76
80
 
77
81
  @delay = (options[:delay] || 0)
78
82
  @history = []
83
+ @failures = []
79
84
  @queue = []
80
85
 
81
86
  if options[:host]
@@ -287,7 +292,7 @@ module Spidr
287
292
  # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
288
293
  # it will be added to the ignore_exts.
289
294
  #
290
- def ignore_exts_like(&block)
295
+ def ignore_exts_like(pattern=nil,&block)
291
296
  if pattern
292
297
  ignore_exts << pattern
293
298
  elsif block
@@ -306,6 +311,15 @@ module Spidr
306
311
  return self
307
312
  end
308
313
 
314
+ #
315
+ # For every URL that the agent is unable to visit, it will be passed
316
+ # to the specified _block_.
317
+ #
318
+ def every_failed_url(&block)
319
+ @every_failed_url_blocks << block
320
+ return self
321
+ end
322
+
309
323
  #
310
324
  # For every URL that the agent visits and matches the specified
311
325
  # _pattern_, it will be passed to the specified _block_.
@@ -324,11 +338,21 @@ module Spidr
324
338
  return self
325
339
  end
326
340
 
341
+ #
342
+ # Clears the queue, history and failures lists of the agent.
343
+ #
344
+ def clear
345
+ @queue.clear
346
+ @history.clear
347
+ @failures.clear
348
+ return self
349
+ end
350
+
327
351
  #
328
352
  # Clear the history and start spidering at the specified _url_.
329
353
  #
330
354
  def start_at(url)
331
- @history.clear
355
+ clear
332
356
  return run(url)
333
357
  end
334
358
 
@@ -366,11 +390,23 @@ module Spidr
366
390
  # otherwise.
367
391
  #
368
392
  def visited?(url)
369
- if url.kind_of?(URI)
370
- return @history.include?(url)
371
- else
372
- return @history.include?(URI(url).to_s)
393
+ unless url.kind_of?(URI)
394
+ url = URI(url)
373
395
  end
396
+
397
+ return @history.include?(url)
398
+ end
399
+
400
+ #
401
+ # Returns +true+ if the specified _url_ was unable to be visited,
402
+ # returns +false+ otherwise.
403
+ #
404
+ def failed?(url)
405
+ unless url.kind_of?(URI)
406
+ url = URI(url)
407
+ end
408
+
409
+ return @failures.include?(url)
374
410
  end
375
411
 
376
412
  #
@@ -392,16 +428,21 @@ module Spidr
392
428
  proxy_user = @proxy[:user]
393
429
  proxy_password = @proxy[:password]
394
430
 
395
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
396
- headers = {}
431
+ begin
432
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
433
+ headers = {}
397
434
 
398
- headers['User-Agent'] = @user_agent if @user_agent
399
- headers['Referer'] = @referer if @referer
435
+ headers['User-Agent'] = @user_agent if @user_agent
436
+ headers['Referer'] = @referer if @referer
400
437
 
401
- new_page = Page.new(url,sess.get(path,headers))
438
+ new_page = Page.new(url,sess.get(path,headers))
402
439
 
403
- block.call(new_page) if block
404
- return new_page
440
+ block.call(new_page) if block
441
+ return new_page
442
+ end
443
+ rescue SystemCallError, Net::HTTPBadResponse
444
+ failed(url)
445
+ return nil
405
446
  end
406
447
  end
407
448
 
@@ -447,6 +488,50 @@ module Spidr
447
488
  @queue.shift
448
489
  end
449
490
 
491
+ #
492
+ # Returns +true+ if the specified _url_ should be visited, based on
493
+ # its scheme, returns +false+ otherwise.
494
+ #
495
+ def visit_scheme?(url)
496
+ if url.scheme
497
+ return SCHEMES.include?(url.scheme)
498
+ else
499
+ return true
500
+ end
501
+ end
502
+
503
+ #
504
+ # Returns +true+ if the specified _url_ should be visited, based on
505
+ # the host of the _url_, returns +false+ otherwise.
506
+ #
507
+ def visit_host?(url)
508
+ @host_rules.accept?(url.host)
509
+ end
510
+
511
+ #
512
+ # Returns +true+ if the specified _url_ should be visited, based on
513
+ # the port of the _url_, returns +false+ otherwise.
514
+ #
515
+ def visit_port?(url)
516
+ @port_rules.accept?(url.port)
517
+ end
518
+
519
+ #
520
+ # Returns +true+ if the specified _url_ should be visited, based on
521
+ # the pattern of the _url_, returns +false+ otherwise.
522
+ #
523
+ def visit_link?(url)
524
+ @link_rules.accept?(url.to_s)
525
+ end
526
+
527
+ #
528
+ # Returns +true+ if the specified _url_ should be visited, based on
529
+ # the file extension of the _url_, returns +false+ otherwise.
530
+ #
531
+ def visit_ext?(url)
532
+ @ext_rules.accept?(File.extname(url.path)[1..-1])
533
+ end
534
+
450
535
  #
451
536
  # Returns +true+ if the specified URL should be visited, returns
452
537
  # +false+ otherwise.
@@ -477,28 +562,17 @@ module Spidr
477
562
  end
478
563
  end
479
564
 
480
- def visit_scheme?(url)
481
- if url.scheme
482
- return SCHEMES.include?(url.scheme)
483
- else
484
- return true
565
+ #
566
+ # Adds the specified _url_ to the failures list.
567
+ #
568
+ def failed(url)
569
+ unless url.kind_of?(URI)
570
+ url = URI(url.to_s)
485
571
  end
486
- end
487
572
 
488
- def visit_host?(url)
489
- @host_rules.accept?(url.host)
490
- end
491
-
492
- def visit_port?(url)
493
- @port_rules.accept?(url.port)
494
- end
495
-
496
- def visit_link?(url)
497
- @link_rules.accept?(url.to_s)
498
- end
499
-
500
- def visit_ext?(url)
501
- @ext_rules.accept?(File.extname(url.path)[1..-1])
573
+ @every_failed_url_blocks.each { |block| block.call(url) }
574
+ @failures << url
575
+ return true
502
576
  end
503
577
 
504
578
  end
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.5'
2
+ VERSION = '0.1.6'
3
3
  end
@@ -15,18 +15,25 @@ module Helpers
15
15
  message = spec['message'].to_s.dump
16
16
  url = spec['url'].to_s.dump
17
17
 
18
- if spec['behavior'] == 'follow'
18
+ case spec['behavior']
19
+ when 'follow'
19
20
  base.module_eval %{
20
21
  it #{message} do
21
22
  should_visit_link(#{url})
22
23
  end
23
24
  }
24
- elsif spec['behavior'] == 'nofollow'
25
+ when 'nofollow'
25
26
  base.module_eval %{
26
27
  it #{message} do
27
28
  should_visit_once(#{url})
28
29
  end
29
30
  }
31
+ when 'fail'
32
+ base.module_eval %{
33
+ it #{message} do
34
+ should_fail_link(#{url})
35
+ end
36
+ }
30
37
  else
31
38
  link = spec['link'].to_s.dump
32
39
 
@@ -42,7 +49,10 @@ module Helpers
42
49
  end
43
50
 
44
51
  def run_course
45
- Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
52
+ Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
53
+ agent.every_failed_url { |url| puts "[FAILED] #{url}" }
54
+ agent.every_url { |url| puts url }
55
+ end
46
56
  end
47
57
 
48
58
  def visited_once?(link)
@@ -58,13 +68,11 @@ module Helpers
58
68
  # +false+ otherwise.
59
69
  #
60
70
  def visited_link?(link)
61
- url = COURSE_URL.merge(URI.encode(link))
62
-
63
- @agent.visited_urls.each do |visited_url|
64
- return true if visited_url == url
65
- end
71
+ @agent.visited?(COURSE_URL.merge(URI.encode(link)))
72
+ end
66
73
 
67
- return false
74
+ def visit_failed?(link)
75
+ @agent.failed?(COURSE_URL.merge(URI.encode(link)))
68
76
  end
69
77
 
70
78
  def should_visit_link(link)
@@ -78,5 +86,10 @@ module Helpers
78
86
  def should_visit_once(link)
79
87
  visited_once?(link).should == true
80
88
  end
89
+
90
+ def should_fail_link(link)
91
+ visited_link?(link).should == false
92
+ visit_failed?(link).should == true
93
+ end
81
94
  end
82
95
  end
@@ -18,6 +18,10 @@
18
18
  <li class="follow">
19
19
  <a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
20
20
  </li>
21
+
22
+ <li class="fail">
23
+ <a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
24
+ </li>
21
25
  </ul>
22
26
  </body>
23
27
  </html>
@@ -1 +1 @@
1
- [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
1
+ [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
data/tasks/course.rb CHANGED
@@ -44,6 +44,10 @@ namespace :course do
44
44
  doc.search('.ignore//a').each do |ignore|
45
45
  specs << link_to_spec.call(ignore, :behavior => :ignore)
46
46
  end
47
+
48
+ doc.search('.fail//a').each do |ignore|
49
+ specs << link_to_spec.call(ignore, :behavior => :fail)
50
+ end
47
51
  end
48
52
 
49
53
  spec.write(specs.to_json)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-03-22 00:00:00 -07:00
12
+ date: 2009-04-14 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.11.0
33
+ version: 1.12.1
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email: