spidr 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,16 @@
1
+ === 0.1.6 / 2009-04-14
2
+
3
+ * Added Agent#failures, a list of URLs which could not be visited.
4
+ * Added Agent#failed?.
5
+ * Added Agent#every_failed_url.
6
+ * Added Agent#clear, which clears the queue, history and failures URL lists.
7
+ * Improved fault tolerance in Agent#get_page.
8
+ * If a Network or HTTP error is encountered, the URL will be added to
9
+ the failures list and the next URL will be visited.
10
+ * Fixed a typo in Agent#ignore_exts_like.
11
+ * Updated the Web Spider Obstacle Course with links that always fail to be
12
+ visited.
13
+
1
14
  === 0.1.5 / 2009-03-22
2
15
 
3
16
  * Catch malformed URIs in Page#to_absolute and return +nil+.
data/README.txt CHANGED
@@ -21,6 +21,7 @@ and easy to use.
21
21
  * Every visited Page.
22
22
  * Every visited URL.
23
23
  * Every visited URL that matches a specified pattern.
24
+ * Every URL that failed to be visited.
24
25
  * Custom User-Agent strings.
25
26
  * Custom proxy settings.
26
27
 
data/lib/spidr/agent.rb CHANGED
@@ -23,7 +23,10 @@ module Spidr
23
23
  attr_accessor :delay
24
24
 
25
25
  # History containing visited URLs
26
- attr_accessor :history
26
+ attr_reader :history
27
+
28
+ # List of unreachable URLs
29
+ attr_reader :failures
27
30
 
28
31
  #
29
32
  # Creates a new Agent object with the given _options_ and _block_.
@@ -70,12 +73,14 @@ module Spidr
70
73
  )
71
74
 
72
75
  @every_url_blocks = []
76
+ @every_failed_url_blocks = []
73
77
  @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
74
78
 
75
79
  @every_page_blocks = []
76
80
 
77
81
  @delay = (options[:delay] || 0)
78
82
  @history = []
83
+ @failures = []
79
84
  @queue = []
80
85
 
81
86
  if options[:host]
@@ -287,7 +292,7 @@ module Spidr
287
292
  # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
288
293
  # it will be added to the ignore_exts.
289
294
  #
290
- def ignore_exts_like(&block)
295
+ def ignore_exts_like(pattern=nil,&block)
291
296
  if pattern
292
297
  ignore_exts << pattern
293
298
  elsif block
@@ -306,6 +311,15 @@ module Spidr
306
311
  return self
307
312
  end
308
313
 
314
+ #
315
+ # For every URL that the agent is unable to visit, it will be passed
316
+ # to the specified _block_.
317
+ #
318
+ def every_failed_url(&block)
319
+ @every_failed_url_blocks << block
320
+ return self
321
+ end
322
+
309
323
  #
310
324
  # For every URL that the agent visits and matches the specified
311
325
  # _pattern_, it will be passed to the specified _block_.
@@ -324,11 +338,21 @@ module Spidr
324
338
  return self
325
339
  end
326
340
 
341
+ #
342
+ # Clears the queue, history and failures of the agent.
343
+ #
344
+ def clear
345
+ @queue.clear
346
+ @history.clear
347
+ @failures.clear
348
+ return self
349
+ end
350
+
327
351
  #
328
352
  # Clear the history and start spidering at the specified _url_.
329
353
  #
330
354
  def start_at(url)
331
- @history.clear
355
+ clear
332
356
  return run(url)
333
357
  end
334
358
 
@@ -366,11 +390,23 @@ module Spidr
366
390
  # otherwise.
367
391
  #
368
392
  def visited?(url)
369
- if url.kind_of?(URI)
370
- return @history.include?(url)
371
- else
372
- return @history.include?(URI(url).to_s)
393
+ unless url.kind_of?(URI)
394
+ url = URI(url)
373
395
  end
396
+
397
+ return @history.include?(url)
398
+ end
399
+
400
+ #
401
+ # Returns +true+ if the specified _url_ was unable to be visited,
402
+ # returns +false+ otherwise.
403
+ #
404
+ def failed?(url)
405
+ unless url.kind_of?(URI)
406
+ url = URI(url)
407
+ end
408
+
409
+ return @failures.include?(url)
374
410
  end
375
411
 
376
412
  #
@@ -392,16 +428,21 @@ module Spidr
392
428
  proxy_user = @proxy[:user]
393
429
  proxy_password = @proxy[:password]
394
430
 
395
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
396
- headers = {}
431
+ begin
432
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
433
+ headers = {}
397
434
 
398
- headers['User-Agent'] = @user_agent if @user_agent
399
- headers['Referer'] = @referer if @referer
435
+ headers['User-Agent'] = @user_agent if @user_agent
436
+ headers['Referer'] = @referer if @referer
400
437
 
401
- new_page = Page.new(url,sess.get(path,headers))
438
+ new_page = Page.new(url,sess.get(path,headers))
402
439
 
403
- block.call(new_page) if block
404
- return new_page
440
+ block.call(new_page) if block
441
+ return new_page
442
+ end
443
+ rescue SystemCallError, Net::HTTPBadResponse
444
+ failed(url)
445
+ return nil
405
446
  end
406
447
  end
407
448
 
@@ -447,6 +488,50 @@ module Spidr
447
488
  @queue.shift
448
489
  end
449
490
 
491
+ #
492
+ # Returns +true+ if the specified _url_ should be visited, based on
493
+ # its scheme, returns +false+ otherwise.
494
+ #
495
+ def visit_scheme?(url)
496
+ if url.scheme
497
+ return SCHEMES.include?(url.scheme)
498
+ else
499
+ return true
500
+ end
501
+ end
502
+
503
+ #
504
+ # Returns +true+ if the specified _url_ should be visited, based on
505
+ # the host of the _url_, returns +false+ otherwise.
506
+ #
507
+ def visit_host?(url)
508
+ @host_rules.accept?(url.host)
509
+ end
510
+
511
+ #
512
+ # Returns +true+ if the specified _url_ should be visited, based on
513
+ # the port of the _url_, returns +false+ otherwise.
514
+ #
515
+ def visit_port?(url)
516
+ @port_rules.accept?(url.port)
517
+ end
518
+
519
+ #
520
+ # Returns +true+ if the specified _url_ should be visited, based on
521
+ # the pattern of the _url_, returns +false+ otherwise.
522
+ #
523
+ def visit_link?(url)
524
+ @link_rules.accept?(url.to_s)
525
+ end
526
+
527
+ #
528
+ # Returns +true+ if the specified _url_ should be visited, based on
529
+ # the file extension of the _url_, returns +false+ otherwise.
530
+ #
531
+ def visit_ext?(url)
532
+ @ext_rules.accept?(File.extname(url.path)[1..-1])
533
+ end
534
+
450
535
  #
451
536
  # Returns +true+ if the specified URL should be visited, returns
452
537
  # +false+ otherwise.
@@ -477,28 +562,17 @@ module Spidr
477
562
  end
478
563
  end
479
564
 
480
- def visit_scheme?(url)
481
- if url.scheme
482
- return SCHEMES.include?(url.scheme)
483
- else
484
- return true
565
+ #
566
+ # Adds the specified _url_ to the failures list.
567
+ #
568
+ def failed(url)
569
+ unless url.kind_of?(URI)
570
+ url = URI(url.to_s)
485
571
  end
486
- end
487
572
 
488
- def visit_host?(url)
489
- @host_rules.accept?(url.host)
490
- end
491
-
492
- def visit_port?(url)
493
- @port_rules.accept?(url.port)
494
- end
495
-
496
- def visit_link?(url)
497
- @link_rules.accept?(url.to_s)
498
- end
499
-
500
- def visit_ext?(url)
501
- @ext_rules.accept?(File.extname(url.path)[1..-1])
573
+ @every_failed_url_blocks.each { |block| block.call(url) }
574
+ @failures << url
575
+ return true
502
576
  end
503
577
 
504
578
  end
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.5'
2
+ VERSION = '0.1.6'
3
3
  end
@@ -15,18 +15,25 @@ module Helpers
15
15
  message = spec['message'].to_s.dump
16
16
  url = spec['url'].to_s.dump
17
17
 
18
- if spec['behavior'] == 'follow'
18
+ case spec['behavior']
19
+ when 'follow'
19
20
  base.module_eval %{
20
21
  it #{message} do
21
22
  should_visit_link(#{url})
22
23
  end
23
24
  }
24
- elsif spec['behavior'] == 'nofollow'
25
+ when 'nofollow'
25
26
  base.module_eval %{
26
27
  it #{message} do
27
28
  should_visit_once(#{url})
28
29
  end
29
30
  }
31
+ when 'fail'
32
+ base.module_eval %{
33
+ it #{message} do
34
+ should_fail_link(#{url})
35
+ end
36
+ }
30
37
  else
31
38
  link = spec['link'].to_s.dump
32
39
 
@@ -42,7 +49,10 @@ module Helpers
42
49
  end
43
50
 
44
51
  def run_course
45
- Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
52
+ Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
53
+ agent.every_failed_url { |url| puts "[FAILED] #{url}" }
54
+ agent.every_url { |url| puts url }
55
+ end
46
56
  end
47
57
 
48
58
  def visited_once?(link)
@@ -58,13 +68,11 @@ module Helpers
58
68
  # +false+ otherwise.
59
69
  #
60
70
  def visited_link?(link)
61
- url = COURSE_URL.merge(URI.encode(link))
62
-
63
- @agent.visited_urls.each do |visited_url|
64
- return true if visited_url == url
65
- end
71
+ @agent.visited?(COURSE_URL.merge(URI.encode(link)))
72
+ end
66
73
 
67
- return false
74
+ def visit_failed?(link)
75
+ @agent.failed?(COURSE_URL.merge(URI.encode(link)))
68
76
  end
69
77
 
70
78
  def should_visit_link(link)
@@ -78,5 +86,10 @@ module Helpers
78
86
  def should_visit_once(link)
79
87
  visited_once?(link).should == true
80
88
  end
89
+
90
+ def should_fail_link(link)
91
+ visited_link?(link).should == false
92
+ visit_failed?(link).should == true
93
+ end
81
94
  end
82
95
  end
@@ -18,6 +18,10 @@
18
18
  <li class="follow">
19
19
  <a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
20
20
  </li>
21
+
22
+ <li class="fail">
23
+ <a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
24
+ </li>
21
25
  </ul>
22
26
  </body>
23
27
  </html>
@@ -1 +1 @@
1
- [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
1
+ [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
data/tasks/course.rb CHANGED
@@ -44,6 +44,10 @@ namespace :course do
44
44
  doc.search('.ignore//a').each do |ignore|
45
45
  specs << link_to_spec.call(ignore, :behavior => :ignore)
46
46
  end
47
+
48
+ doc.search('.fail//a').each do |ignore|
49
+ specs << link_to_spec.call(ignore, :behavior => :fail)
50
+ end
47
51
  end
48
52
 
49
53
  spec.write(specs.to_json)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-03-22 00:00:00 -07:00
12
+ date: 2009-04-14 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.11.0
33
+ version: 1.12.1
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email: