spidr 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,14 @@
1
+ === 0.1.8 / 2009-05-27
2
+
3
+ * Added the Agent#pause! and Agent#continue! methods.
4
+ * Added the Agent#running? and Agent#paused? methods.
5
+ * Added Agent#pending_urls as an alias for Agent#queue.
6
+ * Added Agent#queue to provide read access to the queue.
7
+ * Added Agent#queue= and Agent#history= for setting the queue and history.
8
+ * Added Agent#to_hash which returns a Hash of the agent's queue and history.
9
+ * Made Agent#enqueue and Agent#queued? public.
10
+ * Added more specs.
11
+
1
12
  === 0.1.7 / 2009-04-24
2
13
 
3
14
  * Added Agent#all_headers.
data/Manifest.txt CHANGED
@@ -13,6 +13,7 @@ tasks/course.rb
13
13
  spec/spec_helper.rb
14
14
  spec/helpers/course.rb
15
15
  spec/agent_spec.rb
16
+ spec/spidr_spec.rb
16
17
  static/course/index.html
17
18
  static/course/start.html
18
19
  static/course/fail.html
data/README.txt CHANGED
@@ -27,6 +27,8 @@ and easy to use.
27
27
  * Every visited URL.
28
28
  * Every visited URL that matches a specified pattern.
29
29
  * Every URL that failed to be visited.
30
+ * Pause and continue spidering.
31
+ * Restore the spidering queue and history from a previous session.
30
32
  * Custom User-Agent strings.
31
33
  * Custom proxy settings.
32
34
 
data/lib/spidr/agent.rb CHANGED
@@ -28,6 +28,9 @@ module Spidr
28
28
  # List of unreachable URLs
29
29
  attr_reader :failures
30
30
 
31
+ # Queue of URLs to visit
32
+ attr_reader :queue
33
+
31
34
  #
32
35
  # Creates a new Agent object with the given _options_ and _block_.
33
36
  # If a _block_ is given, it will be passed the newly created
@@ -82,6 +85,7 @@ module Spidr
82
85
  @history = []
83
86
  @failures = []
84
87
  @queue = []
88
+ @paused = true
85
89
 
86
90
  if options[:host]
87
91
  visit_hosts_like(options[:host])
@@ -361,22 +365,70 @@ module Spidr
361
365
  #
362
366
  def start_at(url)
363
367
  clear
364
- return run(url)
368
+ enqueue(url)
369
+
370
+ return continue!
365
371
  end
366
372
 
367
373
  #
368
- # Start spidering at the specified _url_.
374
+ # Start spidering until the queue becomes empty or the agent is
375
+ # paused.
369
376
  #
370
- def run(url)
371
- enqueue(url)
372
-
373
- until @queue.empty?
377
+ def run
378
+ until (@queue.empty? || @paused == true)
374
379
  visit_page(dequeue)
375
380
  end
376
381
 
377
382
  return self
378
383
  end
379
384
 
385
+ #
386
+ # Continue spidering.
387
+ #
388
+ def continue!
389
+ @paused = false
390
+ return run
391
+ end
392
+
393
+ #
394
+ # Returns +true+ if the agent is still spidering, returns +false+
395
+ # otherwise.
396
+ #
397
+ def running?
398
+ @paused == false
399
+ end
400
+
401
+ #
402
+ # Returns +true+ if the agent is paused, returns +false+ otherwise.
403
+ #
404
+ def paused?
405
+ @paused == true
406
+ end
407
+
408
+ #
409
+ # Pauses the agent, causing spidering to temporarily stop.
410
+ #
411
+ def pause!
412
+ @paused = true
413
+ return self
414
+ end
415
+
416
+ #
417
+ # Sets the history of links that were previously visited to the
418
+ # specified _new_history_.
419
+ #
420
+ # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
421
+ #
422
+ def history=(new_history)
423
+ @history = new_history.map do |url|
424
+ unless url.kind_of?(URI)
425
+ URI(url.to_s)
426
+ else
427
+ url
428
+ end
429
+ end
430
+ end
431
+
380
432
  alias visited_urls history
381
433
 
382
434
  #
@@ -398,9 +450,7 @@ module Spidr
398
450
  # otherwise.
399
451
  #
400
452
  def visited?(url)
401
- unless url.kind_of?(URI)
402
- url = URI(url)
403
- end
453
+ url = URI(url) unless url.kind_of?(URI)
404
454
 
405
455
  return @history.include?(url)
406
456
  end
@@ -410,13 +460,13 @@ module Spidr
410
460
  # returns +false+ otherwise.
411
461
  #
412
462
  def failed?(url)
413
- unless url.kind_of?(URI)
414
- url = URI(url)
415
- end
463
+ url = URI(url) unless url.kind_of?(URI)
416
464
 
417
465
  return @failures.include?(url)
418
466
  end
419
467
 
468
+ alias pending_urls queue
469
+
420
470
  #
421
471
  # Creates a new Page object from the specified _url_. If a _block_ is
422
472
  # given, it will be passed the newly created Page object.
@@ -454,7 +504,28 @@ module Spidr
454
504
  end
455
505
  end
456
506
 
457
- protected
507
+ #
508
+ # Returns the agent represented as a Hash containing the agent's
509
+ # +history+ and +queue+ information.
510
+ #
511
+ def to_hash
512
+ {:history => @history, :queue => @queue}
513
+ end
514
+
515
+ #
516
+ # Sets the queue of links to visit to the specified _new_queue_.
517
+ #
518
+ # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
519
+ #
520
+ def queue=(new_queue)
521
+ @queue = new_queue.map do |url|
522
+ unless url.kind_of?(URI)
523
+ URI(url.to_s)
524
+ else
525
+ url
526
+ end
527
+ end
528
+ end
458
529
 
459
530
  #
460
531
  # Returns +true+ if the specified _url_ is queued for visiting, returns
@@ -489,6 +560,8 @@ module Spidr
489
560
  return false
490
561
  end
491
562
 
563
+ protected
564
+
492
565
  #
493
566
  # Dequeues a URL that will later be visited.
494
567
  #
@@ -574,9 +647,7 @@ module Spidr
574
647
  # Adds the specified _url_ to the failures list.
575
648
  #
576
649
  def failed(url)
577
- unless url.kind_of?(URI)
578
- url = URI(url.to_s)
579
- end
650
+ url = URI(url.to_s) unless url.kind_of?(URI)
580
651
 
581
652
  @every_failed_url_blocks.each { |block| block.call(url) }
582
653
  @failures << url
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.7'
2
+ VERSION = '0.1.8'
3
3
  end
data/spec/agent_spec.rb CHANGED
@@ -9,4 +9,80 @@ describe Agent do
9
9
  before(:all) do
10
10
  @agent = run_course
11
11
  end
12
+
13
+ it "should provide the history" do
14
+ @agent.history.should_not be_empty
15
+ end
16
+
17
+ it "should provide the queue" do
18
+ @agent.queue.should be_empty
19
+ end
20
+
21
+ it "should be able to restore the history" do
22
+ agent = Agent.new
23
+ previous_history = [URI('http://www.example.com')]
24
+
25
+ agent.history = previous_history
26
+ agent.history.should == previous_history
27
+ end
28
+
29
+ it "should convert new histories to an Array of URIs" do
30
+ agent = Agent.new
31
+ previous_history = ['http://www.example.com']
32
+
33
+ agent.history = previous_history
34
+ agent.history.should_not == previous_history
35
+ agent.history.should == previous_history.map { |url| URI(url) }
36
+ end
37
+
38
+ it "should be able to restore the queue" do
39
+ agent = Agent.new
40
+ previous_queue = [URI('http://www.example.com')]
41
+
42
+ agent.queue = previous_queue
43
+ agent.queue.should == previous_queue
44
+ end
45
+
46
+ it "should convert new queues to an Array of URIs" do
47
+ agent = Agent.new
48
+ previous_queue = ['http://www.example.com']
49
+
50
+ agent.queue = previous_queue
51
+ agent.queue.should_not == previous_queue
52
+ agent.queue.should == previous_queue.map { |url| URI(url) }
53
+ end
54
+
55
+ it "should be able to pause spidering" do
56
+ count = 0
57
+ agent = Agent.host('spidr.rubyforge.org') do |spider|
58
+ spider.every_page do |page|
59
+ count += 1
60
+ spider.pause! if count >= 2
61
+ end
62
+ end
63
+
64
+ agent.should be_paused
65
+ agent.history.length.should == 2
66
+ end
67
+
68
+ it "should be able to continue spidering after being paused" do
69
+ agent = Agent.new do |spider|
70
+ spider.enqueue('http://spidr.rubyforge.org/')
71
+ spider.every_page do |page|
72
+ spider.pause!
73
+ end
74
+ end
75
+
76
+ agent.pause!
77
+ agent.continue!
78
+
79
+ agent.visited?('http://spidr.rubyforge.org/').should == true
80
+ end
81
+
82
+ it "should provide a to_hash method that returns the queue and history" do
83
+ hash = @agent.to_hash
84
+
85
+ hash[:queue].should be_empty
86
+ hash[:history].should_not be_empty
87
+ end
12
88
  end
@@ -0,0 +1,9 @@
1
+ require 'spidr'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Spidr do
6
+ it "should have a VERSION constant" do
7
+ Spidr.const_defined?('VERSION').should == true
8
+ end
9
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-24 00:00:00 -07:00
12
+ date: 2009-05-27 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
59
59
  - spec/spec_helper.rb
60
60
  - spec/helpers/course.rb
61
61
  - spec/agent_spec.rb
62
+ - spec/spidr_spec.rb
62
63
  - static/course/index.html
63
64
  - static/course/start.html
64
65
  - static/course/fail.html