spidr 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,14 @@
1
+ === 0.1.8 / 2009-05-27
2
+
3
+ * Added the Agent#pause! and Agent#continue! methods.
4
+ * Added the Agent#running? and Agent#paused? methods.
5
+ * Added an alias for pending_urls to the queue methods.
6
+ * Added Agent#queue to provide read access to the queue.
7
+ * Added Agent#queue= and Agent#history= for setting the queue and history.
8
+ * Added Agent#to_hash, which returns a Hash of the agent's queue and history.
9
+ * Made Agent#enqueue and Agent#queued? public.
10
+ * Added more specs.
11
+
1
12
  === 0.1.7 / 2009-04-24
2
13
 
3
14
  * Added Agent#all_headers.
data/Manifest.txt CHANGED
@@ -13,6 +13,7 @@ tasks/course.rb
13
13
  spec/spec_helper.rb
14
14
  spec/helpers/course.rb
15
15
  spec/agent_spec.rb
16
+ spec/spidr_spec.rb
16
17
  static/course/index.html
17
18
  static/course/start.html
18
19
  static/course/fail.html
data/README.txt CHANGED
@@ -27,6 +27,8 @@ and easy to use.
27
27
  * Every visited URL.
28
28
  * Every visited URL that matches a specified pattern.
29
29
  * Every URL that failed to be visited.
30
+ * Pause and continue spidering.
31
+ * Restore the spidering queue and history from a previous session.
30
32
  * Custom User-Agent strings.
31
33
  * Custom proxy settings.
32
34
 
data/lib/spidr/agent.rb CHANGED
@@ -28,6 +28,9 @@ module Spidr
28
28
  # List of unreachable URLs
29
29
  attr_reader :failures
30
30
 
31
+ # Queue of URLs to visit
32
+ attr_reader :queue
33
+
31
34
  #
32
35
  # Creates a new Agent object with the given _options_ and _block_.
33
36
  # If a _block_ is given, it will be passed the newly created
@@ -82,6 +85,7 @@ module Spidr
82
85
  @history = []
83
86
  @failures = []
84
87
  @queue = []
88
+ @paused = true
85
89
 
86
90
  if options[:host]
87
91
  visit_hosts_like(options[:host])
@@ -361,22 +365,70 @@ module Spidr
361
365
  #
362
366
  def start_at(url)
363
367
  clear
364
- return run(url)
368
+ enqueue(url)
369
+
370
+ return continue!
365
371
  end
366
372
 
367
373
  #
368
- # Start spidering at the specified _url_.
374
+ # Start spidering until the queue becomes empty or the agent is
375
+ # paused.
369
376
  #
370
- def run(url)
371
- enqueue(url)
372
-
373
- until @queue.empty?
377
+ def run
378
+ until (@queue.empty? || @paused == true)
374
379
  visit_page(dequeue)
375
380
  end
376
381
 
377
382
  return self
378
383
  end
379
384
 
385
+ #
386
+ # Continue spidering.
387
+ #
388
+ def continue!
389
+ @paused = false
390
+ return run
391
+ end
392
+
393
+ #
394
+ # Returns +true+ if the agent is still spidering, returns +false+
395
+ # otherwise.
396
+ #
397
+ def running?
398
+ @paused == false
399
+ end
400
+
401
+ #
402
+ # Returns +true+ if the agent is paused, returns +false+ otherwise.
403
+ #
404
+ def paused?
405
+ @paused == true
406
+ end
407
+
408
+ #
409
+ # Pauses the agent, causing spidering to temporarily stop.
410
+ #
411
+ def pause!
412
+ @paused = true
413
+ return self
414
+ end
415
+
416
+ #
417
+ # Sets the history of links that were previously visited to the
418
+ # specified _new_history_.
419
+ #
420
+ # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
421
+ #
422
+ def history=(new_history)
423
+ @history = new_history.map do |url|
424
+ unless url.kind_of?(URI)
425
+ URI(url.to_s)
426
+ else
427
+ url
428
+ end
429
+ end
430
+ end
431
+
380
432
  alias visited_urls history
381
433
 
382
434
  #
@@ -398,9 +450,7 @@ module Spidr
398
450
  # otherwise.
399
451
  #
400
452
  def visited?(url)
401
- unless url.kind_of?(URI)
402
- url = URI(url)
403
- end
453
+ url = URI(url) unless url.kind_of?(URI)
404
454
 
405
455
  return @history.include?(url)
406
456
  end
@@ -410,13 +460,13 @@ module Spidr
410
460
  # returns +false+ otherwise.
411
461
  #
412
462
  def failed?(url)
413
- unless url.kind_of?(URI)
414
- url = URI(url)
415
- end
463
+ url = URI(url) unless url.kind_of?(URI)
416
464
 
417
465
  return @failures.include?(url)
418
466
  end
419
467
 
468
+ alias pending_urls queue
469
+
420
470
  #
421
471
  # Creates a new Page object from the specified _url_. If a _block_ is
422
472
  # given, it will be passed the newly created Page object.
@@ -454,7 +504,28 @@ module Spidr
454
504
  end
455
505
  end
456
506
 
457
- protected
507
+ #
508
+ # Returns the agent represented as a Hash containing the agent's
509
+ # +history+ and +queue+ information.
510
+ #
511
+ def to_hash
512
+ {:history => @history, :queue => @queue}
513
+ end
514
+
515
+ #
516
+ # Sets the queue of links to visit to the specified _new_queue_.
517
+ #
518
+ # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
519
+ #
520
+ def queue=(new_queue)
521
+ @queue = new_queue.map do |url|
522
+ unless url.kind_of?(URI)
523
+ URI(url.to_s)
524
+ else
525
+ url
526
+ end
527
+ end
528
+ end
458
529
 
459
530
  #
460
531
  # Returns +true+ if the specified _url_ is queued for visiting, returns
@@ -489,6 +560,8 @@ module Spidr
489
560
  return false
490
561
  end
491
562
 
563
+ protected
564
+
492
565
  #
493
566
  # Dequeues a URL that will later be visited.
494
567
  #
@@ -574,9 +647,7 @@ module Spidr
574
647
  # Adds the specified _url_ to the failures list.
575
648
  #
576
649
  def failed(url)
577
- unless url.kind_of?(URI)
578
- url = URI(url.to_s)
579
- end
650
+ url = URI(url.to_s) unless url.kind_of?(URI)
580
651
 
581
652
  @every_failed_url_blocks.each { |block| block.call(url) }
582
653
  @failures << url
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.7'
2
+ VERSION = '0.1.8'
3
3
  end
data/spec/agent_spec.rb CHANGED
@@ -9,4 +9,80 @@ describe Agent do
9
9
  before(:all) do
10
10
  @agent = run_course
11
11
  end
12
+
13
+ it "should provide the history" do
14
+ @agent.history.should_not be_empty
15
+ end
16
+
17
+ it "should provide the queue" do
18
+ @agent.queue.should be_empty
19
+ end
20
+
21
+ it "should be able to restore the history" do
22
+ agent = Agent.new
23
+ previous_history = [URI('http://www.example.com')]
24
+
25
+ agent.history = previous_history
26
+ agent.history.should == previous_history
27
+ end
28
+
29
+ it "should convert new histories to an Array of URIs" do
30
+ agent = Agent.new
31
+ previous_history = ['http://www.example.com']
32
+
33
+ agent.history = previous_history
34
+ agent.history.should_not == previous_history
35
+ agent.history.should == previous_history.map { |url| URI(url) }
36
+ end
37
+
38
+ it "should be able to restore the queue" do
39
+ agent = Agent.new
40
+ previous_queue = [URI('http://www.example.com')]
41
+
42
+ agent.queue = previous_queue
43
+ agent.queue.should == previous_queue
44
+ end
45
+
46
+ it "should convert new queues to an Array of URIs" do
47
+ agent = Agent.new
48
+ previous_queue = ['http://www.example.com']
49
+
50
+ agent.queue = previous_queue
51
+ agent.queue.should_not == previous_queue
52
+ agent.queue.should == previous_queue.map { |url| URI(url) }
53
+ end
54
+
55
+ it "should be able to pause spidering" do
56
+ count = 0
57
+ agent = Agent.host('spidr.rubyforge.org') do |spider|
58
+ spider.every_page do |page|
59
+ count += 1
60
+ spider.pause! if count >= 2
61
+ end
62
+ end
63
+
64
+ agent.should be_paused
65
+ agent.history.length.should == 2
66
+ end
67
+
68
+ it "should be able to continue spidering after being paused" do
69
+ agent = Agent.new do |spider|
70
+ spider.enqueue('http://spidr.rubyforge.org/')
71
+ spider.every_page do |page|
72
+ spider.pause!
73
+ end
74
+ end
75
+
76
+ agent.pause!
77
+ agent.continue!
78
+
79
+ agent.visited?('http://spidr.rubyforge.org/').should == true
80
+ end
81
+
82
+ it "should provide a to_hash method that returns the queue and history" do
83
+ hash = @agent.to_hash
84
+
85
+ hash[:queue].should be_empty
86
+ hash[:history].should_not be_empty
87
+ end
12
88
  end
@@ -0,0 +1,9 @@
1
+ require 'spidr'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Spidr do
6
+ it "should have a VERSION constant" do
7
+ Spidr.const_defined?('VERSION').should == true
8
+ end
9
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-24 00:00:00 -07:00
12
+ date: 2009-05-27 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
59
59
  - spec/spec_helper.rb
60
60
  - spec/helpers/course.rb
61
61
  - spec/agent_spec.rb
62
+ - spec/spidr_spec.rb
62
63
  - static/course/index.html
63
64
  - static/course/start.html
64
65
  - static/course/fail.html