spidr 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/Manifest.txt +1 -0
- data/README.txt +2 -0
- data/lib/spidr/agent.rb +87 -16
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +76 -0
- data/spec/spidr_spec.rb +9 -0
- metadata +3 -2
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 0.1.8 / 2009-05-27
|
2
|
+
|
3
|
+
* Added the Agent#pause! and Agent#continue! methods.
|
4
|
+
* Added the Agent#running? and Agent#paused? methods.
|
5
|
+
* Added Agent#pending_urls as an alias for Agent#queue.
|
6
|
+
* Added Agent#queue to provide read access to the queue.
|
7
|
+
* Added Agent#queue= and Agent#history= for setting the queue and history.
|
8
|
+
* Added Agent#to_hash, which returns a Hash of the agent's queue and history.
|
9
|
+
* Made Agent#enqueue and Agent#queued? public.
|
10
|
+
* Added more specs.
|
11
|
+
|
1
12
|
=== 0.1.7 / 2009-04-24
|
2
13
|
|
3
14
|
* Added Agent#all_headers.
|
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -27,6 +27,8 @@ and easy to use.
|
|
27
27
|
* Every visited URL.
|
28
28
|
* Every visited URL that matches a specified pattern.
|
29
29
|
* Every URL that failed to be visited.
|
30
|
+
* Pause and continue spidering.
|
31
|
+
* Restore the spidering queue and history from a previous session.
|
30
32
|
* Custom User-Agent strings.
|
31
33
|
* Custom proxy settings.
|
32
34
|
|
data/lib/spidr/agent.rb
CHANGED
@@ -28,6 +28,9 @@ module Spidr
|
|
28
28
|
# List of unreachable URLs
|
29
29
|
attr_reader :failures
|
30
30
|
|
31
|
+
# Queue of URLs to visit
|
32
|
+
attr_reader :queue
|
33
|
+
|
31
34
|
#
|
32
35
|
# Creates a new Agent object with the given _options_ and _block_.
|
33
36
|
# If a _block_ is given, it will be passed the newly created
|
@@ -82,6 +85,7 @@ module Spidr
|
|
82
85
|
@history = []
|
83
86
|
@failures = []
|
84
87
|
@queue = []
|
88
|
+
@paused = true
|
85
89
|
|
86
90
|
if options[:host]
|
87
91
|
visit_hosts_like(options[:host])
|
@@ -361,22 +365,70 @@ module Spidr
|
|
361
365
|
#
|
362
366
|
def start_at(url)
|
363
367
|
clear
|
364
|
-
|
368
|
+
enqueue(url)
|
369
|
+
|
370
|
+
return continue!
|
365
371
|
end
|
366
372
|
|
367
373
|
#
|
368
|
-
# Start spidering
|
374
|
+
# Start spidering until the queue becomes empty or the agent is
|
375
|
+
# paused.
|
369
376
|
#
|
370
|
-
def run
|
371
|
-
|
372
|
-
|
373
|
-
until @queue.empty?
|
377
|
+
def run
|
378
|
+
until (@queue.empty? || @paused == true)
|
374
379
|
visit_page(dequeue)
|
375
380
|
end
|
376
381
|
|
377
382
|
return self
|
378
383
|
end
|
379
384
|
|
385
|
+
#
|
386
|
+
# Continue spidering.
|
387
|
+
#
|
388
|
+
def continue!
|
389
|
+
@paused = false
|
390
|
+
return run
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Returns +true+ if the agent is still spidering, returns +false+
|
395
|
+
# otherwise.
|
396
|
+
#
|
397
|
+
def running?
|
398
|
+
@paused == false
|
399
|
+
end
|
400
|
+
|
401
|
+
#
|
402
|
+
# Returns +true+ if the agent is paused, returns +false+ otherwise.
|
403
|
+
#
|
404
|
+
def paused?
|
405
|
+
@paused == true
|
406
|
+
end
|
407
|
+
|
408
|
+
#
|
409
|
+
# Pauses the agent, causing spidering to temporarily stop.
|
410
|
+
#
|
411
|
+
def pause!
|
412
|
+
@paused = true
|
413
|
+
return self
|
414
|
+
end
|
415
|
+
|
416
|
+
#
|
417
|
+
# Sets the history of links that were previously visited to the
|
418
|
+
# specified _new_history_.
|
419
|
+
#
|
420
|
+
# agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
|
421
|
+
#
|
422
|
+
def history=(new_history)
|
423
|
+
@history = new_history.map do |url|
|
424
|
+
unless url.kind_of?(URI)
|
425
|
+
URI(url.to_s)
|
426
|
+
else
|
427
|
+
url
|
428
|
+
end
|
429
|
+
end
|
430
|
+
end
|
431
|
+
|
380
432
|
alias visited_urls history
|
381
433
|
|
382
434
|
#
|
@@ -398,9 +450,7 @@ module Spidr
|
|
398
450
|
# otherwise.
|
399
451
|
#
|
400
452
|
def visited?(url)
|
401
|
-
unless url.kind_of?(URI)
|
402
|
-
url = URI(url)
|
403
|
-
end
|
453
|
+
url = URI(url) unless url.kind_of?(URI)
|
404
454
|
|
405
455
|
return @history.include?(url)
|
406
456
|
end
|
@@ -410,13 +460,13 @@ module Spidr
|
|
410
460
|
# returns +false+ otherwise.
|
411
461
|
#
|
412
462
|
def failed?(url)
|
413
|
-
unless url.kind_of?(URI)
|
414
|
-
url = URI(url)
|
415
|
-
end
|
463
|
+
url = URI(url) unless url.kind_of?(URI)
|
416
464
|
|
417
465
|
return @failures.include?(url)
|
418
466
|
end
|
419
467
|
|
468
|
+
alias pending_urls queue
|
469
|
+
|
420
470
|
#
|
421
471
|
# Creates a new Page object from the specified _url_. If a _block_ is
|
422
472
|
# given, it will be passed the newly created Page object.
|
@@ -454,7 +504,28 @@ module Spidr
|
|
454
504
|
end
|
455
505
|
end
|
456
506
|
|
457
|
-
|
507
|
+
#
|
508
|
+
# Returns the agent represented as a Hash containing the agent's
|
509
|
+
# +history+ and +queue+ information.
|
510
|
+
#
|
511
|
+
def to_hash
|
512
|
+
{:history => @history, :queue => @queue}
|
513
|
+
end
|
514
|
+
|
515
|
+
#
|
516
|
+
# Sets the queue of links to visit to the specified _new_queue_.
|
517
|
+
#
|
518
|
+
# agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
|
519
|
+
#
|
520
|
+
def queue=(new_queue)
|
521
|
+
@queue = new_queue.map do |url|
|
522
|
+
unless url.kind_of?(URI)
|
523
|
+
URI(url.to_s)
|
524
|
+
else
|
525
|
+
url
|
526
|
+
end
|
527
|
+
end
|
528
|
+
end
|
458
529
|
|
459
530
|
#
|
460
531
|
# Returns +true+ if the specified _url_ is queued for visiting, returns
|
@@ -489,6 +560,8 @@ module Spidr
|
|
489
560
|
return false
|
490
561
|
end
|
491
562
|
|
563
|
+
protected
|
564
|
+
|
492
565
|
#
|
493
566
|
# Dequeues a URL that will later be visited.
|
494
567
|
#
|
@@ -574,9 +647,7 @@ module Spidr
|
|
574
647
|
# Adds the specified _url_ to the failures list.
|
575
648
|
#
|
576
649
|
def failed(url)
|
577
|
-
unless url.kind_of?(URI)
|
578
|
-
url = URI(url.to_s)
|
579
|
-
end
|
650
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
580
651
|
|
581
652
|
@every_failed_url_blocks.each { |block| block.call(url) }
|
582
653
|
@failures << url
|
data/lib/spidr/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
@@ -9,4 +9,80 @@ describe Agent do
|
|
9
9
|
before(:all) do
|
10
10
|
@agent = run_course
|
11
11
|
end
|
12
|
+
|
13
|
+
it "should provide the history" do
|
14
|
+
@agent.history.should_not be_empty
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should provide the queue" do
|
18
|
+
@agent.queue.should be_empty
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should be able to restore the history" do
|
22
|
+
agent = Agent.new
|
23
|
+
previous_history = [URI('http://www.example.com')]
|
24
|
+
|
25
|
+
agent.history = previous_history
|
26
|
+
agent.history.should == previous_history
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should convert new histories to an Array of URIs" do
|
30
|
+
agent = Agent.new
|
31
|
+
previous_history = ['http://www.example.com']
|
32
|
+
|
33
|
+
agent.history = previous_history
|
34
|
+
agent.history.should_not == previous_history
|
35
|
+
agent.history.should == previous_history.map { |url| URI(url) }
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should be able to restore the queue" do
|
39
|
+
agent = Agent.new
|
40
|
+
previous_queue = [URI('http://www.example.com')]
|
41
|
+
|
42
|
+
agent.queue = previous_queue
|
43
|
+
agent.queue.should == previous_queue
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should convert new queues to an Array of URIs" do
|
47
|
+
agent = Agent.new
|
48
|
+
previous_queue = ['http://www.example.com']
|
49
|
+
|
50
|
+
agent.queue = previous_queue
|
51
|
+
agent.queue.should_not == previous_queue
|
52
|
+
agent.queue.should == previous_queue.map { |url| URI(url) }
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should be able to pause spidering" do
|
56
|
+
count = 0
|
57
|
+
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
58
|
+
spider.every_page do |page|
|
59
|
+
count += 1
|
60
|
+
spider.pause! if count >= 2
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
agent.should be_paused
|
65
|
+
agent.history.length.should == 2
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should be able to continue spidering after being paused" do
|
69
|
+
agent = Agent.new do |spider|
|
70
|
+
spider.enqueue('http://spidr.rubyforge.org/')
|
71
|
+
spider.every_page do |page|
|
72
|
+
spider.pause!
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
agent.pause!
|
77
|
+
agent.continue!
|
78
|
+
|
79
|
+
agent.visited?('http://spidr.rubyforge.org/').should == true
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should provide a to_hash method that returns the queue and history" do
|
83
|
+
hash = @agent.to_hash
|
84
|
+
|
85
|
+
hash[:queue].should be_empty
|
86
|
+
hash[:history].should_not be_empty
|
87
|
+
end
|
12
88
|
end
|
data/spec/spidr_spec.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-04-24 00:00:00 -07:00
|
12
|
+
date: 2009-05-27 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- spec/spec_helper.rb
|
60
60
|
- spec/helpers/course.rb
|
61
61
|
- spec/agent_spec.rb
|
62
|
+
- spec/spidr_spec.rb
|
62
63
|
- static/course/index.html
|
63
64
|
- static/course/start.html
|
64
65
|
- static/course/fail.html
|