spidr 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +11 -0
- data/Manifest.txt +1 -0
- data/README.txt +2 -0
- data/lib/spidr/agent.rb +87 -16
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +76 -0
- data/spec/spidr_spec.rb +9 -0
- metadata +3 -2
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 0.1.8 / 2009-05-27
|
2
|
+
|
3
|
+
* Added the Agent#pause! and Agent#continue! methods.
|
4
|
+
* Added the Agent#running? and Agent#paused? methods.
|
5
|
+
* Added an alias for pending_urls to the queue methods.
|
6
|
+
* Added Agent#queue to provide read access to the queue.
|
7
|
+
* Added Agent#queue= and Agent#history= for setting the queue and history.
|
8
|
+
* Added Agent#to_hash which returns a Hash of the agent's queue and history.
|
9
|
+
* Made Agent#enqueue and Agent#queued? public.
|
10
|
+
* Added more specs.
|
11
|
+
|
1
12
|
=== 0.1.7 / 2009-04-24
|
2
13
|
|
3
14
|
* Added Agent#all_headers.
|
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -27,6 +27,8 @@ and easy to use.
|
|
27
27
|
* Every visited URL.
|
28
28
|
* Every visited URL that matches a specified pattern.
|
29
29
|
* Every URL that failed to be visited.
|
30
|
+
* Pause and continue spidering.
|
31
|
+
* Restore the spidering queue and history from a previous session.
|
30
32
|
* Custom User-Agent strings.
|
31
33
|
* Custom proxy settings.
|
32
34
|
|
data/lib/spidr/agent.rb
CHANGED
@@ -28,6 +28,9 @@ module Spidr
|
|
28
28
|
# List of unreachable URLs
|
29
29
|
attr_reader :failures
|
30
30
|
|
31
|
+
# Queue of URLs to visit
|
32
|
+
attr_reader :queue
|
33
|
+
|
31
34
|
#
|
32
35
|
# Creates a new Agent object with the given _options_ and _block_.
|
33
36
|
# If a _block_ is given, it will be passed the newly created
|
@@ -82,6 +85,7 @@ module Spidr
|
|
82
85
|
@history = []
|
83
86
|
@failures = []
|
84
87
|
@queue = []
|
88
|
+
@paused = true
|
85
89
|
|
86
90
|
if options[:host]
|
87
91
|
visit_hosts_like(options[:host])
|
@@ -361,22 +365,70 @@ module Spidr
|
|
361
365
|
#
|
362
366
|
def start_at(url)
|
363
367
|
clear
|
364
|
-
|
368
|
+
enqueue(url)
|
369
|
+
|
370
|
+
return continue!
|
365
371
|
end
|
366
372
|
|
367
373
|
#
|
368
|
-
# Start spidering
|
374
|
+
# Start spidering until the queue becomes empty or the agent is
|
375
|
+
# paused.
|
369
376
|
#
|
370
|
-
def run
|
371
|
-
|
372
|
-
|
373
|
-
until @queue.empty?
|
377
|
+
def run
|
378
|
+
until (@queue.empty? || @paused == true)
|
374
379
|
visit_page(dequeue)
|
375
380
|
end
|
376
381
|
|
377
382
|
return self
|
378
383
|
end
|
379
384
|
|
385
|
+
#
|
386
|
+
# Continue spidering.
|
387
|
+
#
|
388
|
+
def continue!
|
389
|
+
@paused = false
|
390
|
+
return run
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Returns +true+ if the agent is still spidering, returns +false+
|
395
|
+
# otherwise.
|
396
|
+
#
|
397
|
+
def running?
|
398
|
+
@paused == false
|
399
|
+
end
|
400
|
+
|
401
|
+
#
|
402
|
+
# Returns +true+ if the agent is paused, returns +false+ otherwise.
|
403
|
+
#
|
404
|
+
def paused?
|
405
|
+
@paused == true
|
406
|
+
end
|
407
|
+
|
408
|
+
#
|
409
|
+
# Pauses the agent, causing spidering to temporarily stop.
|
410
|
+
#
|
411
|
+
def pause!
|
412
|
+
@paused = true
|
413
|
+
return self
|
414
|
+
end
|
415
|
+
|
416
|
+
#
|
417
|
+
# Sets the history of links that were previously visited to the
|
418
|
+
# specified _new_history_.
|
419
|
+
#
|
420
|
+
# agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
|
421
|
+
#
|
422
|
+
def history=(new_history)
|
423
|
+
@history = new_history.map do |url|
|
424
|
+
unless url.kind_of?(URI)
|
425
|
+
URI(url.to_s)
|
426
|
+
else
|
427
|
+
url
|
428
|
+
end
|
429
|
+
end
|
430
|
+
end
|
431
|
+
|
380
432
|
alias visited_urls history
|
381
433
|
|
382
434
|
#
|
@@ -398,9 +450,7 @@ module Spidr
|
|
398
450
|
# otherwise.
|
399
451
|
#
|
400
452
|
def visited?(url)
|
401
|
-
unless url.kind_of?(URI)
|
402
|
-
url = URI(url)
|
403
|
-
end
|
453
|
+
url = URI(url) unless url.kind_of?(URI)
|
404
454
|
|
405
455
|
return @history.include?(url)
|
406
456
|
end
|
@@ -410,13 +460,13 @@ module Spidr
|
|
410
460
|
# returns +false+ otherwise.
|
411
461
|
#
|
412
462
|
def failed?(url)
|
413
|
-
unless url.kind_of?(URI)
|
414
|
-
url = URI(url)
|
415
|
-
end
|
463
|
+
url = URI(url) unless url.kind_of?(URI)
|
416
464
|
|
417
465
|
return @failures.include?(url)
|
418
466
|
end
|
419
467
|
|
468
|
+
alias pending_urls queue
|
469
|
+
|
420
470
|
#
|
421
471
|
# Creates a new Page object from the specified _url_. If a _block_ is
|
422
472
|
# given, it will be passed the newly created Page object.
|
@@ -454,7 +504,28 @@ module Spidr
|
|
454
504
|
end
|
455
505
|
end
|
456
506
|
|
457
|
-
|
507
|
+
#
|
508
|
+
# Returns the agent represented as a Hash containing the agent's
|
509
|
+
# +history+ and +queue+ information.
|
510
|
+
#
|
511
|
+
def to_hash
|
512
|
+
{:history => @history, :queue => @queue}
|
513
|
+
end
|
514
|
+
|
515
|
+
#
|
516
|
+
# Sets the queue of links to visit to the specified _new_queue_.
|
517
|
+
#
|
518
|
+
# agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
|
519
|
+
#
|
520
|
+
def queue=(new_queue)
|
521
|
+
@queue = new_queue.map do |url|
|
522
|
+
unless url.kind_of?(URI)
|
523
|
+
URI(url.to_s)
|
524
|
+
else
|
525
|
+
url
|
526
|
+
end
|
527
|
+
end
|
528
|
+
end
|
458
529
|
|
459
530
|
#
|
460
531
|
# Returns +true+ if the specified _url_ is queued for visiting, returns
|
@@ -489,6 +560,8 @@ module Spidr
|
|
489
560
|
return false
|
490
561
|
end
|
491
562
|
|
563
|
+
protected
|
564
|
+
|
492
565
|
#
|
493
566
|
# Dequeues a URL that will later be visited.
|
494
567
|
#
|
@@ -574,9 +647,7 @@ module Spidr
|
|
574
647
|
# Adds the specified _url_ to the failures list.
|
575
648
|
#
|
576
649
|
def failed(url)
|
577
|
-
unless url.kind_of?(URI)
|
578
|
-
url = URI(url.to_s)
|
579
|
-
end
|
650
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
580
651
|
|
581
652
|
@every_failed_url_blocks.each { |block| block.call(url) }
|
582
653
|
@failures << url
|
data/lib/spidr/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
@@ -9,4 +9,80 @@ describe Agent do
|
|
9
9
|
before(:all) do
|
10
10
|
@agent = run_course
|
11
11
|
end
|
12
|
+
|
13
|
+
it "should provide the history" do
|
14
|
+
@agent.history.should_not be_empty
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should provide the queue" do
|
18
|
+
@agent.queue.should be_empty
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should be able to restore the history" do
|
22
|
+
agent = Agent.new
|
23
|
+
previous_history = [URI('http://www.example.com')]
|
24
|
+
|
25
|
+
agent.history = previous_history
|
26
|
+
agent.history.should == previous_history
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should convert new histories to an Array of URIs" do
|
30
|
+
agent = Agent.new
|
31
|
+
previous_history = ['http://www.example.com']
|
32
|
+
|
33
|
+
agent.history = previous_history
|
34
|
+
agent.history.should_not == previous_history
|
35
|
+
agent.history.should == previous_history.map { |url| URI(url) }
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should be able to restore the queue" do
|
39
|
+
agent = Agent.new
|
40
|
+
previous_queue = [URI('http://www.example.com')]
|
41
|
+
|
42
|
+
agent.queue = previous_queue
|
43
|
+
agent.queue.should == previous_queue
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should convert new queues to an Array of URIs" do
|
47
|
+
agent = Agent.new
|
48
|
+
previous_queue = ['http://www.example.com']
|
49
|
+
|
50
|
+
agent.queue = previous_queue
|
51
|
+
agent.queue.should_not == previous_queue
|
52
|
+
agent.queue.should == previous_queue.map { |url| URI(url) }
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should be able to pause spidering" do
|
56
|
+
count = 0
|
57
|
+
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
58
|
+
spider.every_page do |page|
|
59
|
+
count += 1
|
60
|
+
spider.pause! if count >= 2
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
agent.should be_paused
|
65
|
+
agent.history.length.should == 2
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should be able to continue spidering after being paused" do
|
69
|
+
agent = Agent.new do |spider|
|
70
|
+
spider.enqueue('http://spidr.rubyforge.org/')
|
71
|
+
spider.every_page do |page|
|
72
|
+
spider.pause!
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
agent.pause!
|
77
|
+
agent.continue!
|
78
|
+
|
79
|
+
agent.visited?('http://spidr.rubyforge.org/').should == true
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should provide a to_hash method that returns the queue and history" do
|
83
|
+
hash = @agent.to_hash
|
84
|
+
|
85
|
+
hash[:queue].should be_empty
|
86
|
+
hash[:history].should_not be_empty
|
87
|
+
end
|
12
88
|
end
|
data/spec/spidr_spec.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-05-27 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- spec/spec_helper.rb
|
60
60
|
- spec/helpers/course.rb
|
61
61
|
- spec/agent_spec.rb
|
62
|
+
- spec/spidr_spec.rb
|
62
63
|
- static/course/index.html
|
63
64
|
- static/course/start.html
|
64
65
|
- static/course/fail.html
|