spidr 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +13 -0
- data/README.txt +1 -0
- data/lib/spidr/agent.rb +108 -34
- data/lib/spidr/version.rb +1 -1
- data/spec/helpers/course.rb +22 -9
- data/static/course/remote/start.html +4 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +4 -0
- metadata +3 -3
data/History.txt
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
=== 0.1.6 / 2009-04-14
|
2
|
+
|
3
|
+
* Added Agent#failures, a list of URLs which could not be visited.
|
4
|
+
* Added Agent#failed?.
|
5
|
+
* Added Agent#every_failed_url.
|
6
|
+
* Added Agent#clear, which clears the history and failures URL lists.
|
7
|
+
* Improved fault tolerance in Agent#get_page.
|
8
|
+
* If a Network or HTTP error is encountered, the URL will be added to
|
9
|
+
the failures list and the next URL will be visited.
|
10
|
+
* Fixed a typo in Agent#ignore_exts_like.
|
11
|
+
* Updated the Web Spider Obstacle Course with links that always fail to be
|
12
|
+
visited.
|
13
|
+
|
1
14
|
=== 0.1.5 / 2009-03-22
|
2
15
|
|
3
16
|
* Catch malformed URIs in Page#to_absolute and return +nil+.
|
data/README.txt
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -23,7 +23,10 @@ module Spidr
|
|
23
23
|
attr_accessor :delay
|
24
24
|
|
25
25
|
# History containing visited URLs
|
26
|
-
|
26
|
+
attr_reader :history
|
27
|
+
|
28
|
+
# List of unreachable URLs
|
29
|
+
attr_reader :failures
|
27
30
|
|
28
31
|
#
|
29
32
|
# Creates a new Agent object with the given _options_ and _block_.
|
@@ -70,12 +73,14 @@ module Spidr
|
|
70
73
|
)
|
71
74
|
|
72
75
|
@every_url_blocks = []
|
76
|
+
@every_failed_url_blocks = []
|
73
77
|
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
74
78
|
|
75
79
|
@every_page_blocks = []
|
76
80
|
|
77
81
|
@delay = (options[:delay] || 0)
|
78
82
|
@history = []
|
83
|
+
@failures = []
|
79
84
|
@queue = []
|
80
85
|
|
81
86
|
if options[:host]
|
@@ -287,7 +292,7 @@ module Spidr
|
|
287
292
|
# Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
|
288
293
|
# it will be added to the ignore_exts.
|
289
294
|
#
|
290
|
-
def ignore_exts_like(
|
295
|
+
def ignore_exts_like(pattern=nil,&block)
|
291
296
|
if pattern
|
292
297
|
ignore_exts << pattern
|
293
298
|
elsif block
|
@@ -306,6 +311,15 @@ module Spidr
|
|
306
311
|
return self
|
307
312
|
end
|
308
313
|
|
314
|
+
#
|
315
|
+
# For every URL that the agent is unable to visit, it will be passed
|
316
|
+
# to the specified _block_.
|
317
|
+
#
|
318
|
+
def every_failed_url(&block)
|
319
|
+
@every_failed_url_blocks << block
|
320
|
+
return self
|
321
|
+
end
|
322
|
+
|
309
323
|
#
|
310
324
|
# For every URL that the agent visits and matches the specified
|
311
325
|
# _pattern_, it will be passed to the specified _block_.
|
@@ -324,11 +338,21 @@ module Spidr
|
|
324
338
|
return self
|
325
339
|
end
|
326
340
|
|
341
|
+
#
|
342
|
+
# Clears the queue, history and failures of the agent.
|
343
|
+
#
|
344
|
+
def clear
|
345
|
+
@queue.clear
|
346
|
+
@history.clear
|
347
|
+
@failures.clear
|
348
|
+
return self
|
349
|
+
end
|
350
|
+
|
327
351
|
#
|
328
352
|
# Clear the history and start spidering at the specified _url_.
|
329
353
|
#
|
330
354
|
def start_at(url)
|
331
|
-
|
355
|
+
clear
|
332
356
|
return run(url)
|
333
357
|
end
|
334
358
|
|
@@ -366,11 +390,23 @@ module Spidr
|
|
366
390
|
# otherwise.
|
367
391
|
#
|
368
392
|
def visited?(url)
|
369
|
-
|
370
|
-
|
371
|
-
else
|
372
|
-
return @history.include?(URI(url).to_s)
|
393
|
+
unless url.kind_of?(URI)
|
394
|
+
url = URI(url)
|
373
395
|
end
|
396
|
+
|
397
|
+
return @history.include?(url)
|
398
|
+
end
|
399
|
+
|
400
|
+
#
|
401
|
+
# Returns +true+ if the specified _url_ could not be visited,
|
402
|
+
# returns +false+ otherwise.
|
403
|
+
#
|
404
|
+
def failed?(url)
|
405
|
+
unless url.kind_of?(URI)
|
406
|
+
url = URI(url)
|
407
|
+
end
|
408
|
+
|
409
|
+
return @failures.include?(url)
|
374
410
|
end
|
375
411
|
|
376
412
|
#
|
@@ -392,16 +428,21 @@ module Spidr
|
|
392
428
|
proxy_user = @proxy[:user]
|
393
429
|
proxy_password = @proxy[:password]
|
394
430
|
|
395
|
-
|
396
|
-
|
431
|
+
begin
|
432
|
+
Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
|
433
|
+
headers = {}
|
397
434
|
|
398
|
-
|
399
|
-
|
435
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
436
|
+
headers['Referer'] = @referer if @referer
|
400
437
|
|
401
|
-
|
438
|
+
new_page = Page.new(url,sess.get(path,headers))
|
402
439
|
|
403
|
-
|
404
|
-
|
440
|
+
block.call(new_page) if block
|
441
|
+
return new_page
|
442
|
+
end
|
443
|
+
rescue SystemCallError, Net::HTTPBadResponse
|
444
|
+
failed(url)
|
445
|
+
return nil
|
405
446
|
end
|
406
447
|
end
|
407
448
|
|
@@ -447,6 +488,50 @@ module Spidr
|
|
447
488
|
@queue.shift
|
448
489
|
end
|
449
490
|
|
491
|
+
#
|
492
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
493
|
+
# its scheme, returns +false+ otherwise.
|
494
|
+
#
|
495
|
+
def visit_scheme?(url)
|
496
|
+
if url.scheme
|
497
|
+
return SCHEMES.include?(url.scheme)
|
498
|
+
else
|
499
|
+
return true
|
500
|
+
end
|
501
|
+
end
|
502
|
+
|
503
|
+
#
|
504
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
505
|
+
# the host of the _url_, returns +false+ otherwise.
|
506
|
+
#
|
507
|
+
def visit_host?(url)
|
508
|
+
@host_rules.accept?(url.host)
|
509
|
+
end
|
510
|
+
|
511
|
+
#
|
512
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
513
|
+
# the port of the _url_, returns +false+ otherwise.
|
514
|
+
#
|
515
|
+
def visit_port?(url)
|
516
|
+
@port_rules.accept?(url.port)
|
517
|
+
end
|
518
|
+
|
519
|
+
#
|
520
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
521
|
+
# the pattern of the _url_, returns +false+ otherwise.
|
522
|
+
#
|
523
|
+
def visit_link?(url)
|
524
|
+
@link_rules.accept?(url.to_s)
|
525
|
+
end
|
526
|
+
|
527
|
+
#
|
528
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
529
|
+
# the file extension of the _url_, returns +false+ otherwise.
|
530
|
+
#
|
531
|
+
def visit_ext?(url)
|
532
|
+
@ext_rules.accept?(File.extname(url.path)[1..-1])
|
533
|
+
end
|
534
|
+
|
450
535
|
#
|
451
536
|
# Returns +true+ if the specified URL should be visited, returns
|
452
537
|
# +false+ otherwise.
|
@@ -477,28 +562,17 @@ module Spidr
|
|
477
562
|
end
|
478
563
|
end
|
479
564
|
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
565
|
+
#
|
566
|
+
# Adds the specified _url_ to the failures list.
|
567
|
+
#
|
568
|
+
def failed(url)
|
569
|
+
unless url.kind_of?(URI)
|
570
|
+
url = URI(url.to_s)
|
485
571
|
end
|
486
|
-
end
|
487
572
|
|
488
|
-
|
489
|
-
@
|
490
|
-
|
491
|
-
|
492
|
-
def visit_port?(url)
|
493
|
-
@port_rules.accept?(url.port)
|
494
|
-
end
|
495
|
-
|
496
|
-
def visit_link?(url)
|
497
|
-
@link_rules.accept?(url.to_s)
|
498
|
-
end
|
499
|
-
|
500
|
-
def visit_ext?(url)
|
501
|
-
@ext_rules.accept?(File.extname(url.path)[1..-1])
|
573
|
+
@every_failed_url_blocks.each { |block| block.call(url) }
|
574
|
+
@failures << url
|
575
|
+
return true
|
502
576
|
end
|
503
577
|
|
504
578
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spec/helpers/course.rb
CHANGED
@@ -15,18 +15,25 @@ module Helpers
|
|
15
15
|
message = spec['message'].to_s.dump
|
16
16
|
url = spec['url'].to_s.dump
|
17
17
|
|
18
|
-
|
18
|
+
case spec['behavior']
|
19
|
+
when 'follow'
|
19
20
|
base.module_eval %{
|
20
21
|
it #{message} do
|
21
22
|
should_visit_link(#{url})
|
22
23
|
end
|
23
24
|
}
|
24
|
-
|
25
|
+
when 'nofollow'
|
25
26
|
base.module_eval %{
|
26
27
|
it #{message} do
|
27
28
|
should_visit_once(#{url})
|
28
29
|
end
|
29
30
|
}
|
31
|
+
when 'fail'
|
32
|
+
base.module_eval %{
|
33
|
+
it #{message} do
|
34
|
+
should_fail_link(#{url})
|
35
|
+
end
|
36
|
+
}
|
30
37
|
else
|
31
38
|
link = spec['link'].to_s.dump
|
32
39
|
|
@@ -42,7 +49,10 @@ module Helpers
|
|
42
49
|
end
|
43
50
|
|
44
51
|
def run_course
|
45
|
-
Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
|
52
|
+
Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
|
53
|
+
agent.every_failed_url { |url| puts "[FAILED] #{url}" }
|
54
|
+
agent.every_url { |url| puts url }
|
55
|
+
end
|
46
56
|
end
|
47
57
|
|
48
58
|
def visited_once?(link)
|
@@ -58,13 +68,11 @@ module Helpers
|
|
58
68
|
# +false+ otherwise.
|
59
69
|
#
|
60
70
|
def visited_link?(link)
|
61
|
-
|
62
|
-
|
63
|
-
@agent.visited_urls.each do |visited_url|
|
64
|
-
return true if visited_url == url
|
65
|
-
end
|
71
|
+
@agent.visited?(COURSE_URL.merge(URI.encode(link)))
|
72
|
+
end
|
66
73
|
|
67
|
-
|
74
|
+
def visit_failed?(link)
|
75
|
+
@agent.failed?(COURSE_URL.merge(URI.encode(link)))
|
68
76
|
end
|
69
77
|
|
70
78
|
def should_visit_link(link)
|
@@ -78,5 +86,10 @@ module Helpers
|
|
78
86
|
def should_visit_once(link)
|
79
87
|
visited_once?(link).should == true
|
80
88
|
end
|
89
|
+
|
90
|
+
def should_fail_link(link)
|
91
|
+
visited_link?(link).should == false
|
92
|
+
visit_failed?(link).should == true
|
93
|
+
end
|
81
94
|
end
|
82
95
|
end
|
@@ -18,6 +18,10 @@
|
|
18
18
|
<li class="follow">
|
19
19
|
<a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
|
20
20
|
</li>
|
21
|
+
|
22
|
+
<li class="fail">
|
23
|
+
<a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
|
24
|
+
</li>
|
21
25
|
</ul>
|
22
26
|
</body>
|
23
27
|
</html>
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"
|
1
|
+
[{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
|
data/tasks/course.rb
CHANGED
@@ -44,6 +44,10 @@ namespace :course do
|
|
44
44
|
doc.search('.ignore//a').each do |ignore|
|
45
45
|
specs << link_to_spec.call(ignore, :behavior => :ignore)
|
46
46
|
end
|
47
|
+
|
48
|
+
doc.search('.fail//a').each do |ignore|
|
49
|
+
specs << link_to_spec.call(ignore, :behavior => :fail)
|
50
|
+
end
|
47
51
|
end
|
48
52
|
|
49
53
|
spec.write(specs.to_json)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-04-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.12.1
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|