spidr 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +13 -0
- data/README.txt +1 -0
- data/lib/spidr/agent.rb +108 -34
- data/lib/spidr/version.rb +1 -1
- data/spec/helpers/course.rb +22 -9
- data/static/course/remote/start.html +4 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +4 -0
- metadata +3 -3
data/History.txt
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
=== 0.1.6 / 2009-04-14
|
2
|
+
|
3
|
+
* Added Agent#failures, a list of URLs which could not be visited.
|
4
|
+
* Added Agent#failed?.
|
5
|
+
* Added Agent#every_failed_url.
|
6
|
+
* Added Agent#clear, which clears the history and failures URL lists.
|
7
|
+
* Improved fault tolerance in Agent#get_page.
|
8
|
+
* If a Network or HTTP error is encountered, the URL will be added to
|
9
|
+
the failures list and the next URL will be visited.
|
10
|
+
* Fixed a typo in Agent#ignore_exts_like.
|
11
|
+
* Updated the Web Spider Obstacle Course with links that always fail to be
|
12
|
+
visited.
|
13
|
+
|
1
14
|
=== 0.1.5 / 2009-03-22
|
2
15
|
|
3
16
|
* Catch malformed URIs in Page#to_absolute and return +nil+.
|
data/README.txt
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -23,7 +23,10 @@ module Spidr
|
|
23
23
|
attr_accessor :delay
|
24
24
|
|
25
25
|
# History containing visited URLs
|
26
|
-
|
26
|
+
attr_reader :history
|
27
|
+
|
28
|
+
# List of unreachable URLs
|
29
|
+
attr_reader :failures
|
27
30
|
|
28
31
|
#
|
29
32
|
# Creates a new Agent object with the given _options_ and _block_.
|
@@ -70,12 +73,14 @@ module Spidr
|
|
70
73
|
)
|
71
74
|
|
72
75
|
@every_url_blocks = []
|
76
|
+
@every_failed_url_blocks = []
|
73
77
|
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
74
78
|
|
75
79
|
@every_page_blocks = []
|
76
80
|
|
77
81
|
@delay = (options[:delay] || 0)
|
78
82
|
@history = []
|
83
|
+
@failures = []
|
79
84
|
@queue = []
|
80
85
|
|
81
86
|
if options[:host]
|
@@ -287,7 +292,7 @@ module Spidr
|
|
287
292
|
# Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
|
288
293
|
# it will be added to the ignore_exts.
|
289
294
|
#
|
290
|
-
def ignore_exts_like(
|
295
|
+
def ignore_exts_like(pattern=nil,&block)
|
291
296
|
if pattern
|
292
297
|
ignore_exts << pattern
|
293
298
|
elsif block
|
@@ -306,6 +311,15 @@ module Spidr
|
|
306
311
|
return self
|
307
312
|
end
|
308
313
|
|
314
|
+
#
|
315
|
+
# For every URL that the agent is unable to visit, it will be passed
|
316
|
+
# to the specified _block_.
|
317
|
+
#
|
318
|
+
def every_failed_url(&block)
|
319
|
+
@every_failed_url_blocks << block
|
320
|
+
return self
|
321
|
+
end
|
322
|
+
|
309
323
|
#
|
310
324
|
# For every URL that the agent visits and matches the specified
|
311
325
|
# _pattern_, it will be passed to the specified _block_.
|
@@ -324,11 +338,21 @@ module Spidr
|
|
324
338
|
return self
|
325
339
|
end
|
326
340
|
|
341
|
+
#
|
342
|
+
# Clears the queue, history and failures list of the agent.
|
343
|
+
#
|
344
|
+
def clear
|
345
|
+
@queue.clear
|
346
|
+
@history.clear
|
347
|
+
@failures.clear
|
348
|
+
return self
|
349
|
+
end
|
350
|
+
|
327
351
|
#
|
328
352
|
# Clear the history and start spidering at the specified _url_.
|
329
353
|
#
|
330
354
|
def start_at(url)
|
331
|
-
|
355
|
+
clear
|
332
356
|
return run(url)
|
333
357
|
end
|
334
358
|
|
@@ -366,11 +390,23 @@ module Spidr
|
|
366
390
|
# otherwise.
|
367
391
|
#
|
368
392
|
def visited?(url)
|
369
|
-
|
370
|
-
|
371
|
-
else
|
372
|
-
return @history.include?(URI(url).to_s)
|
393
|
+
unless url.kind_of?(URI)
|
394
|
+
url = URI(url)
|
373
395
|
end
|
396
|
+
|
397
|
+
return @history.include?(url)
|
398
|
+
end
|
399
|
+
|
400
|
+
#
|
401
|
+
# Returns +true+ if the specified _url_ was unable to be visited,
|
402
|
+
# returns +false+ otherwise.
|
403
|
+
#
|
404
|
+
def failed?(url)
|
405
|
+
unless url.kind_of?(URI)
|
406
|
+
url = URI(url)
|
407
|
+
end
|
408
|
+
|
409
|
+
return @failures.include?(url)
|
374
410
|
end
|
375
411
|
|
376
412
|
#
|
@@ -392,16 +428,21 @@ module Spidr
|
|
392
428
|
proxy_user = @proxy[:user]
|
393
429
|
proxy_password = @proxy[:password]
|
394
430
|
|
395
|
-
|
396
|
-
|
431
|
+
begin
|
432
|
+
Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
|
433
|
+
headers = {}
|
397
434
|
|
398
|
-
|
399
|
-
|
435
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
436
|
+
headers['Referer'] = @referer if @referer
|
400
437
|
|
401
|
-
|
438
|
+
new_page = Page.new(url,sess.get(path,headers))
|
402
439
|
|
403
|
-
|
404
|
-
|
440
|
+
block.call(new_page) if block
|
441
|
+
return new_page
|
442
|
+
end
|
443
|
+
rescue SystemCallError, Net::HTTPBadResponse
|
444
|
+
failed(url)
|
445
|
+
return nil
|
405
446
|
end
|
406
447
|
end
|
407
448
|
|
@@ -447,6 +488,50 @@ module Spidr
|
|
447
488
|
@queue.shift
|
448
489
|
end
|
449
490
|
|
491
|
+
#
|
492
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
493
|
+
# its scheme, returns +false+ otherwise.
|
494
|
+
#
|
495
|
+
def visit_scheme?(url)
|
496
|
+
if url.scheme
|
497
|
+
return SCHEMES.include?(url.scheme)
|
498
|
+
else
|
499
|
+
return true
|
500
|
+
end
|
501
|
+
end
|
502
|
+
|
503
|
+
#
|
504
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
505
|
+
# the host of the _url_, returns +false+ otherwise.
|
506
|
+
#
|
507
|
+
def visit_host?(url)
|
508
|
+
@host_rules.accept?(url.host)
|
509
|
+
end
|
510
|
+
|
511
|
+
#
|
512
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
513
|
+
# the port of the _url_, returns +false+ otherwise.
|
514
|
+
#
|
515
|
+
def visit_port?(url)
|
516
|
+
@port_rules.accept?(url.port)
|
517
|
+
end
|
518
|
+
|
519
|
+
#
|
520
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
521
|
+
# the pattern of the _url_, returns +false+ otherwise.
|
522
|
+
#
|
523
|
+
def visit_link?(url)
|
524
|
+
@link_rules.accept?(url.to_s)
|
525
|
+
end
|
526
|
+
|
527
|
+
#
|
528
|
+
# Returns +true+ if the specified _url_ should be visited, based on
|
529
|
+
# the file extension of the _url_, returns +false+ otherwise.
|
530
|
+
#
|
531
|
+
def visit_ext?(url)
|
532
|
+
@ext_rules.accept?(File.extname(url.path)[1..-1])
|
533
|
+
end
|
534
|
+
|
450
535
|
#
|
451
536
|
# Returns +true+ if the specified URL should be visited, returns
|
452
537
|
# +false+ otherwise.
|
@@ -477,28 +562,17 @@ module Spidr
|
|
477
562
|
end
|
478
563
|
end
|
479
564
|
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
565
|
+
#
|
566
|
+
# Adds the specified _url_ to the failures list.
|
567
|
+
#
|
568
|
+
def failed(url)
|
569
|
+
unless url.kind_of?(URI)
|
570
|
+
url = URI(url.to_s)
|
485
571
|
end
|
486
|
-
end
|
487
572
|
|
488
|
-
|
489
|
-
@
|
490
|
-
|
491
|
-
|
492
|
-
def visit_port?(url)
|
493
|
-
@port_rules.accept?(url.port)
|
494
|
-
end
|
495
|
-
|
496
|
-
def visit_link?(url)
|
497
|
-
@link_rules.accept?(url.to_s)
|
498
|
-
end
|
499
|
-
|
500
|
-
def visit_ext?(url)
|
501
|
-
@ext_rules.accept?(File.extname(url.path)[1..-1])
|
573
|
+
@every_failed_url_blocks.each { |block| block.call(url) }
|
574
|
+
@failures << url
|
575
|
+
return true
|
502
576
|
end
|
503
577
|
|
504
578
|
end
|
data/lib/spidr/version.rb
CHANGED
data/spec/helpers/course.rb
CHANGED
@@ -15,18 +15,25 @@ module Helpers
|
|
15
15
|
message = spec['message'].to_s.dump
|
16
16
|
url = spec['url'].to_s.dump
|
17
17
|
|
18
|
-
|
18
|
+
case spec['behavior']
|
19
|
+
when 'follow'
|
19
20
|
base.module_eval %{
|
20
21
|
it #{message} do
|
21
22
|
should_visit_link(#{url})
|
22
23
|
end
|
23
24
|
}
|
24
|
-
|
25
|
+
when 'nofollow'
|
25
26
|
base.module_eval %{
|
26
27
|
it #{message} do
|
27
28
|
should_visit_once(#{url})
|
28
29
|
end
|
29
30
|
}
|
31
|
+
when 'fail'
|
32
|
+
base.module_eval %{
|
33
|
+
it #{message} do
|
34
|
+
should_fail_link(#{url})
|
35
|
+
end
|
36
|
+
}
|
30
37
|
else
|
31
38
|
link = spec['link'].to_s.dump
|
32
39
|
|
@@ -42,7 +49,10 @@ module Helpers
|
|
42
49
|
end
|
43
50
|
|
44
51
|
def run_course
|
45
|
-
Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
|
52
|
+
Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
|
53
|
+
agent.every_failed_url { |url| puts "[FAILED] #{url}" }
|
54
|
+
agent.every_url { |url| puts url }
|
55
|
+
end
|
46
56
|
end
|
47
57
|
|
48
58
|
def visited_once?(link)
|
@@ -58,13 +68,11 @@ module Helpers
|
|
58
68
|
# +false+ otherwise.
|
59
69
|
#
|
60
70
|
def visited_link?(link)
|
61
|
-
|
62
|
-
|
63
|
-
@agent.visited_urls.each do |visited_url|
|
64
|
-
return true if visited_url == url
|
65
|
-
end
|
71
|
+
@agent.visited?(COURSE_URL.merge(URI.encode(link)))
|
72
|
+
end
|
66
73
|
|
67
|
-
|
74
|
+
def visit_failed?(link)
|
75
|
+
@agent.failed?(COURSE_URL.merge(URI.encode(link)))
|
68
76
|
end
|
69
77
|
|
70
78
|
def should_visit_link(link)
|
@@ -78,5 +86,10 @@ module Helpers
|
|
78
86
|
def should_visit_once(link)
|
79
87
|
visited_once?(link).should == true
|
80
88
|
end
|
89
|
+
|
90
|
+
def should_fail_link(link)
|
91
|
+
visited_link?(link).should == false
|
92
|
+
visit_failed?(link).should == true
|
93
|
+
end
|
81
94
|
end
|
82
95
|
end
|
@@ -18,6 +18,10 @@
|
|
18
18
|
<li class="follow">
|
19
19
|
<a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
|
20
20
|
</li>
|
21
|
+
|
22
|
+
<li class="fail">
|
23
|
+
<a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
|
24
|
+
</li>
|
21
25
|
</ul>
|
22
26
|
</body>
|
23
27
|
</html>
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"
|
1
|
+
[{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
|
data/tasks/course.rb
CHANGED
@@ -44,6 +44,10 @@ namespace :course do
|
|
44
44
|
doc.search('.ignore//a').each do |ignore|
|
45
45
|
specs << link_to_spec.call(ignore, :behavior => :ignore)
|
46
46
|
end
|
47
|
+
|
48
|
+
doc.search('.fail//a').each do |ignore|
|
49
|
+
specs << link_to_spec.call(ignore, :behavior => :fail)
|
50
|
+
end
|
47
51
|
end
|
48
52
|
|
49
53
|
spec.write(specs.to_json)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-04-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.12.1
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|