spidr 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
@@ -1,2 +0,0 @@
1
- require 'spidr/actions/exceptions'
2
- require 'spidr/actions/actions'
@@ -1,4 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
- require 'spidr/actions/exceptions/paused'
3
- require 'spidr/actions/exceptions/skip_link'
4
- require 'spidr/actions/exceptions/skip_page'
@@ -1,9 +0,0 @@
1
- module Spidr
2
- module Actions
3
- #
4
- # The base {Actions} exception class.
5
- #
6
- class Action < RuntimeError
7
- end
8
- end
9
- end
@@ -1,11 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class used to pause a running {Agent}.
7
- #
8
- class Paused < Action
9
- end
10
- end
11
- end
@@ -1,12 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class which causes a running {Agent} to
7
- # skip a link.
8
- #
9
- class SkipLink < Action
10
- end
11
- end
12
- end
@@ -1,12 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class which causes a running {Agent} to
7
- # skip a {Page}, and all links within that page.
8
- #
9
- class SkipPage < Action
10
- end
11
- end
12
- end
@@ -1,59 +0,0 @@
1
- require 'spidr/actions'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Actions do
7
- let(:url) { URI('http://spidr.rubyforge.org/') }
8
-
9
- it "should be able to pause spidering" do
10
- count = 0
11
- agent = Agent.host('spidr.rubyforge.org') do |spider|
12
- spider.every_page do |page|
13
- count += 1
14
- spider.pause! if count >= 2
15
- end
16
- end
17
-
18
- agent.should be_paused
19
- agent.history.length.should == 2
20
- end
21
-
22
- it "should be able to continue spidering after being paused" do
23
- agent = Agent.new do |spider|
24
- spider.every_page do |page|
25
- spider.pause!
26
- end
27
- end
28
-
29
- agent.enqueue(url)
30
- agent.continue!
31
-
32
- agent.visited?(url).should == true
33
- end
34
-
35
- it "should allow skipping of enqueued links" do
36
- agent = Agent.new do |spider|
37
- spider.every_url do |url|
38
- spider.skip_link!
39
- end
40
- end
41
-
42
- agent.enqueue(url)
43
-
44
- agent.queue.should be_empty
45
- end
46
-
47
- it "should allow skipping of visited pages" do
48
- agent = Agent.new do |spider|
49
- spider.every_page do |url|
50
- spider.skip_page!
51
- end
52
- end
53
-
54
- agent.visit_page(url)
55
-
56
- agent.history.should == Set[url]
57
- agent.queue.should be_empty
58
- end
59
- end
@@ -1,61 +0,0 @@
1
- require 'spidr/filters'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Filters do
7
- it "should allow setting the acceptable schemes" do
8
- agent = Agent.new
9
-
10
- agent.schemes = [:http]
11
- agent.schemes.should == ['http']
12
- end
13
-
14
- it "should provide the hosts that will be visited" do
15
- agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
16
-
17
- agent.visit_hosts.should == ['spidr.rubyforge.org']
18
- end
19
-
20
- it "should provide the hosts that will not be visited" do
21
- agent = Agent.new(:ignore_hosts => ['example.com'])
22
-
23
- agent.ignore_hosts.should == ['example.com']
24
- end
25
-
26
- it "should provide the ports that will be visited" do
27
- agent = Agent.new(:ports => [80, 443, 8000])
28
-
29
- agent.visit_ports.should == [80, 443, 8000]
30
- end
31
-
32
- it "should provide the ports that will not be visited" do
33
- agent = Agent.new(:ignore_ports => [8000, 8080])
34
-
35
- agent.ignore_ports.should == [8000, 8080]
36
- end
37
-
38
- it "should provide the links that will be visited" do
39
- agent = Agent.new(:links => ['index.php'])
40
-
41
- agent.visit_links.should == ['index.php']
42
- end
43
-
44
- it "should provide the links that will not be visited" do
45
- agent = Agent.new(:ignore_links => [/login/])
46
-
47
- agent.ignore_links.should == [/login/]
48
- end
49
-
50
- it "should provide the exts that will be visited" do
51
- agent = Agent.new(:exts => ['htm'])
52
-
53
- agent.visit_exts.should == ['htm']
54
- end
55
-
56
- it "should provide the exts that will not be visited" do
57
- agent = Agent.new(:ignore_exts => ['cfm'])
58
-
59
- agent.ignore_exts.should == ['cfm']
60
- end
61
- end
@@ -1,61 +0,0 @@
1
- require 'spidr/sanitizers'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Sanitizers do
7
- describe "sanitize_url" do
8
- let(:url) { 'http://host.com' }
9
- before(:all) { @agent = Agent.new }
10
-
11
- it "should sanitize URLs" do
12
- agent = Agent.new
13
- clean_url = agent.sanitize_url(URI(url))
14
-
15
- clean_url.host.should == 'host.com'
16
- end
17
-
18
- it "should sanitize URLs given as Strings" do
19
- agent = Agent.new
20
- clean_url = agent.sanitize_url(url)
21
-
22
- clean_url.host.should == 'host.com'
23
- end
24
- end
25
-
26
- describe "strip_fragments" do
27
- let(:url) { URI("http://host.com/page#lol") }
28
-
29
- it "should strip fragment components by default" do
30
- agent = Agent.new
31
- clean_url = agent.sanitize_url(url)
32
-
33
- clean_url.fragment.should be_nil
34
- end
35
-
36
- it "should allow perserving fragment components" do
37
- agent = Agent.new(:strip_fragments => false)
38
- clean_url = agent.sanitize_url(url)
39
-
40
- clean_url.fragment.should == 'lol'
41
- end
42
- end
43
-
44
- describe "strip_query" do
45
- let(:url) { URI("http://host.com/page?x=1") }
46
-
47
- it "should not strip query components by default" do
48
- agent = Agent.new
49
- clean_url = agent.sanitize_url(url)
50
-
51
- clean_url.query.should == 'x=1'
52
- end
53
-
54
- it "should allow stripping of query components" do
55
- agent = Agent.new(:strip_query => true)
56
- clean_url = agent.sanitize_url(url)
57
-
58
- clean_url.query.should be_nil
59
- end
60
- end
61
- end