spidr 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
@@ -1,2 +0,0 @@
1
- require 'spidr/actions/exceptions'
2
- require 'spidr/actions/actions'
@@ -1,4 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
- require 'spidr/actions/exceptions/paused'
3
- require 'spidr/actions/exceptions/skip_link'
4
- require 'spidr/actions/exceptions/skip_page'
@@ -1,9 +0,0 @@
1
- module Spidr
2
- module Actions
3
- #
4
- # The base {Actions} exception class.
5
- #
6
- class Action < RuntimeError
7
- end
8
- end
9
- end
@@ -1,11 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class used to pause a running {Agent}.
7
- #
8
- class Paused < Action
9
- end
10
- end
11
- end
@@ -1,12 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class which causes a running {Agent} to
7
- # skip a link.
8
- #
9
- class SkipLink < Action
10
- end
11
- end
12
- end
@@ -1,12 +0,0 @@
1
- require 'spidr/actions/exceptions/action'
2
-
3
- module Spidr
4
- module Actions
5
- #
6
- # An {Actions} exception class which causes a running {Agent} to
7
- # skip a {Page}, and all links within that page.
8
- #
9
- class SkipPage < Action
10
- end
11
- end
12
- end
@@ -1,59 +0,0 @@
1
- require 'spidr/actions'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Actions do
7
- let(:url) { URI('http://spidr.rubyforge.org/') }
8
-
9
- it "should be able to pause spidering" do
10
- count = 0
11
- agent = Agent.host('spidr.rubyforge.org') do |spider|
12
- spider.every_page do |page|
13
- count += 1
14
- spider.pause! if count >= 2
15
- end
16
- end
17
-
18
- agent.should be_paused
19
- agent.history.length.should == 2
20
- end
21
-
22
- it "should be able to continue spidering after being paused" do
23
- agent = Agent.new do |spider|
24
- spider.every_page do |page|
25
- spider.pause!
26
- end
27
- end
28
-
29
- agent.enqueue(url)
30
- agent.continue!
31
-
32
- agent.visited?(url).should == true
33
- end
34
-
35
- it "should allow skipping of enqueued links" do
36
- agent = Agent.new do |spider|
37
- spider.every_url do |url|
38
- spider.skip_link!
39
- end
40
- end
41
-
42
- agent.enqueue(url)
43
-
44
- agent.queue.should be_empty
45
- end
46
-
47
- it "should allow skipping of visited pages" do
48
- agent = Agent.new do |spider|
49
- spider.every_page do |url|
50
- spider.skip_page!
51
- end
52
- end
53
-
54
- agent.visit_page(url)
55
-
56
- agent.history.should == Set[url]
57
- agent.queue.should be_empty
58
- end
59
- end
@@ -1,61 +0,0 @@
1
- require 'spidr/filters'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Filters do
7
- it "should allow setting the acceptable schemes" do
8
- agent = Agent.new
9
-
10
- agent.schemes = [:http]
11
- agent.schemes.should == ['http']
12
- end
13
-
14
- it "should provide the hosts that will be visited" do
15
- agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
16
-
17
- agent.visit_hosts.should == ['spidr.rubyforge.org']
18
- end
19
-
20
- it "should provide the hosts that will not be visited" do
21
- agent = Agent.new(:ignore_hosts => ['example.com'])
22
-
23
- agent.ignore_hosts.should == ['example.com']
24
- end
25
-
26
- it "should provide the ports that will be visited" do
27
- agent = Agent.new(:ports => [80, 443, 8000])
28
-
29
- agent.visit_ports.should == [80, 443, 8000]
30
- end
31
-
32
- it "should provide the ports that will not be visited" do
33
- agent = Agent.new(:ignore_ports => [8000, 8080])
34
-
35
- agent.ignore_ports.should == [8000, 8080]
36
- end
37
-
38
- it "should provide the links that will be visited" do
39
- agent = Agent.new(:links => ['index.php'])
40
-
41
- agent.visit_links.should == ['index.php']
42
- end
43
-
44
- it "should provide the links that will not be visited" do
45
- agent = Agent.new(:ignore_links => [/login/])
46
-
47
- agent.ignore_links.should == [/login/]
48
- end
49
-
50
- it "should provide the exts that will be visited" do
51
- agent = Agent.new(:exts => ['htm'])
52
-
53
- agent.visit_exts.should == ['htm']
54
- end
55
-
56
- it "should provide the exts that will not be visited" do
57
- agent = Agent.new(:ignore_exts => ['cfm'])
58
-
59
- agent.ignore_exts.should == ['cfm']
60
- end
61
- end
@@ -1,61 +0,0 @@
1
- require 'spidr/sanitizers'
2
- require 'spidr/agent'
3
-
4
- require 'spec_helper'
5
-
6
- describe Sanitizers do
7
- describe "sanitize_url" do
8
- let(:url) { 'http://host.com' }
9
- before(:all) { @agent = Agent.new }
10
-
11
- it "should sanitize URLs" do
12
- agent = Agent.new
13
- clean_url = agent.sanitize_url(URI(url))
14
-
15
- clean_url.host.should == 'host.com'
16
- end
17
-
18
- it "should sanitize URLs given as Strings" do
19
- agent = Agent.new
20
- clean_url = agent.sanitize_url(url)
21
-
22
- clean_url.host.should == 'host.com'
23
- end
24
- end
25
-
26
- describe "strip_fragments" do
27
- let(:url) { URI("http://host.com/page#lol") }
28
-
29
- it "should strip fragment components by default" do
30
- agent = Agent.new
31
- clean_url = agent.sanitize_url(url)
32
-
33
- clean_url.fragment.should be_nil
34
- end
35
-
36
- it "should allow perserving fragment components" do
37
- agent = Agent.new(:strip_fragments => false)
38
- clean_url = agent.sanitize_url(url)
39
-
40
- clean_url.fragment.should == 'lol'
41
- end
42
- end
43
-
44
- describe "strip_query" do
45
- let(:url) { URI("http://host.com/page?x=1") }
46
-
47
- it "should not strip query components by default" do
48
- agent = Agent.new
49
- clean_url = agent.sanitize_url(url)
50
-
51
- clean_url.query.should == 'x=1'
52
- end
53
-
54
- it "should allow stripping of query components" do
55
- agent = Agent.new(:strip_query => true)
56
- clean_url = agent.sanitize_url(url)
57
-
58
- clean_url.query.should be_nil
59
- end
60
- end
61
- end