spidr 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ require 'uri'
3
3
  module Spidr
4
4
  #
5
5
  # The {Sanitizers} module adds methods to {Agent} which control the
6
- # sanitization of incoming links.
6
+ # sanitation of incoming links.
7
7
  #
8
8
  module Sanitizers
9
9
  def self.included(base)
@@ -17,7 +17,7 @@ module Spidr
17
17
  end
18
18
 
19
19
  #
20
- # Initializes the sanitization rules.
20
+ # Initializes the Sanitizer rules.
21
21
  #
22
22
  # @param [Hash] options
23
23
  # Additional options.
@@ -52,7 +52,7 @@ module Spidr
52
52
  # @since 0.2.2
53
53
  #
54
54
  def sanitize_url(url)
55
- url = URI(url.to_s)
55
+ url = URI(url.to_s) unless url.kind_of?(URI)
56
56
 
57
57
  url.fragment = nil if @strip_fragments
58
58
  url.query = nil if @strip_query
@@ -84,6 +84,7 @@ module Spidr
84
84
  if url.scheme == 'https'
85
85
  session.use_ssl = true
86
86
  session.verify_mode = OpenSSL::SSL::VERIFY_NONE
87
+ session.start
87
88
  end
88
89
 
89
90
  @sessions[key] = session
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.7'
3
+ VERSION = '0.3.0'
4
4
  end
data/spec/actions_spec.rb CHANGED
@@ -4,9 +4,7 @@ require 'spidr/agent'
4
4
  require 'spec_helper'
5
5
 
6
6
  describe Actions do
7
- before(:all) do
8
- @url = URI('http://spidr.rubyforge.org/')
9
- end
7
+ let(:url) { URI('http://spidr.rubyforge.org/') }
10
8
 
11
9
  it "should be able to pause spidering" do
12
10
  count = 0
@@ -28,10 +26,10 @@ describe Actions do
28
26
  end
29
27
  end
30
28
 
31
- agent.enqueue(@url)
29
+ agent.enqueue(url)
32
30
  agent.continue!
33
31
 
34
- agent.visited?(@url).should == true
32
+ agent.visited?(url).should == true
35
33
  end
36
34
 
37
35
  it "should allow skipping of enqueued links" do
@@ -41,7 +39,7 @@ describe Actions do
41
39
  end
42
40
  end
43
41
 
44
- agent.enqueue(@url)
42
+ agent.enqueue(url)
45
43
 
46
44
  agent.queue.should be_empty
47
45
  end
@@ -53,9 +51,9 @@ describe Actions do
53
51
  end
54
52
  end
55
53
 
56
- agent.visit_page(@url)
54
+ agent.visit_page(url)
57
55
 
58
- agent.history.should == Set[@url]
56
+ agent.history.should == Set[url]
59
57
  agent.queue.should be_empty
60
58
  end
61
59
  end
@@ -3,10 +3,12 @@ require 'spidr/auth_store'
3
3
  require 'spec_helper'
4
4
 
5
5
  describe AuthStore do
6
+ let(:root_uri) { URI('http://zerosum.org/') }
7
+ let(:uri) { root_uri.merge('/course/auth') }
8
+
6
9
  before(:each) do
7
10
  @auth_store = AuthStore.new
8
- @uri = URI('http://zerosum.org/course/auth')
9
- @auth_store.add(@uri, 'admin', 'password')
11
+ @auth_store.add(uri, 'admin', 'password')
10
12
  end
11
13
 
12
14
  after(:each) do
@@ -14,58 +16,56 @@ describe AuthStore do
14
16
  end
15
17
 
16
18
  it 'should retrieve auth credentials for the URL' do
17
- uri = @uri.merge('/')
18
-
19
- @auth_store[uri] = AuthCredential.new('user1', 'pass1')
20
- @auth_store[uri].username.should == 'user1'
21
- @auth_store[uri].password.should == 'pass1'
19
+ @auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
20
+ @auth_store[root_uri].username.should == 'user1'
21
+ @auth_store[root_uri].password.should == 'pass1'
22
22
  end
23
23
 
24
24
  it 'should add auth credentials for the URL' do
25
- uri = @uri.merge('/')
26
-
27
25
  lambda {
28
- @auth_store.add(uri, 'user1', 'pass1')
26
+ @auth_store.add(root_uri, 'user1', 'pass1')
29
27
  }.should change(@auth_store, :size)
30
28
 
31
- @auth_store[uri].username.should == 'user1'
32
- @auth_store[uri].password.should == 'pass1'
29
+ @auth_store[root_uri].username.should == 'user1'
30
+ @auth_store[root_uri].password.should == 'pass1'
33
31
  end
34
32
 
35
33
  describe 'matching' do
36
- it 'should match a longer URL to the base' do
37
- uri = @uri.merge('/course/auth/protected.html')
34
+ let(:sub_uri) { uri.merge('/course/auth/protected.html') }
38
35
 
39
- @auth_store[uri].username.should == 'admin'
40
- @auth_store[uri].password.should == 'password'
36
+ it 'should match a longer URL to the base' do
37
+ @auth_store[sub_uri].username.should == 'admin'
38
+ @auth_store[sub_uri].password.should == 'password'
41
39
  end
42
40
 
43
41
  it 'should match the longest of all matching URLs' do
44
- @auth_store.add(@uri.merge('/course'), 'user1', 'pass1')
45
- @auth_store.add(@uri.merge('/course/auth/special'), 'user2', 'pass2')
46
- @auth_store.add(@uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
42
+ @auth_store.add(uri.merge('/course'), 'user1', 'pass1')
43
+ @auth_store.add(uri.merge('/course/auth/special'), 'user2', 'pass2')
44
+ @auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
47
45
 
48
- auth = @auth_store[@uri.merge('/course/auth/special/1.html')]
46
+ auth = @auth_store[uri.merge('/course/auth/special/1.html')]
49
47
  auth.username.should == 'user2'
50
48
  auth.password.should == 'pass2'
51
49
  end
52
50
 
53
51
  it 'should not match a URL with a different host' do
54
- uri = URI('http://spidr.rubyforge.org/course/auth')
55
- @auth_store[uri].should be_nil
52
+ remote_uri = URI('http://spidr.rubyforge.org/course/auth')
53
+
54
+ @auth_store[remote_uri].should be_nil
56
55
  end
57
56
 
58
57
  it 'should not match a URL with an alternate path' do
59
- uri = @uri.merge('/course/admin/protected.html')
60
- @auth_store[uri].should be_nil
58
+ relative_uri = uri.merge('/course/admin/protected.html')
59
+
60
+ @auth_store[relative_uri].should be_nil
61
61
  end
62
62
  end
63
63
 
64
64
  it 'should override previous auth credentials' do
65
- @auth_store.add(@uri, 'newuser', 'newpass')
65
+ @auth_store.add(uri, 'newuser', 'newpass')
66
66
 
67
- @auth_store[@uri].username.should == 'newuser'
68
- @auth_store[@uri].password.should == 'newpass'
67
+ @auth_store[uri].username.should == 'newuser'
68
+ @auth_store[uri].password.should == 'newpass'
69
69
  end
70
70
 
71
71
  it 'should clear all cookies' do
@@ -79,7 +79,7 @@ describe AuthStore do
79
79
  end
80
80
 
81
81
  it 'should create an encoded authorization string' do
82
- @auth_store.for_url(@uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
82
+ @auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
83
83
  end
84
84
  end
85
85
  end
@@ -3,108 +3,99 @@ require 'spidr/cookie_jar'
3
3
  require 'spec_helper'
4
4
 
5
5
  describe CookieJar do
6
- before(:each) do
7
- @cookie_jar = CookieJar.new
8
- end
9
-
10
6
  it "should retrieve cookies for the named host" do
11
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
7
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
12
8
 
13
- @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
9
+ subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
14
10
  end
15
11
 
16
12
  it "should add a cookie to the jar" do
17
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
13
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
18
14
 
19
- @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
15
+ subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
20
16
  end
21
17
 
22
18
  it "should merge new cookies into the jar" do
23
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
24
- @cookie_jar['zerosum.org'] = {'other' => '1'}
19
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
20
+ subject['zerosum.org'] = {'other' => '1'}
25
21
 
26
- @cookie_jar['zerosum.org'].should == {
22
+ subject['zerosum.org'].should == {
27
23
  'admin' => 'ofcourseiam',
28
24
  'other' => '1'
29
25
  }
30
26
  end
31
27
 
32
28
  it "should override previous cookies in the jar" do
33
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
34
- @cookie_jar['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
29
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
30
+ subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
35
31
 
36
- @cookie_jar['zerosum.org'].should == {
32
+ subject['zerosum.org'].should == {
37
33
  'admin' => 'somethingcompletelydifferent'
38
34
  }
39
35
  end
40
36
 
41
37
  it "should clear all cookies" do
42
- @cookie_jar['zerosum.org'] = {'cookie' => 'foobar'}
43
- @cookie_jar.clear!
38
+ subject['zerosum.org'] = {'cookie' => 'foobar'}
39
+ subject.clear!
44
40
 
45
- @cookie_jar.size.should == 0
41
+ subject.size.should == 0
46
42
  end
47
43
 
48
44
  describe "dirty" do
49
- before(:each) do
50
- @cookie_jar = CookieJar.new
51
- @dirty = @cookie_jar.instance_variable_get('@dirty')
52
- end
45
+ let(:dirty) { subject.instance_variable_get('@dirty') }
53
46
 
54
47
  it "should mark a cookie dirty after adding new params" do
55
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
56
- @cookie_jar['zerosum.org'] = {'other' => '1'}
48
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
49
+ subject['zerosum.org'] = {'other' => '1'}
57
50
 
58
- @dirty.include?('zerosum.org').should == true
51
+ dirty.include?('zerosum.org').should == true
59
52
  end
60
53
 
61
54
  it "should mark a cookie dirty after overriding params" do
62
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
63
- @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
55
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
56
+ subject['zerosum.org'] = {'admin' => 'nope'}
64
57
 
65
- @dirty.include?('zerosum.org').should == true
58
+ dirty.include?('zerosum.org').should == true
66
59
  end
67
60
 
68
61
  it "should un-mark a cookie as dirty after re-encoding it" do
69
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
70
- @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
62
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
63
+ subject['zerosum.org'] = {'admin' => 'nope'}
71
64
 
72
- @dirty.include?('zerosum.org').should == true
65
+ dirty.include?('zerosum.org').should == true
73
66
 
74
- @cookie_jar.for_host('zerosum.org')
67
+ subject.for_host('zerosum.org')
75
68
 
76
- @dirty.include?('zerosum.org').should == false
69
+ dirty.include?('zerosum.org').should == false
77
70
  end
78
71
  end
79
72
 
80
73
  describe "cookies_for_host" do
81
- before(:each) do
82
- @cookie_jar = CookieJar.new
83
- end
84
-
85
74
  it "should return an empty Hash for unknown hosts" do
86
- @cookie_jar.cookies_for_host('lol.com').should be_empty
75
+ subject.cookies_for_host('lol.com').should be_empty
87
76
  end
88
77
 
89
78
  it "should return an empty Hash for hosts with no cookie params" do
90
- @cookie_jar['lol.com'] = {}
79
+ subject['lol.com'] = {}
91
80
 
92
- @cookie_jar.cookies_for_host('lol.com').should be_empty
81
+ subject.cookies_for_host('lol.com').should be_empty
93
82
  end
94
83
 
95
84
  it "should return cookie parameters for the host" do
96
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
97
- @cookie_jar['zerosum.org'] = {'other' => '1'}
98
- cookie = @cookie_jar.cookies_for_host('zerosum.org')
85
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
86
+ subject['zerosum.org'] = {'other' => '1'}
87
+
88
+ cookie = subject.cookies_for_host('zerosum.org')
99
89
 
100
90
  cookie['admin'].should == 'ofcourseiam'
101
91
  cookie['other'].should == '1'
102
92
  end
103
93
 
104
94
  it "should include cookies for the parent domain" do
105
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
106
- @cookie_jar['sub.zerosum.org'] = {'other' => '1'}
107
- cookie = @cookie_jar.cookies_for_host('sub.zerosum.org')
95
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
96
+ subject['sub.zerosum.org'] = {'other' => '1'}
97
+
98
+ cookie = subject.cookies_for_host('sub.zerosum.org')
108
99
 
109
100
  cookie['admin'].should == 'ofcourseiam'
110
101
  cookie['other'].should == '1'
@@ -112,30 +103,27 @@ describe CookieJar do
112
103
  end
113
104
 
114
105
  describe "for_host" do
115
- before(:each) do
116
- @cookie_jar = CookieJar.new
117
- end
118
-
119
106
  it "should return nil for unknown hosts" do
120
- @cookie_jar.for_host('lol.com').should be_nil
107
+ subject.for_host('lol.com').should be_nil
121
108
  end
122
109
 
123
110
  it "should return nil for hosts with no cookie params" do
124
- @cookie_jar['lol.com'] = {}
111
+ subject['lol.com'] = {}
125
112
 
126
- @cookie_jar.for_host('lol.com').should be_nil
113
+ subject.for_host('lol.com').should be_nil
127
114
  end
128
115
 
129
116
  it "should encode single cookie params" do
130
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
117
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
131
118
 
132
- @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam'
119
+ subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
133
120
  end
134
121
 
135
122
  it "should encode multiple cookie params" do
136
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
137
- @cookie_jar['zerosum.org'] = {'other' => '1'}
138
- cookie = @cookie_jar.for_host('zerosum.org')
123
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
124
+ subject['zerosum.org'] = {'other' => '1'}
125
+
126
+ cookie = subject.for_host('zerosum.org')
139
127
 
140
128
  cookie.should include('admin=ofcourseiam')
141
129
  cookie.should include('; ')
@@ -143,9 +131,10 @@ describe CookieJar do
143
131
  end
144
132
 
145
133
  it "should include cookies for the parent domain" do
146
- @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
147
- @cookie_jar['sub.zerosum.org'] = {'other' => '1'}
148
- cookie = @cookie_jar.for_host('sub.zerosum.org')
134
+ subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
135
+ subject['sub.zerosum.org'] = {'other' => '1'}
136
+
137
+ cookie = subject.for_host('sub.zerosum.org')
149
138
 
150
139
  cookie.should include('admin=ofcourseiam')
151
140
  cookie.should include('; ')
@@ -35,5 +35,9 @@ describe URI do
35
35
  it "should preserve the root path" do
36
36
  URI.expand_path('/').should == '/'
37
37
  end
38
+
39
+ it "should default empty paths to the root path" do
40
+ URI.expand_path('').should == '/'
41
+ end
38
42
  end
39
43
  end
data/spec/filters_spec.rb CHANGED
@@ -13,41 +13,49 @@ describe Filters do
13
13
 
14
14
  it "should provide the hosts that will be visited" do
15
15
  agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
16
+
16
17
  agent.visit_hosts.should == ['spidr.rubyforge.org']
17
18
  end
18
19
 
19
20
  it "should provide the hosts that will not be visited" do
20
21
  agent = Agent.new(:ignore_hosts => ['example.com'])
22
+
21
23
  agent.ignore_hosts.should == ['example.com']
22
24
  end
23
25
 
24
26
  it "should provide the ports that will be visited" do
25
27
  agent = Agent.new(:ports => [80, 443, 8000])
28
+
26
29
  agent.visit_ports.should == [80, 443, 8000]
27
30
  end
28
31
 
29
32
  it "should provide the ports that will not be visited" do
30
33
  agent = Agent.new(:ignore_ports => [8000, 8080])
34
+
31
35
  agent.ignore_ports.should == [8000, 8080]
32
36
  end
33
37
 
34
38
  it "should provide the links that will be visited" do
35
39
  agent = Agent.new(:links => ['index.php'])
40
+
36
41
  agent.visit_links.should == ['index.php']
37
42
  end
38
43
 
39
44
  it "should provide the links that will not be visited" do
40
45
  agent = Agent.new(:ignore_links => [/login/])
46
+
41
47
  agent.ignore_links.should == [/login/]
42
48
  end
43
49
 
44
50
  it "should provide the exts that will be visited" do
45
51
  agent = Agent.new(:exts => ['htm'])
52
+
46
53
  agent.visit_exts.should == ['htm']
47
54
  end
48
55
 
49
56
  it "should provide the exts that will not be visited" do
50
57
  agent = Agent.new(:ignore_exts => ['cfm'])
58
+
51
59
  agent.ignore_exts.should == ['cfm']
52
60
  end
53
61
  end