spidr 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +56 -31
- data/Gemfile +7 -21
- data/LICENSE.txt +1 -2
- data/README.md +7 -6
- data/Rakefile +13 -23
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +1 -1
- data/lib/spidr/agent.rb +21 -6
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/body.rb +99 -0
- data/lib/spidr/extensions/uri.rb +14 -7
- data/lib/spidr/headers.rb +323 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +32 -536
- data/lib/spidr/sanitizers.rb +3 -3
- data/lib/spidr/session_cache.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/actions_spec.rb +6 -8
- data/spec/auth_store_spec.rb +28 -28
- data/spec/cookie_jar_spec.rb +49 -60
- data/spec/extensions/uri_spec.rb +4 -0
- data/spec/filters_spec.rb +8 -0
- data/spec/page_spec.rb +0 -7
- data/spec/rules_spec.rb +8 -6
- data/spec/sanitizers_spec.rb +10 -16
- data/spec/spec_helper.rb +1 -12
- data/spec/spidr_spec.rb +11 -11
- data/spidr.gemspec +11 -110
- metadata +24 -52
- data/.gitignore +0 -9
- data/.specopts +0 -1
- data/Gemfile.lock +0 -39
data/lib/spidr/sanitizers.rb
CHANGED
@@ -3,7 +3,7 @@ require 'uri'
|
|
3
3
|
module Spidr
|
4
4
|
#
|
5
5
|
# The {Sanitizers} module adds methods to {Agent} which control the
|
6
|
-
#
|
6
|
+
# sanitation of incoming links.
|
7
7
|
#
|
8
8
|
module Sanitizers
|
9
9
|
def self.included(base)
|
@@ -17,7 +17,7 @@ module Spidr
|
|
17
17
|
end
|
18
18
|
|
19
19
|
#
|
20
|
-
# Initializes the
|
20
|
+
# Initializes the Sanitizer rules.
|
21
21
|
#
|
22
22
|
# @param [Hash] options
|
23
23
|
# Additional options.
|
@@ -52,7 +52,7 @@ module Spidr
|
|
52
52
|
# @since 0.2.2
|
53
53
|
#
|
54
54
|
def sanitize_url(url)
|
55
|
-
url = URI(url.to_s)
|
55
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
56
56
|
|
57
57
|
url.fragment = nil if @strip_fragments
|
58
58
|
url.query = nil if @strip_query
|
data/lib/spidr/session_cache.rb
CHANGED
data/lib/spidr/version.rb
CHANGED
data/spec/actions_spec.rb
CHANGED
@@ -4,9 +4,7 @@ require 'spidr/agent'
|
|
4
4
|
require 'spec_helper'
|
5
5
|
|
6
6
|
describe Actions do
|
7
|
-
|
8
|
-
@url = URI('http://spidr.rubyforge.org/')
|
9
|
-
end
|
7
|
+
let(:url) { URI('http://spidr.rubyforge.org/') }
|
10
8
|
|
11
9
|
it "should be able to pause spidering" do
|
12
10
|
count = 0
|
@@ -28,10 +26,10 @@ describe Actions do
|
|
28
26
|
end
|
29
27
|
end
|
30
28
|
|
31
|
-
agent.enqueue(
|
29
|
+
agent.enqueue(url)
|
32
30
|
agent.continue!
|
33
31
|
|
34
|
-
agent.visited?(
|
32
|
+
agent.visited?(url).should == true
|
35
33
|
end
|
36
34
|
|
37
35
|
it "should allow skipping of enqueued links" do
|
@@ -41,7 +39,7 @@ describe Actions do
|
|
41
39
|
end
|
42
40
|
end
|
43
41
|
|
44
|
-
agent.enqueue(
|
42
|
+
agent.enqueue(url)
|
45
43
|
|
46
44
|
agent.queue.should be_empty
|
47
45
|
end
|
@@ -53,9 +51,9 @@ describe Actions do
|
|
53
51
|
end
|
54
52
|
end
|
55
53
|
|
56
|
-
agent.visit_page(
|
54
|
+
agent.visit_page(url)
|
57
55
|
|
58
|
-
agent.history.should == Set[
|
56
|
+
agent.history.should == Set[url]
|
59
57
|
agent.queue.should be_empty
|
60
58
|
end
|
61
59
|
end
|
data/spec/auth_store_spec.rb
CHANGED
@@ -3,10 +3,12 @@ require 'spidr/auth_store'
|
|
3
3
|
require 'spec_helper'
|
4
4
|
|
5
5
|
describe AuthStore do
|
6
|
+
let(:root_uri) { URI('http://zerosum.org/') }
|
7
|
+
let(:uri) { root_uri.merge('/course/auth') }
|
8
|
+
|
6
9
|
before(:each) do
|
7
10
|
@auth_store = AuthStore.new
|
8
|
-
@uri
|
9
|
-
@auth_store.add(@uri, 'admin', 'password')
|
11
|
+
@auth_store.add(uri, 'admin', 'password')
|
10
12
|
end
|
11
13
|
|
12
14
|
after(:each) do
|
@@ -14,58 +16,56 @@ describe AuthStore do
|
|
14
16
|
end
|
15
17
|
|
16
18
|
it 'should retrieve auth credentials for the URL' do
|
17
|
-
|
18
|
-
|
19
|
-
@auth_store[
|
20
|
-
@auth_store[uri].username.should == 'user1'
|
21
|
-
@auth_store[uri].password.should == 'pass1'
|
19
|
+
@auth_store[root_uri] = AuthCredential.new('user1', 'pass1')
|
20
|
+
@auth_store[root_uri].username.should == 'user1'
|
21
|
+
@auth_store[root_uri].password.should == 'pass1'
|
22
22
|
end
|
23
23
|
|
24
24
|
it 'should add auth credentials for the URL' do
|
25
|
-
uri = @uri.merge('/')
|
26
|
-
|
27
25
|
lambda {
|
28
|
-
@auth_store.add(
|
26
|
+
@auth_store.add(root_uri, 'user1', 'pass1')
|
29
27
|
}.should change(@auth_store, :size)
|
30
28
|
|
31
|
-
@auth_store[
|
32
|
-
@auth_store[
|
29
|
+
@auth_store[root_uri].username.should == 'user1'
|
30
|
+
@auth_store[root_uri].password.should == 'pass1'
|
33
31
|
end
|
34
32
|
|
35
33
|
describe 'matching' do
|
36
|
-
|
37
|
-
uri = @uri.merge('/course/auth/protected.html')
|
34
|
+
let(:sub_uri) { uri.merge('/course/auth/protected.html') }
|
38
35
|
|
39
|
-
|
40
|
-
@auth_store[
|
36
|
+
it 'should match a longer URL to the base' do
|
37
|
+
@auth_store[sub_uri].username.should == 'admin'
|
38
|
+
@auth_store[sub_uri].password.should == 'password'
|
41
39
|
end
|
42
40
|
|
43
41
|
it 'should match the longest of all matching URLs' do
|
44
|
-
@auth_store.add(
|
45
|
-
@auth_store.add(
|
46
|
-
@auth_store.add(
|
42
|
+
@auth_store.add(uri.merge('/course'), 'user1', 'pass1')
|
43
|
+
@auth_store.add(uri.merge('/course/auth/special'), 'user2', 'pass2')
|
44
|
+
@auth_store.add(uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
|
47
45
|
|
48
|
-
auth = @auth_store[
|
46
|
+
auth = @auth_store[uri.merge('/course/auth/special/1.html')]
|
49
47
|
auth.username.should == 'user2'
|
50
48
|
auth.password.should == 'pass2'
|
51
49
|
end
|
52
50
|
|
53
51
|
it 'should not match a URL with a different host' do
|
54
|
-
|
55
|
-
|
52
|
+
remote_uri = URI('http://spidr.rubyforge.org/course/auth')
|
53
|
+
|
54
|
+
@auth_store[remote_uri].should be_nil
|
56
55
|
end
|
57
56
|
|
58
57
|
it 'should not match a URL with an alternate path' do
|
59
|
-
|
60
|
-
|
58
|
+
relative_uri = uri.merge('/course/admin/protected.html')
|
59
|
+
|
60
|
+
@auth_store[relative_uri].should be_nil
|
61
61
|
end
|
62
62
|
end
|
63
63
|
|
64
64
|
it 'should override previous auth credentials' do
|
65
|
-
@auth_store.add(
|
65
|
+
@auth_store.add(uri, 'newuser', 'newpass')
|
66
66
|
|
67
|
-
@auth_store[
|
68
|
-
@auth_store[
|
67
|
+
@auth_store[uri].username.should == 'newuser'
|
68
|
+
@auth_store[uri].password.should == 'newpass'
|
69
69
|
end
|
70
70
|
|
71
71
|
it 'should clear all cookies' do
|
@@ -79,7 +79,7 @@ describe AuthStore do
|
|
79
79
|
end
|
80
80
|
|
81
81
|
it 'should create an encoded authorization string' do
|
82
|
-
@auth_store.for_url(
|
82
|
+
@auth_store.for_url(uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
|
83
83
|
end
|
84
84
|
end
|
85
85
|
end
|
data/spec/cookie_jar_spec.rb
CHANGED
@@ -3,108 +3,99 @@ require 'spidr/cookie_jar'
|
|
3
3
|
require 'spec_helper'
|
4
4
|
|
5
5
|
describe CookieJar do
|
6
|
-
before(:each) do
|
7
|
-
@cookie_jar = CookieJar.new
|
8
|
-
end
|
9
|
-
|
10
6
|
it "should retrieve cookies for the named host" do
|
11
|
-
|
7
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
12
8
|
|
13
|
-
|
9
|
+
subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
14
10
|
end
|
15
11
|
|
16
12
|
it "should add a cookie to the jar" do
|
17
|
-
|
13
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
18
14
|
|
19
|
-
|
15
|
+
subject['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
20
16
|
end
|
21
17
|
|
22
18
|
it "should merge new cookies into the jar" do
|
23
|
-
|
24
|
-
|
19
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
20
|
+
subject['zerosum.org'] = {'other' => '1'}
|
25
21
|
|
26
|
-
|
22
|
+
subject['zerosum.org'].should == {
|
27
23
|
'admin' => 'ofcourseiam',
|
28
24
|
'other' => '1'
|
29
25
|
}
|
30
26
|
end
|
31
27
|
|
32
28
|
it "should override previous cookies in the jar" do
|
33
|
-
|
34
|
-
|
29
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
30
|
+
subject['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
|
35
31
|
|
36
|
-
|
32
|
+
subject['zerosum.org'].should == {
|
37
33
|
'admin' => 'somethingcompletelydifferent'
|
38
34
|
}
|
39
35
|
end
|
40
36
|
|
41
37
|
it "should clear all cookies" do
|
42
|
-
|
43
|
-
|
38
|
+
subject['zerosum.org'] = {'cookie' => 'foobar'}
|
39
|
+
subject.clear!
|
44
40
|
|
45
|
-
|
41
|
+
subject.size.should == 0
|
46
42
|
end
|
47
43
|
|
48
44
|
describe "dirty" do
|
49
|
-
|
50
|
-
@cookie_jar = CookieJar.new
|
51
|
-
@dirty = @cookie_jar.instance_variable_get('@dirty')
|
52
|
-
end
|
45
|
+
let(:dirty) { subject.instance_variable_get('@dirty') }
|
53
46
|
|
54
47
|
it "should mark a cookie dirty after adding new params" do
|
55
|
-
|
56
|
-
|
48
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
49
|
+
subject['zerosum.org'] = {'other' => '1'}
|
57
50
|
|
58
|
-
|
51
|
+
dirty.include?('zerosum.org').should == true
|
59
52
|
end
|
60
53
|
|
61
54
|
it "should mark a cookie dirty after overriding params" do
|
62
|
-
|
63
|
-
|
55
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
56
|
+
subject['zerosum.org'] = {'admin' => 'nope'}
|
64
57
|
|
65
|
-
|
58
|
+
dirty.include?('zerosum.org').should == true
|
66
59
|
end
|
67
60
|
|
68
61
|
it "should un-mark a cookie as dirty after re-encoding it" do
|
69
|
-
|
70
|
-
|
62
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
63
|
+
subject['zerosum.org'] = {'admin' => 'nope'}
|
71
64
|
|
72
|
-
|
65
|
+
dirty.include?('zerosum.org').should == true
|
73
66
|
|
74
|
-
|
67
|
+
subject.for_host('zerosum.org')
|
75
68
|
|
76
|
-
|
69
|
+
dirty.include?('zerosum.org').should == false
|
77
70
|
end
|
78
71
|
end
|
79
72
|
|
80
73
|
describe "cookies_for_host" do
|
81
|
-
before(:each) do
|
82
|
-
@cookie_jar = CookieJar.new
|
83
|
-
end
|
84
|
-
|
85
74
|
it "should return an empty Hash for unknown hosts" do
|
86
|
-
|
75
|
+
subject.cookies_for_host('lol.com').should be_empty
|
87
76
|
end
|
88
77
|
|
89
78
|
it "should return an empty Hash for hosts with no cookie params" do
|
90
|
-
|
79
|
+
subject['lol.com'] = {}
|
91
80
|
|
92
|
-
|
81
|
+
subject.cookies_for_host('lol.com').should be_empty
|
93
82
|
end
|
94
83
|
|
95
84
|
it "should return cookie parameters for the host" do
|
96
|
-
|
97
|
-
|
98
|
-
|
85
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
86
|
+
subject['zerosum.org'] = {'other' => '1'}
|
87
|
+
|
88
|
+
cookie = subject.cookies_for_host('zerosum.org')
|
99
89
|
|
100
90
|
cookie['admin'].should == 'ofcourseiam'
|
101
91
|
cookie['other'].should == '1'
|
102
92
|
end
|
103
93
|
|
104
94
|
it "should include cookies for the parent domain" do
|
105
|
-
|
106
|
-
|
107
|
-
|
95
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
96
|
+
subject['sub.zerosum.org'] = {'other' => '1'}
|
97
|
+
|
98
|
+
cookie = subject.cookies_for_host('sub.zerosum.org')
|
108
99
|
|
109
100
|
cookie['admin'].should == 'ofcourseiam'
|
110
101
|
cookie['other'].should == '1'
|
@@ -112,30 +103,27 @@ describe CookieJar do
|
|
112
103
|
end
|
113
104
|
|
114
105
|
describe "for_host" do
|
115
|
-
before(:each) do
|
116
|
-
@cookie_jar = CookieJar.new
|
117
|
-
end
|
118
|
-
|
119
106
|
it "should return nil for unknown hosts" do
|
120
|
-
|
107
|
+
subject.for_host('lol.com').should be_nil
|
121
108
|
end
|
122
109
|
|
123
110
|
it "should return nil for hosts with no cookie params" do
|
124
|
-
|
111
|
+
subject['lol.com'] = {}
|
125
112
|
|
126
|
-
|
113
|
+
subject.for_host('lol.com').should be_nil
|
127
114
|
end
|
128
115
|
|
129
116
|
it "should encode single cookie params" do
|
130
|
-
|
117
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
131
118
|
|
132
|
-
|
119
|
+
subject.for_host('zerosum.org').should == 'admin=ofcourseiam'
|
133
120
|
end
|
134
121
|
|
135
122
|
it "should encode multiple cookie params" do
|
136
|
-
|
137
|
-
|
138
|
-
|
123
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
124
|
+
subject['zerosum.org'] = {'other' => '1'}
|
125
|
+
|
126
|
+
cookie = subject.for_host('zerosum.org')
|
139
127
|
|
140
128
|
cookie.should include('admin=ofcourseiam')
|
141
129
|
cookie.should include('; ')
|
@@ -143,9 +131,10 @@ describe CookieJar do
|
|
143
131
|
end
|
144
132
|
|
145
133
|
it "should include cookies for the parent domain" do
|
146
|
-
|
147
|
-
|
148
|
-
|
134
|
+
subject['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
135
|
+
subject['sub.zerosum.org'] = {'other' => '1'}
|
136
|
+
|
137
|
+
cookie = subject.for_host('sub.zerosum.org')
|
149
138
|
|
150
139
|
cookie.should include('admin=ofcourseiam')
|
151
140
|
cookie.should include('; ')
|
data/spec/extensions/uri_spec.rb
CHANGED
data/spec/filters_spec.rb
CHANGED
@@ -13,41 +13,49 @@ describe Filters do
|
|
13
13
|
|
14
14
|
it "should provide the hosts that will be visited" do
|
15
15
|
agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
|
16
|
+
|
16
17
|
agent.visit_hosts.should == ['spidr.rubyforge.org']
|
17
18
|
end
|
18
19
|
|
19
20
|
it "should provide the hosts that will not be visited" do
|
20
21
|
agent = Agent.new(:ignore_hosts => ['example.com'])
|
22
|
+
|
21
23
|
agent.ignore_hosts.should == ['example.com']
|
22
24
|
end
|
23
25
|
|
24
26
|
it "should provide the ports that will be visited" do
|
25
27
|
agent = Agent.new(:ports => [80, 443, 8000])
|
28
|
+
|
26
29
|
agent.visit_ports.should == [80, 443, 8000]
|
27
30
|
end
|
28
31
|
|
29
32
|
it "should provide the ports that will not be visited" do
|
30
33
|
agent = Agent.new(:ignore_ports => [8000, 8080])
|
34
|
+
|
31
35
|
agent.ignore_ports.should == [8000, 8080]
|
32
36
|
end
|
33
37
|
|
34
38
|
it "should provide the links that will be visited" do
|
35
39
|
agent = Agent.new(:links => ['index.php'])
|
40
|
+
|
36
41
|
agent.visit_links.should == ['index.php']
|
37
42
|
end
|
38
43
|
|
39
44
|
it "should provide the links that will not be visited" do
|
40
45
|
agent = Agent.new(:ignore_links => [/login/])
|
46
|
+
|
41
47
|
agent.ignore_links.should == [/login/]
|
42
48
|
end
|
43
49
|
|
44
50
|
it "should provide the exts that will be visited" do
|
45
51
|
agent = Agent.new(:exts => ['htm'])
|
52
|
+
|
46
53
|
agent.visit_exts.should == ['htm']
|
47
54
|
end
|
48
55
|
|
49
56
|
it "should provide the exts that will not be visited" do
|
50
57
|
agent = Agent.new(:ignore_exts => ['cfm'])
|
58
|
+
|
51
59
|
agent.ignore_exts.should == ['cfm']
|
52
60
|
end
|
53
61
|
end
|