spidr 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/{History.rdoc → ChangeLog.md} +47 -39
- data/LICENSE.txt +21 -0
- data/{README.rdoc → README.md} +57 -49
- data/Rakefile +36 -22
- data/lib/spidr/actions/actions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +3 -0
- data/lib/spidr/actions/exceptions/paused.rb +3 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +4 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +4 -0
- data/lib/spidr/agent.rb +61 -17
- data/lib/spidr/auth_credential.rb +3 -0
- data/lib/spidr/auth_store.rb +12 -8
- data/lib/spidr/cookie_jar.rb +4 -1
- data/lib/spidr/events.rb +25 -0
- data/lib/spidr/filters.rb +5 -1
- data/lib/spidr/page.rb +29 -24
- data/lib/spidr/rules.rb +4 -0
- data/lib/spidr/sanitizers.rb +4 -0
- data/lib/spidr/session_cache.rb +26 -1
- data/lib/spidr/version.rb +1 -1
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +108 -0
- data/spec/page_spec.rb +0 -1
- data/spec/session_cache.rb +58 -0
- data/spidr.gemspec +115 -0
- metadata +99 -90
- data.tar.gz.sig +0 -2
- data/Manifest.txt +0 -41
- data/tasks/spec.rb +0 -10
- data/tasks/yard.rb +0 -12
- metadata.gz.sig +0 -0
data/lib/spidr/rules.rb
CHANGED
data/lib/spidr/sanitizers.rb
CHANGED
data/lib/spidr/session_cache.rb
CHANGED
@@ -3,6 +3,9 @@ require 'spidr/spidr'
|
|
3
3
|
require 'net/http'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
8
|
+
#
|
6
9
|
class SessionCache
|
7
10
|
|
8
11
|
# Proxy to use
|
@@ -33,6 +36,27 @@ module Spidr
|
|
33
36
|
@sessions = {}
|
34
37
|
end
|
35
38
|
|
39
|
+
#
|
40
|
+
# Determines if there is an active HTTP session for a given URL.
|
41
|
+
#
|
42
|
+
# @param [URI::HTTP, String] url
|
43
|
+
# The URL that represents a session.
|
44
|
+
#
|
45
|
+
# @return [Boolean]
|
46
|
+
# Specifies whether there is an active HTTP session.
|
47
|
+
#
|
48
|
+
# @since 0.2.3
|
49
|
+
#
|
50
|
+
def active?(url)
|
51
|
+
# normalize the url
|
52
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
53
|
+
|
54
|
+
# session key
|
55
|
+
key = [url.scheme, url.host, url.port]
|
56
|
+
|
57
|
+
return @sessions.has_key?(key)
|
58
|
+
end
|
59
|
+
|
36
60
|
#
|
37
61
|
# Provides an active HTTP session for a given URL.
|
38
62
|
#
|
@@ -46,6 +70,7 @@ module Spidr
|
|
46
70
|
# normalize the url
|
47
71
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
48
72
|
|
73
|
+
# session key
|
49
74
|
key = [url.scheme, url.host, url.port]
|
50
75
|
|
51
76
|
unless @sessions[key]
|
@@ -81,10 +106,10 @@ module Spidr
|
|
81
106
|
# normalize the url
|
82
107
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
83
108
|
|
109
|
+
# session key
|
84
110
|
key = [url.scheme, url.host, url.port]
|
85
111
|
|
86
112
|
if (sess = @sessions[key])
|
87
|
-
|
88
113
|
begin
|
89
114
|
sess.finish
|
90
115
|
rescue IOError
|
data/lib/spidr/version.rb
CHANGED
data/spec/auth_store_spec.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'spidr/auth_store'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe AuthStore do
|
6
|
+
before(:each) do
|
7
|
+
@auth_store = AuthStore.new
|
8
|
+
@uri = URI('http://zerosum.org/course/auth')
|
9
|
+
@auth_store.add(@uri, 'admin', 'password')
|
10
|
+
end
|
11
|
+
|
12
|
+
after(:each) do
|
13
|
+
@auth_store.clear!
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should retrieve auth credentials for the URL' do
|
17
|
+
uri = @uri.merge('/')
|
18
|
+
|
19
|
+
@auth_store[uri] = AuthCredential.new('user1', 'pass1')
|
20
|
+
@auth_store[uri].username.should == 'user1'
|
21
|
+
@auth_store[uri].password.should == 'pass1'
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should add auth credentials for the URL' do
|
25
|
+
uri = @uri.merge('/')
|
26
|
+
|
27
|
+
lambda {
|
28
|
+
@auth_store.add(uri, 'user1', 'pass1')
|
29
|
+
}.should change(@auth_store, :size)
|
30
|
+
|
31
|
+
@auth_store[uri].username.should == 'user1'
|
32
|
+
@auth_store[uri].password.should == 'pass1'
|
33
|
+
end
|
34
|
+
|
35
|
+
describe 'matching' do
|
36
|
+
it 'should match a longer URL to the base' do
|
37
|
+
uri = @uri.merge('/course/auth/protected.html')
|
38
|
+
|
39
|
+
@auth_store[uri].username.should == 'admin'
|
40
|
+
@auth_store[uri].password.should == 'password'
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should match the longest of all matching URLs' do
|
44
|
+
@auth_store.add(@uri.merge('/course'), 'user1', 'pass1')
|
45
|
+
@auth_store.add(@uri.merge('/course/auth/special'), 'user2', 'pass2')
|
46
|
+
@auth_store.add(@uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
|
47
|
+
|
48
|
+
auth = @auth_store[@uri.merge('/course/auth/special/1.html')]
|
49
|
+
auth.username.should == 'user2'
|
50
|
+
auth.password.should == 'pass2'
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should not match a URL with a different host' do
|
54
|
+
uri = URI('http://spidr.rubyforge.org/course/auth')
|
55
|
+
@auth_store[uri].should be_nil
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should not match a URL with an alternate path' do
|
59
|
+
uri = @uri.merge('/course/admin/protected.html')
|
60
|
+
@auth_store[uri].should be_nil
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'should override previous auth credentials' do
|
65
|
+
@auth_store.add(@uri, 'newuser', 'newpass')
|
66
|
+
|
67
|
+
@auth_store[@uri].username.should == 'newuser'
|
68
|
+
@auth_store[@uri].password.should == 'newpass'
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should clear all cookies' do
|
72
|
+
@auth_store.clear!
|
73
|
+
@auth_store.size.should == 0
|
74
|
+
end
|
75
|
+
|
76
|
+
describe 'for_url' do
|
77
|
+
it 'should return nil if no authorization exists' do
|
78
|
+
@auth_store.for_url(URI('http://php.net')).should be_nil
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'should create an encoded authorization string' do
|
82
|
+
@auth_store.for_url(@uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/spec/cookie_jar_spec.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'spidr/cookie_jar'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe CookieJar do
|
6
|
+
before(:each) do
|
7
|
+
@cookie_jar = CookieJar.new
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should retrieve cookies for the named host" do
|
11
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
12
|
+
|
13
|
+
@cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should add a cookie to the jar" do
|
17
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
18
|
+
|
19
|
+
@cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should merge new cookies into the jar" do
|
23
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
24
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
25
|
+
|
26
|
+
@cookie_jar['zerosum.org'].should == {
|
27
|
+
'admin' => 'ofcourseiam',
|
28
|
+
'other' => '1'
|
29
|
+
}
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should override previous cookies in the jar" do
|
33
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
34
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
|
35
|
+
|
36
|
+
@cookie_jar['zerosum.org'].should == {
|
37
|
+
'admin' => 'somethingcompletelydifferent'
|
38
|
+
}
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should clear all cookies" do
|
42
|
+
@cookie_jar['zerosum.org'] = {'cookie' => 'foobar'}
|
43
|
+
@cookie_jar.clear!
|
44
|
+
|
45
|
+
@cookie_jar.size.should == 0
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "dirty" do
|
49
|
+
before(:each) do
|
50
|
+
@cookie_jar = CookieJar.new
|
51
|
+
@dirty = @cookie_jar.instance_variable_get('@dirty')
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should mark a cookie dirty after adding new params" do
|
55
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
56
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
57
|
+
|
58
|
+
@dirty.include?('zerosum.org').should == true
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should mark a cookie dirty after overriding params" do
|
62
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
63
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'nope'}
|
64
|
+
|
65
|
+
@dirty.include?('zerosum.org').should == true
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should un-mark a cookie as dirty after re-encoding it" do
|
69
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
70
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'nope'}
|
71
|
+
|
72
|
+
@dirty.include?('zerosum.org').should == true
|
73
|
+
|
74
|
+
@cookie_jar.for_host('zerosum.org')
|
75
|
+
|
76
|
+
@dirty.include?('zerosum.org').should == false
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
describe "for_host" do
|
81
|
+
before(:each) do
|
82
|
+
@cookie_jar = CookieJar.new
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should return nil for unknown hosts" do
|
86
|
+
@cookie_jar.for_host('lol.com').should be_nil
|
87
|
+
end
|
88
|
+
|
89
|
+
it "should return nil for hosts with no cookie params" do
|
90
|
+
@cookie_jar['lol.com'] = {}
|
91
|
+
|
92
|
+
@cookie_jar.for_host('lol.com').should be_nil
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should encode single cookie params" do
|
96
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
97
|
+
|
98
|
+
@cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam'
|
99
|
+
end
|
100
|
+
|
101
|
+
it "should encode multiple cookie params" do
|
102
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
103
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
104
|
+
|
105
|
+
@cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
data/spec/page_spec.rb
CHANGED
data/spec/session_cache.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spidr/session_cache'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe SessionCache do
|
6
|
+
describe "empty" do
|
7
|
+
before(:all) do
|
8
|
+
@sessions = SessionCache.new
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should not have any active sessions" do
|
12
|
+
@sessions.should_not be_active(URI('http://example.com/'))
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should start new sessions on-demand" do
|
16
|
+
@sessions[URI('http://example.com/')].should_not be_nil
|
17
|
+
end
|
18
|
+
|
19
|
+
after(:all) do
|
20
|
+
@sessions.clear
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
describe "not-empty" do
|
25
|
+
before(:all) do
|
26
|
+
@url = URI('http://example.com/')
|
27
|
+
|
28
|
+
@sessions = SessionCache.new
|
29
|
+
@sessions[@url]
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should have active sessions" do
|
33
|
+
@sessions.should be_active(@url)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should provide access to sessions" do
|
37
|
+
@sessions[@url].should_not be_nil
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should start new sessions on-demand" do
|
41
|
+
url2 = URI('http://www.w3c.org/')
|
42
|
+
|
43
|
+
@sessions[url2].should_not be_nil
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should be able to kill sessions" do
|
47
|
+
url2 = URI('http://www.w3c.org/')
|
48
|
+
|
49
|
+
@sessions[url2].should_not be_nil
|
50
|
+
@sessions.kill!(url2)
|
51
|
+
@sessions.should_not be_active(url2)
|
52
|
+
end
|
53
|
+
|
54
|
+
after(:all) do
|
55
|
+
@sessions.clear
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
data/spidr.gemspec
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{spidr}
|
8
|
+
s.version = "0.2.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Postmodern"]
|
12
|
+
s.date = %q{2010-02-27}
|
13
|
+
s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
14
|
+
s.email = %q{postmodern.mod3@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"ChangeLog.md",
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".gitignore",
|
22
|
+
".specopts",
|
23
|
+
".yardopts",
|
24
|
+
"ChangeLog.md",
|
25
|
+
"LICENSE.txt",
|
26
|
+
"README.md",
|
27
|
+
"Rakefile",
|
28
|
+
"lib/spidr.rb",
|
29
|
+
"lib/spidr/actions.rb",
|
30
|
+
"lib/spidr/actions/actions.rb",
|
31
|
+
"lib/spidr/actions/exceptions.rb",
|
32
|
+
"lib/spidr/actions/exceptions/action.rb",
|
33
|
+
"lib/spidr/actions/exceptions/paused.rb",
|
34
|
+
"lib/spidr/actions/exceptions/skip_link.rb",
|
35
|
+
"lib/spidr/actions/exceptions/skip_page.rb",
|
36
|
+
"lib/spidr/agent.rb",
|
37
|
+
"lib/spidr/auth_credential.rb",
|
38
|
+
"lib/spidr/auth_store.rb",
|
39
|
+
"lib/spidr/cookie_jar.rb",
|
40
|
+
"lib/spidr/events.rb",
|
41
|
+
"lib/spidr/extensions.rb",
|
42
|
+
"lib/spidr/extensions/uri.rb",
|
43
|
+
"lib/spidr/filters.rb",
|
44
|
+
"lib/spidr/page.rb",
|
45
|
+
"lib/spidr/rules.rb",
|
46
|
+
"lib/spidr/sanitizers.rb",
|
47
|
+
"lib/spidr/session_cache.rb",
|
48
|
+
"lib/spidr/spidr.rb",
|
49
|
+
"lib/spidr/version.rb",
|
50
|
+
"spec/actions_spec.rb",
|
51
|
+
"spec/agent_spec.rb",
|
52
|
+
"spec/auth_store_spec.rb",
|
53
|
+
"spec/cookie_jar_spec.rb",
|
54
|
+
"spec/extensions/uri_spec.rb",
|
55
|
+
"spec/filters_spec.rb",
|
56
|
+
"spec/helpers/history.rb",
|
57
|
+
"spec/helpers/page.rb",
|
58
|
+
"spec/helpers/wsoc.rb",
|
59
|
+
"spec/page_examples.rb",
|
60
|
+
"spec/page_spec.rb",
|
61
|
+
"spec/rules_spec.rb",
|
62
|
+
"spec/sanitizers_spec.rb",
|
63
|
+
"spec/session_cache.rb",
|
64
|
+
"spec/spec_helper.rb",
|
65
|
+
"spec/spidr_spec.rb",
|
66
|
+
"spidr.gemspec"
|
67
|
+
]
|
68
|
+
s.has_rdoc = %q{yard}
|
69
|
+
s.homepage = %q{http://github.com/postmodern/spidr}
|
70
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
71
|
+
s.require_paths = ["lib"]
|
72
|
+
s.rubygems_version = %q{1.3.6}
|
73
|
+
s.summary = %q{A versatile Ruby web spidering library}
|
74
|
+
s.test_files = [
|
75
|
+
"spec/agent_spec.rb",
|
76
|
+
"spec/helpers/history.rb",
|
77
|
+
"spec/helpers/wsoc.rb",
|
78
|
+
"spec/helpers/page.rb",
|
79
|
+
"spec/spec_helper.rb",
|
80
|
+
"spec/extensions/uri_spec.rb",
|
81
|
+
"spec/page_spec.rb",
|
82
|
+
"spec/spidr_spec.rb",
|
83
|
+
"spec/sanitizers_spec.rb",
|
84
|
+
"spec/page_examples.rb",
|
85
|
+
"spec/filters_spec.rb",
|
86
|
+
"spec/actions_spec.rb",
|
87
|
+
"spec/rules_spec.rb",
|
88
|
+
"spec/auth_store_spec.rb",
|
89
|
+
"spec/cookie_jar_spec.rb",
|
90
|
+
"spec/session_cache.rb"
|
91
|
+
]
|
92
|
+
|
93
|
+
if s.respond_to? :specification_version then
|
94
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
95
|
+
s.specification_version = 3
|
96
|
+
|
97
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
98
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.2.0"])
|
99
|
+
s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
|
100
|
+
s.add_development_dependency(%q<yard>, [">= 0.5.3"])
|
101
|
+
s.add_development_dependency(%q<wsoc>, [">= 0.1.1"])
|
102
|
+
else
|
103
|
+
s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
|
104
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
105
|
+
s.add_dependency(%q<yard>, [">= 0.5.3"])
|
106
|
+
s.add_dependency(%q<wsoc>, [">= 0.1.1"])
|
107
|
+
end
|
108
|
+
else
|
109
|
+
s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
|
110
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
111
|
+
s.add_dependency(%q<yard>, [">= 0.5.3"])
|
112
|
+
s.add_dependency(%q<wsoc>, [">= 0.1.1"])
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|