spidr 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/{History.rdoc → ChangeLog.md} +47 -39
- data/LICENSE.txt +21 -0
- data/{README.rdoc → README.md} +57 -49
- data/Rakefile +36 -22
- data/lib/spidr/actions/actions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +3 -0
- data/lib/spidr/actions/exceptions/paused.rb +3 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +4 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +4 -0
- data/lib/spidr/agent.rb +61 -17
- data/lib/spidr/auth_credential.rb +3 -0
- data/lib/spidr/auth_store.rb +12 -8
- data/lib/spidr/cookie_jar.rb +4 -1
- data/lib/spidr/events.rb +25 -0
- data/lib/spidr/filters.rb +5 -1
- data/lib/spidr/page.rb +29 -24
- data/lib/spidr/rules.rb +4 -0
- data/lib/spidr/sanitizers.rb +4 -0
- data/lib/spidr/session_cache.rb +26 -1
- data/lib/spidr/version.rb +1 -1
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +108 -0
- data/spec/page_spec.rb +0 -1
- data/spec/session_cache.rb +58 -0
- data/spidr.gemspec +115 -0
- metadata +99 -90
- data.tar.gz.sig +0 -2
- data/Manifest.txt +0 -41
- data/tasks/spec.rb +0 -10
- data/tasks/yard.rb +0 -12
- metadata.gz.sig +0 -0
data/lib/spidr/rules.rb
CHANGED
data/lib/spidr/sanitizers.rb
CHANGED
data/lib/spidr/session_cache.rb
CHANGED
@@ -3,6 +3,9 @@ require 'spidr/spidr'
|
|
3
3
|
require 'net/http'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
8
|
+
#
|
6
9
|
class SessionCache
|
7
10
|
|
8
11
|
# Proxy to use
|
@@ -33,6 +36,27 @@ module Spidr
|
|
33
36
|
@sessions = {}
|
34
37
|
end
|
35
38
|
|
39
|
+
#
|
40
|
+
# Determines if there is an active HTTP session for a given URL.
|
41
|
+
#
|
42
|
+
# @param [URI::HTTP, String] url
|
43
|
+
# The URL that represents a session.
|
44
|
+
#
|
45
|
+
# @return [Boolean]
|
46
|
+
# Specifies whether there is an active HTTP session.
|
47
|
+
#
|
48
|
+
# @since 0.2.3
|
49
|
+
#
|
50
|
+
def active?(url)
|
51
|
+
# normalize the url
|
52
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
53
|
+
|
54
|
+
# session key
|
55
|
+
key = [url.scheme, url.host, url.port]
|
56
|
+
|
57
|
+
return @sessions.has_key?(key)
|
58
|
+
end
|
59
|
+
|
36
60
|
#
|
37
61
|
# Provides an active HTTP session for a given URL.
|
38
62
|
#
|
@@ -46,6 +70,7 @@ module Spidr
|
|
46
70
|
# normalize the url
|
47
71
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
48
72
|
|
73
|
+
# session key
|
49
74
|
key = [url.scheme, url.host, url.port]
|
50
75
|
|
51
76
|
unless @sessions[key]
|
@@ -81,10 +106,10 @@ module Spidr
|
|
81
106
|
# normalize the url
|
82
107
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
83
108
|
|
109
|
+
# session key
|
84
110
|
key = [url.scheme, url.host, url.port]
|
85
111
|
|
86
112
|
if (sess = @sessions[key])
|
87
|
-
|
88
113
|
begin
|
89
114
|
sess.finish
|
90
115
|
rescue IOError
|
data/lib/spidr/version.rb
CHANGED
data/spec/auth_store_spec.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'spidr/auth_store'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe AuthStore do
|
6
|
+
before(:each) do
|
7
|
+
@auth_store = AuthStore.new
|
8
|
+
@uri = URI('http://zerosum.org/course/auth')
|
9
|
+
@auth_store.add(@uri, 'admin', 'password')
|
10
|
+
end
|
11
|
+
|
12
|
+
after(:each) do
|
13
|
+
@auth_store.clear!
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should retrieve auth credentials for the URL' do
|
17
|
+
uri = @uri.merge('/')
|
18
|
+
|
19
|
+
@auth_store[uri] = AuthCredential.new('user1', 'pass1')
|
20
|
+
@auth_store[uri].username.should == 'user1'
|
21
|
+
@auth_store[uri].password.should == 'pass1'
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should add auth credentials for the URL' do
|
25
|
+
uri = @uri.merge('/')
|
26
|
+
|
27
|
+
lambda {
|
28
|
+
@auth_store.add(uri, 'user1', 'pass1')
|
29
|
+
}.should change(@auth_store, :size)
|
30
|
+
|
31
|
+
@auth_store[uri].username.should == 'user1'
|
32
|
+
@auth_store[uri].password.should == 'pass1'
|
33
|
+
end
|
34
|
+
|
35
|
+
describe 'matching' do
|
36
|
+
it 'should match a longer URL to the base' do
|
37
|
+
uri = @uri.merge('/course/auth/protected.html')
|
38
|
+
|
39
|
+
@auth_store[uri].username.should == 'admin'
|
40
|
+
@auth_store[uri].password.should == 'password'
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should match the longest of all matching URLs' do
|
44
|
+
@auth_store.add(@uri.merge('/course'), 'user1', 'pass1')
|
45
|
+
@auth_store.add(@uri.merge('/course/auth/special'), 'user2', 'pass2')
|
46
|
+
@auth_store.add(@uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
|
47
|
+
|
48
|
+
auth = @auth_store[@uri.merge('/course/auth/special/1.html')]
|
49
|
+
auth.username.should == 'user2'
|
50
|
+
auth.password.should == 'pass2'
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should not match a URL with a different host' do
|
54
|
+
uri = URI('http://spidr.rubyforge.org/course/auth')
|
55
|
+
@auth_store[uri].should be_nil
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should not match a URL with an alternate path' do
|
59
|
+
uri = @uri.merge('/course/admin/protected.html')
|
60
|
+
@auth_store[uri].should be_nil
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'should override previous auth credentials' do
|
65
|
+
@auth_store.add(@uri, 'newuser', 'newpass')
|
66
|
+
|
67
|
+
@auth_store[@uri].username.should == 'newuser'
|
68
|
+
@auth_store[@uri].password.should == 'newpass'
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should clear all cookies' do
|
72
|
+
@auth_store.clear!
|
73
|
+
@auth_store.size.should == 0
|
74
|
+
end
|
75
|
+
|
76
|
+
describe 'for_url' do
|
77
|
+
it 'should return nil if no authorization exists' do
|
78
|
+
@auth_store.for_url(URI('http://php.net')).should be_nil
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'should create an encoded authorization string' do
|
82
|
+
@auth_store.for_url(@uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/spec/cookie_jar_spec.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'spidr/cookie_jar'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe CookieJar do
|
6
|
+
before(:each) do
|
7
|
+
@cookie_jar = CookieJar.new
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should retrieve cookies for the named host" do
|
11
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
12
|
+
|
13
|
+
@cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should add a cookie to the jar" do
|
17
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
18
|
+
|
19
|
+
@cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should merge new cookies into the jar" do
|
23
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
24
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
25
|
+
|
26
|
+
@cookie_jar['zerosum.org'].should == {
|
27
|
+
'admin' => 'ofcourseiam',
|
28
|
+
'other' => '1'
|
29
|
+
}
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should override previous cookies in the jar" do
|
33
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
34
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
|
35
|
+
|
36
|
+
@cookie_jar['zerosum.org'].should == {
|
37
|
+
'admin' => 'somethingcompletelydifferent'
|
38
|
+
}
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should clear all cookies" do
|
42
|
+
@cookie_jar['zerosum.org'] = {'cookie' => 'foobar'}
|
43
|
+
@cookie_jar.clear!
|
44
|
+
|
45
|
+
@cookie_jar.size.should == 0
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "dirty" do
|
49
|
+
before(:each) do
|
50
|
+
@cookie_jar = CookieJar.new
|
51
|
+
@dirty = @cookie_jar.instance_variable_get('@dirty')
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should mark a cookie dirty after adding new params" do
|
55
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
56
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
57
|
+
|
58
|
+
@dirty.include?('zerosum.org').should == true
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should mark a cookie dirty after overriding params" do
|
62
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
63
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'nope'}
|
64
|
+
|
65
|
+
@dirty.include?('zerosum.org').should == true
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should un-mark a cookie as dirty after re-encoding it" do
|
69
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
70
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'nope'}
|
71
|
+
|
72
|
+
@dirty.include?('zerosum.org').should == true
|
73
|
+
|
74
|
+
@cookie_jar.for_host('zerosum.org')
|
75
|
+
|
76
|
+
@dirty.include?('zerosum.org').should == false
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
describe "for_host" do
|
81
|
+
before(:each) do
|
82
|
+
@cookie_jar = CookieJar.new
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should return nil for unknown hosts" do
|
86
|
+
@cookie_jar.for_host('lol.com').should be_nil
|
87
|
+
end
|
88
|
+
|
89
|
+
it "should return nil for hosts with no cookie params" do
|
90
|
+
@cookie_jar['lol.com'] = {}
|
91
|
+
|
92
|
+
@cookie_jar.for_host('lol.com').should be_nil
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should encode single cookie params" do
|
96
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
97
|
+
|
98
|
+
@cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam'
|
99
|
+
end
|
100
|
+
|
101
|
+
it "should encode multiple cookie params" do
|
102
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
103
|
+
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
104
|
+
|
105
|
+
@cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
data/spec/page_spec.rb
CHANGED
data/spec/session_cache.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spidr/session_cache'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe SessionCache do
|
6
|
+
describe "empty" do
|
7
|
+
before(:all) do
|
8
|
+
@sessions = SessionCache.new
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should not have any active sessions" do
|
12
|
+
@sessions.should_not be_active(URI('http://example.com/'))
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should start new sessions on-demand" do
|
16
|
+
@sessions[URI('http://example.com/')].should_not be_nil
|
17
|
+
end
|
18
|
+
|
19
|
+
after(:all) do
|
20
|
+
@sessions.clear
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
describe "not-empty" do
|
25
|
+
before(:all) do
|
26
|
+
@url = URI('http://example.com/')
|
27
|
+
|
28
|
+
@sessions = SessionCache.new
|
29
|
+
@sessions[@url]
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should have active sessions" do
|
33
|
+
@sessions.should be_active(@url)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should provide access to sessions" do
|
37
|
+
@sessions[@url].should_not be_nil
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should start new sessions on-demand" do
|
41
|
+
url2 = URI('http://www.w3c.org/')
|
42
|
+
|
43
|
+
@sessions[url2].should_not be_nil
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should be able to kill sessions" do
|
47
|
+
url2 = URI('http://www.w3c.org/')
|
48
|
+
|
49
|
+
@sessions[url2].should_not be_nil
|
50
|
+
@sessions.kill!(url2)
|
51
|
+
@sessions.should_not be_active(url2)
|
52
|
+
end
|
53
|
+
|
54
|
+
after(:all) do
|
55
|
+
@sessions.clear
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
data/spidr.gemspec
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{spidr}
|
8
|
+
s.version = "0.2.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Postmodern"]
|
12
|
+
s.date = %q{2010-02-27}
|
13
|
+
s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
14
|
+
s.email = %q{postmodern.mod3@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"ChangeLog.md",
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".gitignore",
|
22
|
+
".specopts",
|
23
|
+
".yardopts",
|
24
|
+
"ChangeLog.md",
|
25
|
+
"LICENSE.txt",
|
26
|
+
"README.md",
|
27
|
+
"Rakefile",
|
28
|
+
"lib/spidr.rb",
|
29
|
+
"lib/spidr/actions.rb",
|
30
|
+
"lib/spidr/actions/actions.rb",
|
31
|
+
"lib/spidr/actions/exceptions.rb",
|
32
|
+
"lib/spidr/actions/exceptions/action.rb",
|
33
|
+
"lib/spidr/actions/exceptions/paused.rb",
|
34
|
+
"lib/spidr/actions/exceptions/skip_link.rb",
|
35
|
+
"lib/spidr/actions/exceptions/skip_page.rb",
|
36
|
+
"lib/spidr/agent.rb",
|
37
|
+
"lib/spidr/auth_credential.rb",
|
38
|
+
"lib/spidr/auth_store.rb",
|
39
|
+
"lib/spidr/cookie_jar.rb",
|
40
|
+
"lib/spidr/events.rb",
|
41
|
+
"lib/spidr/extensions.rb",
|
42
|
+
"lib/spidr/extensions/uri.rb",
|
43
|
+
"lib/spidr/filters.rb",
|
44
|
+
"lib/spidr/page.rb",
|
45
|
+
"lib/spidr/rules.rb",
|
46
|
+
"lib/spidr/sanitizers.rb",
|
47
|
+
"lib/spidr/session_cache.rb",
|
48
|
+
"lib/spidr/spidr.rb",
|
49
|
+
"lib/spidr/version.rb",
|
50
|
+
"spec/actions_spec.rb",
|
51
|
+
"spec/agent_spec.rb",
|
52
|
+
"spec/auth_store_spec.rb",
|
53
|
+
"spec/cookie_jar_spec.rb",
|
54
|
+
"spec/extensions/uri_spec.rb",
|
55
|
+
"spec/filters_spec.rb",
|
56
|
+
"spec/helpers/history.rb",
|
57
|
+
"spec/helpers/page.rb",
|
58
|
+
"spec/helpers/wsoc.rb",
|
59
|
+
"spec/page_examples.rb",
|
60
|
+
"spec/page_spec.rb",
|
61
|
+
"spec/rules_spec.rb",
|
62
|
+
"spec/sanitizers_spec.rb",
|
63
|
+
"spec/session_cache.rb",
|
64
|
+
"spec/spec_helper.rb",
|
65
|
+
"spec/spidr_spec.rb",
|
66
|
+
"spidr.gemspec"
|
67
|
+
]
|
68
|
+
s.has_rdoc = %q{yard}
|
69
|
+
s.homepage = %q{http://github.com/postmodern/spidr}
|
70
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
71
|
+
s.require_paths = ["lib"]
|
72
|
+
s.rubygems_version = %q{1.3.6}
|
73
|
+
s.summary = %q{A versatile Ruby web spidering library}
|
74
|
+
s.test_files = [
|
75
|
+
"spec/agent_spec.rb",
|
76
|
+
"spec/helpers/history.rb",
|
77
|
+
"spec/helpers/wsoc.rb",
|
78
|
+
"spec/helpers/page.rb",
|
79
|
+
"spec/spec_helper.rb",
|
80
|
+
"spec/extensions/uri_spec.rb",
|
81
|
+
"spec/page_spec.rb",
|
82
|
+
"spec/spidr_spec.rb",
|
83
|
+
"spec/sanitizers_spec.rb",
|
84
|
+
"spec/page_examples.rb",
|
85
|
+
"spec/filters_spec.rb",
|
86
|
+
"spec/actions_spec.rb",
|
87
|
+
"spec/rules_spec.rb",
|
88
|
+
"spec/auth_store_spec.rb",
|
89
|
+
"spec/cookie_jar_spec.rb",
|
90
|
+
"spec/session_cache.rb"
|
91
|
+
]
|
92
|
+
|
93
|
+
if s.respond_to? :specification_version then
|
94
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
95
|
+
s.specification_version = 3
|
96
|
+
|
97
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
98
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.2.0"])
|
99
|
+
s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
|
100
|
+
s.add_development_dependency(%q<yard>, [">= 0.5.3"])
|
101
|
+
s.add_development_dependency(%q<wsoc>, [">= 0.1.1"])
|
102
|
+
else
|
103
|
+
s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
|
104
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
105
|
+
s.add_dependency(%q<yard>, [">= 0.5.3"])
|
106
|
+
s.add_dependency(%q<wsoc>, [">= 0.1.1"])
|
107
|
+
end
|
108
|
+
else
|
109
|
+
s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
|
110
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
111
|
+
s.add_dependency(%q<yard>, [">= 0.5.3"])
|
112
|
+
s.add_dependency(%q<wsoc>, [">= 0.1.1"])
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|