spidr 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/rules.rb CHANGED
@@ -1,4 +1,8 @@
1
1
  module Spidr
2
+ #
3
+ # The {Rules} class represents collections of acceptance and rejection
4
+ # rules, which are used to filter data.
5
+ #
2
6
  class Rules
3
7
 
4
8
  # Accept rules
data/lib/spidr/sanitizers.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  require 'uri'
2
2
 
3
3
  module Spidr
4
+ #
5
+ # The {Sanitizers} module adds methods to {Agent} which control the
6
+ # sanitization of incoming links.
7
+ #
4
8
  module Sanitizers
5
9
  def self.included(base)
6
10
  base.module_eval do
data/lib/spidr/session_cache.rb CHANGED
@@ -3,6 +3,9 @@ require 'spidr/spidr'
3
3
  require 'net/http'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # Stores active HTTP Sessions organized by scheme, host-name and port.
8
+ #
6
9
  class SessionCache
7
10
 
8
11
  # Proxy to use
@@ -33,6 +36,27 @@ module Spidr
33
36
  @sessions = {}
34
37
  end
35
38
 
39
+ #
40
+ # Determines if there is an active HTTP session for a given URL.
41
+ #
42
+ # @param [URI::HTTP, String] url
43
+ # The URL that represents a session.
44
+ #
45
+ # @return [Boolean]
46
+ # Specifies whether there is an active HTTP session.
47
+ #
48
+ # @since 0.2.3
49
+ #
50
+ def active?(url)
51
+ # normalize the url
52
+ url = URI(url.to_s) unless url.kind_of?(URI)
53
+
54
+ # session key
55
+ key = [url.scheme, url.host, url.port]
56
+
57
+ return @sessions.has_key?(key)
58
+ end
59
+
36
60
  #
37
61
  # Provides an active HTTP session for a given URL.
38
62
  #
@@ -46,6 +70,7 @@ module Spidr
46
70
  # normalize the url
47
71
  url = URI(url.to_s) unless url.kind_of?(URI)
48
72
 
73
+ # session key
49
74
  key = [url.scheme, url.host, url.port]
50
75
 
51
76
  unless @sessions[key]
@@ -81,10 +106,10 @@ module Spidr
81
106
  # normalize the url
82
107
  url = URI(url.to_s) unless url.kind_of?(URI)
83
108
 
109
+ # session key
84
110
  key = [url.scheme, url.host, url.port]
85
111
 
86
112
  if (sess = @sessions[key])
87
-
88
113
  begin
89
114
  sess.finish
90
115
  rescue IOError
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.2'
3
+ VERSION = '0.2.3'
4
4
  end
data/spec/auth_store_spec.rb ADDED
@@ -0,0 +1,85 @@
1
+ require 'spidr/auth_store'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe AuthStore do
6
+ before(:each) do
7
+ @auth_store = AuthStore.new
8
+ @uri = URI('http://zerosum.org/course/auth')
9
+ @auth_store.add(@uri, 'admin', 'password')
10
+ end
11
+
12
+ after(:each) do
13
+ @auth_store.clear!
14
+ end
15
+
16
+ it 'should retrieve auth credentials for the URL' do
17
+ uri = @uri.merge('/')
18
+
19
+ @auth_store[uri] = AuthCredential.new('user1', 'pass1')
20
+ @auth_store[uri].username.should == 'user1'
21
+ @auth_store[uri].password.should == 'pass1'
22
+ end
23
+
24
+ it 'should add auth credentials for the URL' do
25
+ uri = @uri.merge('/')
26
+
27
+ lambda {
28
+ @auth_store.add(uri, 'user1', 'pass1')
29
+ }.should change(@auth_store, :size)
30
+
31
+ @auth_store[uri].username.should == 'user1'
32
+ @auth_store[uri].password.should == 'pass1'
33
+ end
34
+
35
+ describe 'matching' do
36
+ it 'should match a longer URL to the base' do
37
+ uri = @uri.merge('/course/auth/protected.html')
38
+
39
+ @auth_store[uri].username.should == 'admin'
40
+ @auth_store[uri].password.should == 'password'
41
+ end
42
+
43
+ it 'should match the longest of all matching URLs' do
44
+ @auth_store.add(@uri.merge('/course'), 'user1', 'pass1')
45
+ @auth_store.add(@uri.merge('/course/auth/special'), 'user2', 'pass2')
46
+ @auth_store.add(@uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
47
+
48
+ auth = @auth_store[@uri.merge('/course/auth/special/1.html')]
49
+ auth.username.should == 'user2'
50
+ auth.password.should == 'pass2'
51
+ end
52
+
53
+ it 'should not match a URL with a different host' do
54
+ uri = URI('http://spidr.rubyforge.org/course/auth')
55
+ @auth_store[uri].should be_nil
56
+ end
57
+
58
+ it 'should not match a URL with an alternate path' do
59
+ uri = @uri.merge('/course/admin/protected.html')
60
+ @auth_store[uri].should be_nil
61
+ end
62
+ end
63
+
64
+ it 'should override previous auth credentials' do
65
+ @auth_store.add(@uri, 'newuser', 'newpass')
66
+
67
+ @auth_store[@uri].username.should == 'newuser'
68
+ @auth_store[@uri].password.should == 'newpass'
69
+ end
70
+
71
+ it 'should clear all cookies' do
72
+ @auth_store.clear!
73
+ @auth_store.size.should == 0
74
+ end
75
+
76
+ describe 'for_url' do
77
+ it 'should return nil if no authorization exists' do
78
+ @auth_store.for_url(URI('http://php.net')).should be_nil
79
+ end
80
+
81
+ it 'should create an encoded authorization string' do
82
+ @auth_store.for_url(@uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
83
+ end
84
+ end
85
+ end
data/spec/cookie_jar_spec.rb ADDED
@@ -0,0 +1,108 @@
1
+ require 'spidr/cookie_jar'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe CookieJar do
6
+ before(:each) do
7
+ @cookie_jar = CookieJar.new
8
+ end
9
+
10
+ it "should retrieve cookies for the named host" do
11
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
12
+
13
+ @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
14
+ end
15
+
16
+ it "should add a cookie to the jar" do
17
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
18
+
19
+ @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
20
+ end
21
+
22
+ it "should merge new cookies into the jar" do
23
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
24
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
25
+
26
+ @cookie_jar['zerosum.org'].should == {
27
+ 'admin' => 'ofcourseiam',
28
+ 'other' => '1'
29
+ }
30
+ end
31
+
32
+ it "should override previous cookies in the jar" do
33
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
34
+ @cookie_jar['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
35
+
36
+ @cookie_jar['zerosum.org'].should == {
37
+ 'admin' => 'somethingcompletelydifferent'
38
+ }
39
+ end
40
+
41
+ it "should clear all cookies" do
42
+ @cookie_jar['zerosum.org'] = {'cookie' => 'foobar'}
43
+ @cookie_jar.clear!
44
+
45
+ @cookie_jar.size.should == 0
46
+ end
47
+
48
+ describe "dirty" do
49
+ before(:each) do
50
+ @cookie_jar = CookieJar.new
51
+ @dirty = @cookie_jar.instance_variable_get('@dirty')
52
+ end
53
+
54
+ it "should mark a cookie dirty after adding new params" do
55
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
56
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
57
+
58
+ @dirty.include?('zerosum.org').should == true
59
+ end
60
+
61
+ it "should mark a cookie dirty after overriding params" do
62
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
63
+ @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
64
+
65
+ @dirty.include?('zerosum.org').should == true
66
+ end
67
+
68
+ it "should un-mark a cookie as dirty after re-encoding it" do
69
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
70
+ @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
71
+
72
+ @dirty.include?('zerosum.org').should == true
73
+
74
+ @cookie_jar.for_host('zerosum.org')
75
+
76
+ @dirty.include?('zerosum.org').should == false
77
+ end
78
+ end
79
+
80
+ describe "for_host" do
81
+ before(:each) do
82
+ @cookie_jar = CookieJar.new
83
+ end
84
+
85
+ it "should return nil for unknown hosts" do
86
+ @cookie_jar.for_host('lol.com').should be_nil
87
+ end
88
+
89
+ it "should return nil for hosts with no cookie params" do
90
+ @cookie_jar['lol.com'] = {}
91
+
92
+ @cookie_jar.for_host('lol.com').should be_nil
93
+ end
94
+
95
+ it "should encode single cookie params" do
96
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
97
+
98
+ @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam'
99
+ end
100
+
101
+ it "should encode multiple cookie params" do
102
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
103
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
104
+
105
+ @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
106
+ end
107
+ end
108
+ end
data/spec/page_spec.rb CHANGED
@@ -104,7 +104,6 @@ describe Page do
104
104
 
105
105
  params.each do |key,value|
106
106
  key.should_not be_empty
107
- value.should_not be_empty
108
107
  end
109
108
  end
110
109
  end
data/spec/session_cache.rb ADDED
@@ -0,0 +1,58 @@
1
+ require 'spidr/session_cache'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe SessionCache do
6
+ describe "empty" do
7
+ before(:all) do
8
+ @sessions = SessionCache.new
9
+ end
10
+
11
+ it "should not have any active sessions" do
12
+ @sessions.should_not be_active(URI('http://example.com/'))
13
+ end
14
+
15
+ it "should start new sessions on-demand" do
16
+ @sessions[URI('http://example.com/')].should_not be_nil
17
+ end
18
+
19
+ after(:all) do
20
+ @sessions.clear
21
+ end
22
+ end
23
+
24
+ describe "not-empty" do
25
+ before(:all) do
26
+ @url = URI('http://example.com/')
27
+
28
+ @sessions = SessionCache.new
29
+ @sessions[@url]
30
+ end
31
+
32
+ it "should have active sessions" do
33
+ @sessions.should be_active(@url)
34
+ end
35
+
36
+ it "should provide access to sessions" do
37
+ @sessions[@url].should_not be_nil
38
+ end
39
+
40
+ it "should start new sessions on-demand" do
41
+ url2 = URI('http://www.w3c.org/')
42
+
43
+ @sessions[url2].should_not be_nil
44
+ end
45
+
46
+ it "should be able to kill sessions" do
47
+ url2 = URI('http://www.w3c.org/')
48
+
49
+ @sessions[url2].should_not be_nil
50
+ @sessions.kill!(url2)
51
+ @sessions.should_not be_active(url2)
52
+ end
53
+
54
+ after(:all) do
55
+ @sessions.clear
56
+ end
57
+ end
58
+ end
data/spidr.gemspec ADDED
@@ -0,0 +1,115 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{spidr}
8
+ s.version = "0.2.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Postmodern"]
12
+ s.date = %q{2010-02-27}
13
+ s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
14
+ s.email = %q{postmodern.mod3@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "ChangeLog.md",
17
+ "LICENSE.txt",
18
+ "README.md"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ ".specopts",
23
+ ".yardopts",
24
+ "ChangeLog.md",
25
+ "LICENSE.txt",
26
+ "README.md",
27
+ "Rakefile",
28
+ "lib/spidr.rb",
29
+ "lib/spidr/actions.rb",
30
+ "lib/spidr/actions/actions.rb",
31
+ "lib/spidr/actions/exceptions.rb",
32
+ "lib/spidr/actions/exceptions/action.rb",
33
+ "lib/spidr/actions/exceptions/paused.rb",
34
+ "lib/spidr/actions/exceptions/skip_link.rb",
35
+ "lib/spidr/actions/exceptions/skip_page.rb",
36
+ "lib/spidr/agent.rb",
37
+ "lib/spidr/auth_credential.rb",
38
+ "lib/spidr/auth_store.rb",
39
+ "lib/spidr/cookie_jar.rb",
40
+ "lib/spidr/events.rb",
41
+ "lib/spidr/extensions.rb",
42
+ "lib/spidr/extensions/uri.rb",
43
+ "lib/spidr/filters.rb",
44
+ "lib/spidr/page.rb",
45
+ "lib/spidr/rules.rb",
46
+ "lib/spidr/sanitizers.rb",
47
+ "lib/spidr/session_cache.rb",
48
+ "lib/spidr/spidr.rb",
49
+ "lib/spidr/version.rb",
50
+ "spec/actions_spec.rb",
51
+ "spec/agent_spec.rb",
52
+ "spec/auth_store_spec.rb",
53
+ "spec/cookie_jar_spec.rb",
54
+ "spec/extensions/uri_spec.rb",
55
+ "spec/filters_spec.rb",
56
+ "spec/helpers/history.rb",
57
+ "spec/helpers/page.rb",
58
+ "spec/helpers/wsoc.rb",
59
+ "spec/page_examples.rb",
60
+ "spec/page_spec.rb",
61
+ "spec/rules_spec.rb",
62
+ "spec/sanitizers_spec.rb",
63
+ "spec/session_cache.rb",
64
+ "spec/spec_helper.rb",
65
+ "spec/spidr_spec.rb",
66
+ "spidr.gemspec"
67
+ ]
68
+ s.has_rdoc = %q{yard}
69
+ s.homepage = %q{http://github.com/postmodern/spidr}
70
+ s.rdoc_options = ["--charset=UTF-8"]
71
+ s.require_paths = ["lib"]
72
+ s.rubygems_version = %q{1.3.6}
73
+ s.summary = %q{A versatile Ruby web spidering library}
74
+ s.test_files = [
75
+ "spec/agent_spec.rb",
76
+ "spec/helpers/history.rb",
77
+ "spec/helpers/wsoc.rb",
78
+ "spec/helpers/page.rb",
79
+ "spec/spec_helper.rb",
80
+ "spec/extensions/uri_spec.rb",
81
+ "spec/page_spec.rb",
82
+ "spec/spidr_spec.rb",
83
+ "spec/sanitizers_spec.rb",
84
+ "spec/page_examples.rb",
85
+ "spec/filters_spec.rb",
86
+ "spec/actions_spec.rb",
87
+ "spec/rules_spec.rb",
88
+ "spec/auth_store_spec.rb",
89
+ "spec/cookie_jar_spec.rb",
90
+ "spec/session_cache.rb"
91
+ ]
92
+
93
+ if s.respond_to? :specification_version then
94
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
95
+ s.specification_version = 3
96
+
97
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
98
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.2.0"])
99
+ s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
100
+ s.add_development_dependency(%q<yard>, [">= 0.5.3"])
101
+ s.add_development_dependency(%q<wsoc>, [">= 0.1.1"])
102
+ else
103
+ s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
104
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
105
+ s.add_dependency(%q<yard>, [">= 0.5.3"])
106
+ s.add_dependency(%q<wsoc>, [">= 0.1.1"])
107
+ end
108
+ else
109
+ s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
110
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
111
+ s.add_dependency(%q<yard>, [">= 0.5.3"])
112
+ s.add_dependency(%q<wsoc>, [">= 0.1.1"])
113
+ end
114
+ end
115
+