spidr 0.2.2 → 0.2.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/lib/spidr/rules.rb CHANGED
@@ -1,4 +1,8 @@
1
1
  module Spidr
2
+ #
3
+ # The {Rules} class represents collections of acceptance and rejection
4
+ # rules, which are used to filter data.
5
+ #
2
6
  class Rules
3
7
 
4
8
  # Accept rules
@@ -1,6 +1,10 @@
1
1
  require 'uri'
2
2
 
3
3
  module Spidr
4
+ #
5
+ # The {Sanitizers} module adds methods to {Agent} which control the
6
+ # sanitization of incoming links.
7
+ #
4
8
  module Sanitizers
5
9
  def self.included(base)
6
10
  base.module_eval do
@@ -3,6 +3,9 @@ require 'spidr/spidr'
3
3
  require 'net/http'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # Stores active HTTP Sessions organized by scheme, host-name and port.
8
+ #
6
9
  class SessionCache
7
10
 
8
11
  # Proxy to use
@@ -33,6 +36,27 @@ module Spidr
33
36
  @sessions = {}
34
37
  end
35
38
 
39
+ #
40
+ # Determines if there is an active HTTP session for a given URL.
41
+ #
42
+ # @param [URI::HTTP, String] url
43
+ # The URL that represents a session.
44
+ #
45
+ # @return [Boolean]
46
+ # Specifies whether there is an active HTTP session.
47
+ #
48
+ # @since 0.2.3
49
+ #
50
+ def active?(url)
51
+ # normalize the url
52
+ url = URI(url.to_s) unless url.kind_of?(URI)
53
+
54
+ # session key
55
+ key = [url.scheme, url.host, url.port]
56
+
57
+ return @sessions.has_key?(key)
58
+ end
59
+
36
60
  #
37
61
  # Provides an active HTTP session for a given URL.
38
62
  #
@@ -46,6 +70,7 @@ module Spidr
46
70
  # normalize the url
47
71
  url = URI(url.to_s) unless url.kind_of?(URI)
48
72
 
73
+ # session key
49
74
  key = [url.scheme, url.host, url.port]
50
75
 
51
76
  unless @sessions[key]
@@ -81,10 +106,10 @@ module Spidr
81
106
  # normalize the url
82
107
  url = URI(url.to_s) unless url.kind_of?(URI)
83
108
 
109
+ # session key
84
110
  key = [url.scheme, url.host, url.port]
85
111
 
86
112
  if (sess = @sessions[key])
87
-
88
113
  begin
89
114
  sess.finish
90
115
  rescue IOError
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.2'
3
+ VERSION = '0.2.3'
4
4
  end
@@ -0,0 +1,85 @@
1
+ require 'spidr/auth_store'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe AuthStore do
6
+ before(:each) do
7
+ @auth_store = AuthStore.new
8
+ @uri = URI('http://zerosum.org/course/auth')
9
+ @auth_store.add(@uri, 'admin', 'password')
10
+ end
11
+
12
+ after(:each) do
13
+ @auth_store.clear!
14
+ end
15
+
16
+ it 'should retrieve auth credentials for the URL' do
17
+ uri = @uri.merge('/')
18
+
19
+ @auth_store[uri] = AuthCredential.new('user1', 'pass1')
20
+ @auth_store[uri].username.should == 'user1'
21
+ @auth_store[uri].password.should == 'pass1'
22
+ end
23
+
24
+ it 'should add auth credentials for the URL' do
25
+ uri = @uri.merge('/')
26
+
27
+ lambda {
28
+ @auth_store.add(uri, 'user1', 'pass1')
29
+ }.should change(@auth_store, :size)
30
+
31
+ @auth_store[uri].username.should == 'user1'
32
+ @auth_store[uri].password.should == 'pass1'
33
+ end
34
+
35
+ describe 'matching' do
36
+ it 'should match a longer URL to the base' do
37
+ uri = @uri.merge('/course/auth/protected.html')
38
+
39
+ @auth_store[uri].username.should == 'admin'
40
+ @auth_store[uri].password.should == 'password'
41
+ end
42
+
43
+ it 'should match the longest of all matching URLs' do
44
+ @auth_store.add(@uri.merge('/course'), 'user1', 'pass1')
45
+ @auth_store.add(@uri.merge('/course/auth/special'), 'user2', 'pass2')
46
+ @auth_store.add(@uri.merge('/course/auth/special/extra'), 'user3', 'pass3')
47
+
48
+ auth = @auth_store[@uri.merge('/course/auth/special/1.html')]
49
+ auth.username.should == 'user2'
50
+ auth.password.should == 'pass2'
51
+ end
52
+
53
+ it 'should not match a URL with a different host' do
54
+ uri = URI('http://spidr.rubyforge.org/course/auth')
55
+ @auth_store[uri].should be_nil
56
+ end
57
+
58
+ it 'should not match a URL with an alternate path' do
59
+ uri = @uri.merge('/course/admin/protected.html')
60
+ @auth_store[uri].should be_nil
61
+ end
62
+ end
63
+
64
+ it 'should override previous auth credentials' do
65
+ @auth_store.add(@uri, 'newuser', 'newpass')
66
+
67
+ @auth_store[@uri].username.should == 'newuser'
68
+ @auth_store[@uri].password.should == 'newpass'
69
+ end
70
+
71
+ it 'should clear all cookies' do
72
+ @auth_store.clear!
73
+ @auth_store.size.should == 0
74
+ end
75
+
76
+ describe 'for_url' do
77
+ it 'should return nil if no authorization exists' do
78
+ @auth_store.for_url(URI('http://php.net')).should be_nil
79
+ end
80
+
81
+ it 'should create an encoded authorization string' do
82
+ @auth_store.for_url(@uri).should == "YWRtaW46cGFzc3dvcmQ=\n"
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,108 @@
1
+ require 'spidr/cookie_jar'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe CookieJar do
6
+ before(:each) do
7
+ @cookie_jar = CookieJar.new
8
+ end
9
+
10
+ it "should retrieve cookies for the named host" do
11
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
12
+
13
+ @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
14
+ end
15
+
16
+ it "should add a cookie to the jar" do
17
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
18
+
19
+ @cookie_jar['zerosum.org'].should == {'admin' => 'ofcourseiam'}
20
+ end
21
+
22
+ it "should merge new cookies into the jar" do
23
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
24
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
25
+
26
+ @cookie_jar['zerosum.org'].should == {
27
+ 'admin' => 'ofcourseiam',
28
+ 'other' => '1'
29
+ }
30
+ end
31
+
32
+ it "should override previous cookies in the jar" do
33
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
34
+ @cookie_jar['zerosum.org'] = {'admin' => 'somethingcompletelydifferent'}
35
+
36
+ @cookie_jar['zerosum.org'].should == {
37
+ 'admin' => 'somethingcompletelydifferent'
38
+ }
39
+ end
40
+
41
+ it "should clear all cookies" do
42
+ @cookie_jar['zerosum.org'] = {'cookie' => 'foobar'}
43
+ @cookie_jar.clear!
44
+
45
+ @cookie_jar.size.should == 0
46
+ end
47
+
48
+ describe "dirty" do
49
+ before(:each) do
50
+ @cookie_jar = CookieJar.new
51
+ @dirty = @cookie_jar.instance_variable_get('@dirty')
52
+ end
53
+
54
+ it "should mark a cookie dirty after adding new params" do
55
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
56
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
57
+
58
+ @dirty.include?('zerosum.org').should == true
59
+ end
60
+
61
+ it "should mark a cookie dirty after overriding params" do
62
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
63
+ @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
64
+
65
+ @dirty.include?('zerosum.org').should == true
66
+ end
67
+
68
+ it "should un-mark a cookie as dirty after re-encoding it" do
69
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
70
+ @cookie_jar['zerosum.org'] = {'admin' => 'nope'}
71
+
72
+ @dirty.include?('zerosum.org').should == true
73
+
74
+ @cookie_jar.for_host('zerosum.org')
75
+
76
+ @dirty.include?('zerosum.org').should == false
77
+ end
78
+ end
79
+
80
+ describe "for_host" do
81
+ before(:each) do
82
+ @cookie_jar = CookieJar.new
83
+ end
84
+
85
+ it "should return nil for unknown hosts" do
86
+ @cookie_jar.for_host('lol.com').should be_nil
87
+ end
88
+
89
+ it "should return nil for hosts with no cookie params" do
90
+ @cookie_jar['lol.com'] = {}
91
+
92
+ @cookie_jar.for_host('lol.com').should be_nil
93
+ end
94
+
95
+ it "should encode single cookie params" do
96
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
97
+
98
+ @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam'
99
+ end
100
+
101
+ it "should encode multiple cookie params" do
102
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
103
+ @cookie_jar['zerosum.org'] = {'other' => '1'}
104
+
105
+ @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
106
+ end
107
+ end
108
+ end
data/spec/page_spec.rb CHANGED
@@ -104,7 +104,6 @@ describe Page do
104
104
 
105
105
  params.each do |key,value|
106
106
  key.should_not be_empty
107
- value.should_not be_empty
108
107
  end
109
108
  end
110
109
  end
@@ -0,0 +1,58 @@
1
+ require 'spidr/session_cache'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe SessionCache do
6
+ describe "empty" do
7
+ before(:all) do
8
+ @sessions = SessionCache.new
9
+ end
10
+
11
+ it "should not have any active sessions" do
12
+ @sessions.should_not be_active(URI('http://example.com/'))
13
+ end
14
+
15
+ it "should start new sessions on-demand" do
16
+ @sessions[URI('http://example.com/')].should_not be_nil
17
+ end
18
+
19
+ after(:all) do
20
+ @sessions.clear
21
+ end
22
+ end
23
+
24
+ describe "not-empty" do
25
+ before(:all) do
26
+ @url = URI('http://example.com/')
27
+
28
+ @sessions = SessionCache.new
29
+ @sessions[@url]
30
+ end
31
+
32
+ it "should have active sessions" do
33
+ @sessions.should be_active(@url)
34
+ end
35
+
36
+ it "should provide access to sessions" do
37
+ @sessions[@url].should_not be_nil
38
+ end
39
+
40
+ it "should start new sessions on-demand" do
41
+ url2 = URI('http://www.w3c.org/')
42
+
43
+ @sessions[url2].should_not be_nil
44
+ end
45
+
46
+ it "should be able to kill sessions" do
47
+ url2 = URI('http://www.w3c.org/')
48
+
49
+ @sessions[url2].should_not be_nil
50
+ @sessions.kill!(url2)
51
+ @sessions.should_not be_active(url2)
52
+ end
53
+
54
+ after(:all) do
55
+ @sessions.clear
56
+ end
57
+ end
58
+ end
data/spidr.gemspec ADDED
@@ -0,0 +1,115 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{spidr}
8
+ s.version = "0.2.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Postmodern"]
12
+ s.date = %q{2010-02-27}
13
+ s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
14
+ s.email = %q{postmodern.mod3@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "ChangeLog.md",
17
+ "LICENSE.txt",
18
+ "README.md"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ ".specopts",
23
+ ".yardopts",
24
+ "ChangeLog.md",
25
+ "LICENSE.txt",
26
+ "README.md",
27
+ "Rakefile",
28
+ "lib/spidr.rb",
29
+ "lib/spidr/actions.rb",
30
+ "lib/spidr/actions/actions.rb",
31
+ "lib/spidr/actions/exceptions.rb",
32
+ "lib/spidr/actions/exceptions/action.rb",
33
+ "lib/spidr/actions/exceptions/paused.rb",
34
+ "lib/spidr/actions/exceptions/skip_link.rb",
35
+ "lib/spidr/actions/exceptions/skip_page.rb",
36
+ "lib/spidr/agent.rb",
37
+ "lib/spidr/auth_credential.rb",
38
+ "lib/spidr/auth_store.rb",
39
+ "lib/spidr/cookie_jar.rb",
40
+ "lib/spidr/events.rb",
41
+ "lib/spidr/extensions.rb",
42
+ "lib/spidr/extensions/uri.rb",
43
+ "lib/spidr/filters.rb",
44
+ "lib/spidr/page.rb",
45
+ "lib/spidr/rules.rb",
46
+ "lib/spidr/sanitizers.rb",
47
+ "lib/spidr/session_cache.rb",
48
+ "lib/spidr/spidr.rb",
49
+ "lib/spidr/version.rb",
50
+ "spec/actions_spec.rb",
51
+ "spec/agent_spec.rb",
52
+ "spec/auth_store_spec.rb",
53
+ "spec/cookie_jar_spec.rb",
54
+ "spec/extensions/uri_spec.rb",
55
+ "spec/filters_spec.rb",
56
+ "spec/helpers/history.rb",
57
+ "spec/helpers/page.rb",
58
+ "spec/helpers/wsoc.rb",
59
+ "spec/page_examples.rb",
60
+ "spec/page_spec.rb",
61
+ "spec/rules_spec.rb",
62
+ "spec/sanitizers_spec.rb",
63
+ "spec/session_cache.rb",
64
+ "spec/spec_helper.rb",
65
+ "spec/spidr_spec.rb",
66
+ "spidr.gemspec"
67
+ ]
68
+ s.has_rdoc = %q{yard}
69
+ s.homepage = %q{http://github.com/postmodern/spidr}
70
+ s.rdoc_options = ["--charset=UTF-8"]
71
+ s.require_paths = ["lib"]
72
+ s.rubygems_version = %q{1.3.6}
73
+ s.summary = %q{A versatile Ruby web spidering library}
74
+ s.test_files = [
75
+ "spec/agent_spec.rb",
76
+ "spec/helpers/history.rb",
77
+ "spec/helpers/wsoc.rb",
78
+ "spec/helpers/page.rb",
79
+ "spec/spec_helper.rb",
80
+ "spec/extensions/uri_spec.rb",
81
+ "spec/page_spec.rb",
82
+ "spec/spidr_spec.rb",
83
+ "spec/sanitizers_spec.rb",
84
+ "spec/page_examples.rb",
85
+ "spec/filters_spec.rb",
86
+ "spec/actions_spec.rb",
87
+ "spec/rules_spec.rb",
88
+ "spec/auth_store_spec.rb",
89
+ "spec/cookie_jar_spec.rb",
90
+ "spec/session_cache.rb"
91
+ ]
92
+
93
+ if s.respond_to? :specification_version then
94
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
95
+ s.specification_version = 3
96
+
97
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
98
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.2.0"])
99
+ s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
100
+ s.add_development_dependency(%q<yard>, [">= 0.5.3"])
101
+ s.add_development_dependency(%q<wsoc>, [">= 0.1.1"])
102
+ else
103
+ s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
104
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
105
+ s.add_dependency(%q<yard>, [">= 0.5.3"])
106
+ s.add_dependency(%q<wsoc>, [">= 0.1.1"])
107
+ end
108
+ else
109
+ s.add_dependency(%q<nokogiri>, [">= 1.2.0"])
110
+ s.add_dependency(%q<rspec>, [">= 1.3.0"])
111
+ s.add_dependency(%q<yard>, [">= 0.5.3"])
112
+ s.add_dependency(%q<wsoc>, [">= 0.1.1"])
113
+ end
114
+ end
115
+