spidr_epg 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +10 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +291 -0
- data/ChangeLog.md~ +291 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +49 -0
- data/Gemfile~ +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +193 -0
- data/README.md~ +190 -0
- data/Rakefile +29 -0
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +83 -0
- data/lib/spidr/actions/exceptions/action.rb +9 -0
- data/lib/spidr/actions/exceptions/paused.rb +11 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/agent.rb +866 -0
- data/lib/spidr/auth_credential.rb +28 -0
- data/lib/spidr/auth_store.rb +161 -0
- data/lib/spidr/body.rb +98 -0
- data/lib/spidr/cookie_jar.rb +202 -0
- data/lib/spidr/events.rb +537 -0
- data/lib/spidr/extensions/uri.rb +52 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/filters.rb +539 -0
- data/lib/spidr/headers.rb +370 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +108 -0
- data/lib/spidr/rules.rb +79 -0
- data/lib/spidr/sanitizers.rb +56 -0
- data/lib/spidr/session_cache.rb +145 -0
- data/lib/spidr/spidr.rb +107 -0
- data/lib/spidr/version.rb +4 -0
- data/lib/spidr/version.rb~ +4 -0
- data/lib/spidr.rb +3 -0
- data/pkg/spidr-1.0.0.gem +0 -0
- data/spec/actions_spec.rb +59 -0
- data/spec/agent_spec.rb +81 -0
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +144 -0
- data/spec/extensions/uri_spec.rb +43 -0
- data/spec/filters_spec.rb +61 -0
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +21 -0
- data/spec/page_spec.rb +125 -0
- data/spec/rules_spec.rb +45 -0
- data/spec/sanitizers_spec.rb +61 -0
- data/spec/session_cache.rb +58 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spidr_spec.rb +39 -0
- data/spidr.gemspec +133 -0
- data/spidr.gemspec~ +131 -0
- metadata +158 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'spidr/cookie_jar'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for CookieJar: storing, merging and clearing per-host cookie
# params, dirty tracking, and lookup/encoding through cookies_for_host
# and for_host.
describe CookieJar do
  let(:host)      { 'zerosum.org' }
  let(:subdomain) { 'sub.zerosum.org' }

  it "should retrieve cookies for the named host" do
    subject[host] = {'admin' => 'ofcourseiam'}

    subject[host].should == {'admin' => 'ofcourseiam'}
  end

  it "should add a cookie to the jar" do
    subject[host] = {'admin' => 'ofcourseiam'}

    subject[host].should == {'admin' => 'ofcourseiam'}
  end

  it "should merge new cookies into the jar" do
    subject[host] = {'admin' => 'ofcourseiam'}
    subject[host] = {'other' => '1'}

    subject[host].should == {'admin' => 'ofcourseiam', 'other' => '1'}
  end

  it "should override previous cookies in the jar" do
    subject[host] = {'admin' => 'ofcourseiam'}
    subject[host] = {'admin' => 'somethingcompletelydifferent'}

    subject[host].should == {'admin' => 'somethingcompletelydifferent'}
  end

  it "should clear all cookies" do
    subject['zerosum.org'] = {'cookie' => 'foobar'}
    subject.clear!

    subject.size.should == 0
  end

  describe "dirty" do
    # Reads the jar's internal @dirty collection directly; it is evaluated
    # lazily, i.e. on first use inside each example.
    let(:dirty) { subject.instance_variable_get('@dirty') }

    it "should mark a cookie dirty after adding new params" do
      subject[host] = {'admin' => 'ofcourseiam'}
      subject[host] = {'other' => '1'}

      dirty.include?(host).should == true
    end

    it "should mark a cookie dirty after overriding params" do
      subject[host] = {'admin' => 'ofcourseiam'}
      subject[host] = {'admin' => 'nope'}

      dirty.include?(host).should == true
    end

    it "should un-mark a cookie as dirty after re-encoding it" do
      subject[host] = {'admin' => 'ofcourseiam'}
      subject[host] = {'admin' => 'nope'}

      dirty.include?(host).should == true

      # Encoding the host's cookies is expected to clear its dirty flag.
      subject.for_host(host)

      dirty.include?(host).should == false
    end
  end

  describe "cookies_for_host" do
    it "should return an empty Hash for unknown hosts" do
      subject.cookies_for_host('lol.com').should be_empty
    end

    it "should return an empty Hash for hosts with no cookie params" do
      subject['lol.com'] = {}

      subject.cookies_for_host('lol.com').should be_empty
    end

    it "should return cookie parameters for the host" do
      subject[host] = {'admin' => 'ofcourseiam'}
      subject[host] = {'other' => '1'}

      params = subject.cookies_for_host(host)

      params['admin'].should == 'ofcourseiam'
      params['other'].should == '1'
    end

    it "should include cookies for the parent domain" do
      subject[host]      = {'admin' => 'ofcourseiam'}
      subject[subdomain] = {'other' => '1'}

      params = subject.cookies_for_host(subdomain)

      params['admin'].should == 'ofcourseiam'
      params['other'].should == '1'
    end
  end

  describe "for_host" do
    it "should return nil for unknown hosts" do
      subject.for_host('lol.com').should be_nil
    end

    it "should return nil for hosts with no cookie params" do
      subject['lol.com'] = {}

      subject.for_host('lol.com').should be_nil
    end

    it "should encode single cookie params" do
      subject[host] = {'admin' => 'ofcourseiam'}

      subject.for_host(host).should == 'admin=ofcourseiam'
    end

    it "should encode multiple cookie params" do
      subject[host] = {'admin' => 'ofcourseiam'}
      subject[host] = {'other' => '1'}

      encoded = subject.for_host(host)

      # Parameter order is not asserted, only the parts and the separator.
      encoded.should include('admin=ofcourseiam')
      encoded.should include('; ')
      encoded.should include('other=1')
    end

    it "should include cookies for the parent domain" do
      subject[host]      = {'admin' => 'ofcourseiam'}
      subject[subdomain] = {'other' => '1'}

      encoded = subject.for_host(subdomain)

      encoded.should include('admin=ofcourseiam')
      encoded.should include('; ')
      encoded.should include('other=1')
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spidr/extensions/uri'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for the URI.expand_path extension.
describe URI do
  describe "expand_path" do
    # Each row is: example description, input path, expected expansion.
    [
      ["should preserve single directory paths",               'path',              'path'],
      ["should preserve trailing '/'",                         'test/path/',        'test/path/'],
      ["should remove multiple '/' characters",                '///test///path///', '/test/path/'],
      ["should remove '.' directories from the path",          'test/./path',       'test/path'],
      ["should handle '..' directories properly",              'test/../path',      'path'],
      ["should limit the number of '..' directories resolved", '/test/../../../..', '/'],
      ["should preserve absolute paths",                       '/test/path',        '/test/path'],
      ["should preserve the root path",                        '/',                 '/'],
      ["should default empty paths to the root path",          '',                  '/']
    ].each do |description, path, expanded|
      it description do
        URI.expand_path(path).should == expanded
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/filters'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
# Specs for the Filters mixin as exposed through Agent: the scheme setter
# and the visit/ignore readers populated from Agent.new options.
describe Filters do
  it "should allow setting the acceptable schemes" do
    spider = Agent.new
    spider.schemes = [:http]

    spider.schemes.should == ['http']
  end

  # Each row is: example description, Agent.new option, reader method,
  # and the value that is configured and expected back.
  [
    ["should provide the hosts that will be visited",     :hosts,        :visit_hosts,  ['spidr.rubyforge.org']],
    ["should provide the hosts that will not be visited", :ignore_hosts, :ignore_hosts, ['example.com']],
    ["should provide the ports that will be visited",     :ports,        :visit_ports,  [80, 443, 8000]],
    ["should provide the ports that will not be visited", :ignore_ports, :ignore_ports, [8000, 8080]],
    ["should provide the links that will be visited",     :links,        :visit_links,  ['index.php']],
    ["should provide the links that will not be visited", :ignore_links, :ignore_links, [/login/]],
    ["should provide the exts that will be visited",      :exts,         :visit_exts,   ['htm']],
    ["should provide the exts that will not be visited",  :ignore_exts,  :ignore_exts,  ['cfm']]
  ].each do |description, option, reader, expected|
    it description do
      # Hand the agent its own copy so the comparison is against an
      # independent Array, as it was with separate literals.
      agent = Agent.new(option => expected.dup)

      agent.send(reader).should == expected
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Helpers
  # Spec helpers for asserting on the crawl history of the agent stored in
  # the including object's @agent instance variable.
  #
  # The should_* helpers rely on RSpec's `should` being available on the
  # values they check.
  module History
    # Tests whether the agent recorded the URL exactly one time.
    #
    # @param url the URL to look for in `@agent.visited_urls`
    # @return [Boolean] true only when the URL occurs exactly once
    def visited_once?(url)
      @agent.visited_urls.count(url) == 1
    end

    # Delegates to `@agent.visited?`.
    #
    # @param url the URL to check
    def visited_link?(url)
      @agent.visited?(url)
    end

    # Delegates to `@agent.failed?`.
    #
    # @param url the URL to check
    def visit_failed?(url)
      @agent.failed?(url)
    end

    # Expects the URL to have been visited.
    def should_visit_link(url)
      visited_link?(url).should == true
    end

    # Expects the URL not to have been visited.
    def should_ignore_link(url)
      visited_link?(url).should == false
    end

    # Expects the URL to have been visited exactly once.
    def should_visit_once(url)
      visited_once?(url).should == true
    end

    # Expects the URL to be recorded as failed and not as visited.
    def should_fail_link(url)
      visited_link?(url).should == false
      visit_failed?(url).should == true
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'wsoc/config'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
require 'helpers/history'
|
6
|
+
|
7
|
+
module Helpers
  # Spec helpers for running the spider against a WSOC server.
  #
  # Including this module downloads the course specs from SPECS_URL and
  # defines one example on the including example group per downloaded
  # spec, so the server must be reachable at include time.
  module WSOC
    include History

    # Base URL of the WSOC server; HOST and PORT environment variables
    # override the defaults from ::WSOC::Config.
    SERVER_URL = URI::HTTP.build(
      :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
      :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
    )

    # URL of the JSON document describing the course specs.
    SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])

    # URL at which the spider starts the course.
    COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)

    # Course metadata, filled in (and therefore mutated) by .included.
    COURSE_METADATA = {}

    # Maps a spec's 'behavior' value to the History assertion that checks it.
    BEHAVIOR_ASSERTIONS = {
      'visit'  => :should_visit_link,
      'ignore' => :should_ignore_link,
      'fail'   => :should_fail_link
    }.freeze

    # Downloads the course specs, merges their metadata into
    # COURSE_METADATA and defines an example on `base` for every spec with
    # a known behavior. Specs with any other behavior are skipped.
    #
    # @param base the example group including this module
    def self.included(base)
      # OpenURI::OpenRead#read fetches and closes the stream; Kernel#open
      # no longer accepts URIs on Ruby 3.0+.
      hash = JSON.parse(SPECS_URL.read)
      metadata = hash['metadata']
      specs = hash['specs']

      COURSE_METADATA.merge!(metadata) if metadata.kind_of?(Hash)
      return unless specs.kind_of?(Array)

      specs.each do |spec|
        assertion = BEHAVIOR_ASSERTIONS[spec['behavior']]
        next unless assertion

        message = spec['message']
        url = spec['url']

        # Block form instead of a String eval: the downloaded text is
        # passed as data and never parsed as Ruby.
        base.module_eval do
          it(message) { send(assertion, url) }
        end
      end
    end

    # @return [Hash] the metadata collected from the specs document
    def course
      WSOC::COURSE_METADATA
    end

    # @return the 'auth_store' entry of the course metadata
    def course_auth_store
      course['auth_store']
    end

    # Spiders the course from COURSE_URL. Before crawling, each entry of
    # the course auth store is registered with the agent, and every
    # visited or failed URL is printed.
    def run_course
      Spidr::Agent.start_at(COURSE_URL) do |agent|
        course_auth_store.each do |path,auth|
          agent.authorized.add(
            COURSE_URL.merge(path),
            auth['user'],
            auth['password']
          )
        end

        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
        agent.every_url { |url| puts url }
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Examples shared by every kind of page; the including group must assign
# the page under test to @page.
shared_examples_for "Page" do
  it("should have a status code") { @page.code.should be_integer }

  it("should have a body") { @page.body.should_not be_empty }

  it "should provide transparent access to the response headers" do
    header = @page.response['Content-Type']

    @page.content_type.should == header
  end

  it("should allow content-types") { @page.content_types.should_not be_empty }
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'spidr/page'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'page_examples'
|
5
|
+
require 'helpers/page'
|
6
|
+
|
7
|
+
# Specs for Page against live pages (HTML, plain text, a meta-refresh
# redirect and a page that sets cookies). Every group fetches its page
# once in before(:all), so these examples need network access.
describe Page do
  describe "html" do
    before(:all) do
      @page = get_page('http://spidr.rubyforge.org/course/start.html')
    end

    it_should_behave_like "Page"

    it "should be OK" do
      @page.should be_ok
    end

    it "should have a content-type" do
      @page.content_type.should include('text/html')
    end

    it "should be a html page" do
      @page.should be_html
    end

    it "should provide a document" do
      @page.doc.class.should == Nokogiri::HTML::Document
    end

    it "should allow searching the document" do
      @page.doc.search('//p').length.should == 2
      @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
    end

    it "should have a title" do
      @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
    end

    it "should have links" do
      @page.links.should_not be_empty
    end
  end

  describe "txt" do
    before(:all) do
      @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
    end

    it_should_behave_like "Page"

    it "should be OK" do
      @page.should be_ok
    end

    it "should have a content-type" do
      @page.content_type.should include('text/plain')
    end

    it "should be a txt page" do
      @page.should be_txt
    end

    it "should not provide a document" do
      @page.doc.should be_nil
    end

    it "should not allow searching the document" do
      @page.search('//p').should be_empty
      @page.at('//p').should be_nil
    end

    it "should not have links" do
      @page.links.should be_empty
    end

    it "should not have a title" do
      @page.title.should be_nil
    end
  end

  describe "redirects" do
    before(:all) do
      @page = get_page('http://spidr.rubyforge.org/course/start.html')
    end

    # Stubs are reset after every example, so the body has to be stubbed
    # per example; a stub installed in before(:all) only survives the
    # first example.
    before(:each) do
      @page.stub(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
    end

    it "should provide access to page-level redirects" do
      @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
    end

    it "should include meta refresh redirects in the list of links" do
      @page.links.should include('http://spidr.rubyforge.org/redirected')
    end
  end

  describe "cookies" do
    before(:all) do
      @page = get_page('http://twitter.com/login')
    end

    it "should provide access to the raw Cookie" do
      cookie = @page.cookie

      cookie.should_not be_nil
      cookie.should_not be_empty
    end

    it "should provide access to the Cookies" do
      cookies = @page.cookies

      cookies.should_not be_empty
    end

    it "should provide access to the key->value pairs within the Cookie" do
      params = @page.cookie_params

      params.should_not be_empty

      params.each do |key, _value|
        key.should_not be_empty
      end
    end
  end
end
|
data/spec/rules_spec.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spidr/rules'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for Rules: accept/reject decisions driven by literal values,
# regexps and lambdas.
describe Rules do
  # The subject is the class itself; each example builds its own instance.
  subject { Rules }

  it "should accept data based on acceptance data" do
    subject.new(:accept => [1]).accept?(1).should == true
  end

  it "should accept data based on acceptance regexps" do
    subject.new(:accept => [/1/]).accept?('1').should == true
  end

  it "should match non-Strings using acceptance regexps" do
    subject.new(:accept => [/1/]).accept?(1).should == true
  end

  it "should accept data using acceptance lambdas" do
    greater_than_two = lambda { |data| data > 2 }

    subject.new(:accept => [greater_than_two]).accept?(3).should == true
  end

  it "should reject data that does not match any acceptance patterns" do
    accepting = subject.new(:accept => [1, 2, 3])

    accepting.accept?(2).should == true
    accepting.accept?(4).should == false
  end

  it "should accept data that does not match any rejection patterns" do
    rejecting = subject.new(:reject => [1, 2, 3])

    rejecting.accept?(2).should == false
    rejecting.accept?(4).should == true
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spidr/sanitizers'
|
2
|
+
require 'spidr/agent'
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
# Specs for the Sanitizers mixin as exposed through Agent#sanitize_url.
describe Sanitizers do
  describe "sanitize_url" do
    let(:url)   { 'http://host.com' }
    let(:agent) { Agent.new }

    it "should sanitize URLs" do
      clean_url = agent.sanitize_url(URI(url))

      clean_url.host.should == 'host.com'
    end

    it "should sanitize URLs given as Strings" do
      clean_url = agent.sanitize_url(url)

      clean_url.host.should == 'host.com'
    end
  end

  describe "strip_fragments" do
    let(:url) { URI("http://host.com/page#lol") }

    it "should strip fragment components by default" do
      agent = Agent.new
      clean_url = agent.sanitize_url(url)

      clean_url.fragment.should be_nil
    end

    it "should allow preserving fragment components" do
      agent = Agent.new(:strip_fragments => false)
      clean_url = agent.sanitize_url(url)

      clean_url.fragment.should == 'lol'
    end
  end

  describe "strip_query" do
    let(:url) { URI("http://host.com/page?x=1") }

    it "should not strip query components by default" do
      agent = Agent.new
      clean_url = agent.sanitize_url(url)

      clean_url.query.should == 'x=1'
    end

    it "should allow stripping of query components" do
      agent = Agent.new(:strip_query => true)
      clean_url = agent.sanitize_url(url)

      clean_url.query.should be_nil
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spidr/session_cache'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for SessionCache: on-demand session creation, activity checks and
# killing sessions.
describe SessionCache do
  describe "empty" do
    # A fresh cache per example: "should start new sessions on-demand"
    # adds a session for the same URL that the other example expects to be
    # inactive, so sharing one cache would make the result depend on
    # example order.
    before(:each) do
      @sessions = SessionCache.new
    end

    it "should not have any active sessions" do
      @sessions.should_not be_active(URI('http://example.com/'))
    end

    it "should start new sessions on-demand" do
      @sessions[URI('http://example.com/')].should_not be_nil
    end

    after(:each) do
      @sessions.clear
    end
  end

  describe "not-empty" do
    # One cache shared by the whole group. The examples below do not
    # interfere with each other: only url2 is ever killed, and only @url
    # is expected to be active.
    before(:all) do
      @url = URI('http://example.com/')

      @sessions = SessionCache.new
      @sessions[@url]
    end

    it "should have active sessions" do
      @sessions.should be_active(@url)
    end

    it "should provide access to sessions" do
      @sessions[@url].should_not be_nil
    end

    it "should start new sessions on-demand" do
      url2 = URI('http://www.w3c.org/')

      @sessions[url2].should_not be_nil
    end

    it "should be able to kill sessions" do
      url2 = URI('http://www.w3c.org/')

      @sessions[url2].should_not be_nil
      @sessions.kill!(url2)
      @sessions.should_not be_active(url2)
    end

    after(:all) do
      @sessions.clear
    end
  end
end
|
data/spec/spec_helper.rb
ADDED