spidr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.rdoc +191 -0
- data/Manifest.txt +10 -34
- data/{README.txt → README.rdoc} +3 -1
- data/Rakefile +6 -4
- data/lib/spidr/agent.rb +137 -97
- data/lib/spidr/auth_credential.rb +25 -0
- data/lib/spidr/auth_store.rb +157 -0
- data/lib/spidr/cookie_jar.rb +166 -0
- data/lib/spidr/filters.rb +2 -0
- data/lib/spidr/page.rb +75 -11
- data/lib/spidr/sanitizers.rb +59 -0
- data/lib/spidr/session_cache.rb +119 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +2 -2
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +5 -1
- data/spec/page_spec.rb +30 -0
- data/spec/sanitizers_spec.rb +67 -0
- data/tasks/yard.rb +1 -1
- metadata +24 -40
- metadata.gz.sig +0 -0
- data/History.txt +0 -167
- data/spec/helpers/course.rb +0 -95
- data/static/course/absolute/index.html +0 -10
- data/static/course/absolute/next.html +0 -9
- data/static/course/absolute/start.html +0 -19
- data/static/course/empty/index.html +0 -10
- data/static/course/empty/start.html +0 -23
- data/static/course/fail.html +0 -14
- data/static/course/frames/frame.html +0 -15
- data/static/course/frames/frame_next.html +0 -9
- data/static/course/frames/iframe.html +0 -15
- data/static/course/frames/iframe_next.html +0 -9
- data/static/course/frames/index.html +0 -10
- data/static/course/frames/start.html +0 -15
- data/static/course/index.html +0 -10
- data/static/course/javascript/index.html +0 -10
- data/static/course/javascript/start.html +0 -19
- data/static/course/loop/index.html +0 -10
- data/static/course/loop/next.html +0 -13
- data/static/course/loop/start.html +0 -19
- data/static/course/relative/current_directory.html +0 -9
- data/static/course/relative/index.html +0 -10
- data/static/course/relative/normal.html +0 -9
- data/static/course/relative/same_directory.html +0 -9
- data/static/course/relative/start.html +0 -27
- data/static/course/remote/index.html +0 -10
- data/static/course/remote/next.html +0 -9
- data/static/course/remote/start.html +0 -27
- data/static/course/scripts/course.js +0 -29
- data/static/course/scripts/jquery-1.2.6.min.js +0 -32
- data/static/course/specs.json +0 -1
- data/static/course/start.html +0 -27
- data/tasks/course.rb +0 -63
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
require 'uri'
|
|
2
|
+
|
|
3
|
+
module Spidr
  #
  # Mixin providing URL sanitization: optional removal of the fragment
  # and query components before a URL is used.
  #
  module Sanitizers
    #
    # Adds the sanitization accessors to whatever includes this module.
    #
    # @param [Module] base
    #   The class or module including {Sanitizers}.
    #
    def self.included(base)
      base.class_eval do
        # Whether URI fragments are stripped by {#sanitize_url}
        attr_accessor :strip_fragments

        # Whether URI queries are stripped by {#sanitize_url}
        attr_accessor :strip_query
      end
    end

    #
    # Sets up the sanitization rules from the given options.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Boolean] :strip_fragments (true)
    #   Whether the fragment component is removed from URLs.
    #
    # @option options [Boolean] :strip_query (false)
    #   Whether the query component is removed from URLs.
    #
    # @since 0.2.2
    #
    def initialize(options={})
      # An explicitly supplied value (even false/nil) overrides the default.
      @strip_fragments = options.fetch(:strip_fragments, true)
      @strip_query = (options[:strip_query] || false)
    end

    #
    # Applies the configured sanitization rules to a URL.
    #
    # @param [URI::HTTP, URI::HTTPS, String] url
    #   The URL to sanitize.
    #
    # @return [URI::HTTP, URI::HTTPS]
    #   A freshly parsed copy of the URL with the unwanted components
    #   removed.
    #
    # @since 0.2.2
    #
    def sanitize_url(url)
      # Re-parsing from the String form yields a new URI object to modify.
      sanitized = URI(url.to_s)

      sanitized.fragment = nil if @strip_fragments
      sanitized.query = nil if @strip_query

      sanitized
    end
  end
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
require 'spidr/spidr'
|
|
2
|
+
|
|
3
|
+
require 'net/http'
|
|
4
|
+
|
|
5
|
+
module Spidr
  #
  # Caches Net::HTTP session objects, keyed by the scheme, host and port
  # of the URLs they are requested for.
  #
  class SessionCache

    # Proxy to use
    attr_accessor :proxy

    #
    # Creates a new session cache.
    #
    # @param [Hash] proxy (Spidr.proxy)
    #   Proxy options.
    #
    # @option proxy [String] :host
    #   The host the proxy is running on.
    #
    # @option proxy [Integer] :port
    #   The port the proxy is running on.
    #
    # @option proxy [String] :user
    #   The user to authenticate as with the proxy.
    #
    # @option proxy [String] :password
    #   The password to authenticate with.
    #
    # @since 0.2.2
    #
    def initialize(proxy=Spidr.proxy)
      @proxy = proxy
      @sessions = {}
    end

    #
    # Provides an HTTP session for a given URL, creating and caching one
    # if none exists yet for the URL's scheme, host and port.
    #
    # @param [URI::HTTP, String] url
    #   The URL which will be requested later.
    #
    # @return [Net::HTTP]
    #   The cached HTTP session object.
    #
    def [](url)
      url = normalize_url(url)
      key = session_key(url)

      @sessions[key] ||= build_session(url)
    end

    #
    # Destroys an HTTP session for the given scheme, host and port.
    #
    # @param [URI::HTTP, String] url
    #   The URL of the requested session.
    #
    # @return [nil]
    #
    # @since 0.2.2
    #
    def kill!(url)
      key = session_key(normalize_url(url))

      if (session = @sessions[key])
        finish_session(session)
        @sessions.delete(key)
      end

      # Always return nil, as documented, rather than the deleted session.
      nil
    end

    #
    # Clears the session cache.
    #
    # @return [SessionCache]
    #   The cleared session cache.
    #
    # @since 0.2.2
    #
    def clear
      @sessions.each_value { |session| finish_session(session) }
      @sessions.clear

      self
    end

    private

    # Coerces a String (or other object) into a URI; URIs pass through.
    def normalize_url(url)
      url.kind_of?(URI) ? url : URI(url.to_s)
    end

    # Builds the cache key identifying the session for a URI.
    def session_key(url)
      [url.scheme, url.host, url.port]
    end

    # Creates a new (not yet cached) session for a URI, using the proxy.
    def build_session(url)
      session = Net::HTTP::Proxy(
        @proxy[:host],
        @proxy[:port],
        @proxy[:user],
        @proxy[:password]
      ).new(url.host, url.port)

      if url.scheme == 'https'
        session.use_ssl = true
        # NOTE: certificate verification is deliberately disabled here, so
        # HTTPS peers are not authenticated.
        session.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      session
    end

    # Finishes a session, ignoring the IOError raised by sessions that
    # were never started.
    def finish_session(session)
      session.finish
    rescue IOError
      nil
    end

  end
end
|
data/lib/spidr/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module Helpers
  #
  # Spec helpers for asserting on the crawl history of the Agent stored
  # in +@agent+.
  #
  module History
    #
    # Tests whether the URL appears exactly once in the visited URLs.
    #
    # @param [Object] url
    #   The URL to compare (with ==) against each visited URL.
    #
    # @return [Boolean]
    #
    def visited_once?(url)
      @agent.visited_urls.count { |visited_url| visited_url == url } == 1
    end

    #
    # Asks the agent whether the URL was visited.
    #
    def visited_link?(url)
      @agent.visited?(url)
    end

    #
    # Asks the agent whether visiting the URL failed.
    #
    def visit_failed?(url)
      @agent.failed?(url)
    end

    #
    # Expects the URL to have been visited.
    #
    def should_visit_link(url)
      visited_link?(url).should == true
    end

    #
    # Expects the URL to not have been visited.
    #
    def should_ignore_link(url)
      visited_link?(url).should == false
    end

    #
    # Expects the URL to have been visited exactly once.
    #
    def should_visit_once(url)
      visited_once?(url).should == true
    end

    #
    # Expects the URL to be recorded as failed and not as visited.
    #
    def should_fail_link(url)
      visited_link?(url).should == false
      visit_failed?(url).should == true
    end
  end
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
require 'wsoc/config'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
require 'helpers/history'
|
|
6
|
+
|
|
7
|
+
module Helpers
  #
  # Spec helpers which generate examples from the specs published by a
  # running WSOC server, and which run the Agent against its course.
  #
  module WSOC
    include History

    # Base URL of the WSOC server; HOST and PORT may be overridden via ENV.
    SERVER_URL = URI::HTTP.build(
      :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
      :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
    )

    # URL of the JSON specs.
    SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])

    # URL where the course starts.
    COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)

    # Metadata from the specs document, filled in when the module is
    # included.
    COURSE_METADATA = {}

    # Maps a spec's 'behavior' value to the History helper asserting it.
    BEHAVIOR_ASSERTIONS = {
      'visit'  => 'should_visit_link',
      'ignore' => 'should_ignore_link',
      'fail'   => 'should_fail_link'
    }.freeze

    #
    # Downloads the specs from the server and defines one example per
    # spec within the including example group.
    #
    # @param [Module] base
    #   The example group including this module.
    #
    def self.included(base)
      # URI#read (from open-uri) fetches the body and closes the handle;
      # it does not depend on open-uri's Kernel#open override.
      hash = JSON.parse(SPECS_URL.read)
      metadata = hash['metadata']
      specs = hash['specs']

      if metadata.kind_of?(Hash)
        COURSE_METADATA.merge!(metadata)
      end

      if specs.kind_of?(Array)
        specs.each do |spec|
          # String#dump yields an escaped, double-quoted Ruby literal, so
          # the server-supplied text cannot break out of the eval'd code.
          message = spec['message'].dump
          url = spec['url'].dump

          assertion = BEHAVIOR_ASSERTIONS[spec['behavior']]
          next unless assertion

          base.module_eval %{
            it #{message} do
              #{assertion}(#{url})
            end
          }
        end
      end
    end

    #
    # The metadata of the course.
    #
    # @return [Hash]
    #
    def course
      WSOC::COURSE_METADATA
    end

    #
    # The 'auth_store' entry of the course metadata.
    #
    # @return [Hash, nil]
    #   +nil+ when the metadata has no such entry.
    #
    def course_auth_store
      course['auth_store']
    end

    #
    # Runs the Agent against the course, registering the course's
    # credentials first and printing every URL and failed URL.
    #
    def run_course
      Agent.start_at(COURSE_URL) do |agent|
        # The metadata may not define any credentials at all.
        (course_auth_store || {}).each do |path,auth|
          agent.authorized.add(
            COURSE_URL.merge(path),
            auth['user'],
            auth['password']
          )
        end

        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
        agent.every_url { |url| puts url }
      end
    end
  end
end
|
data/spec/page_examples.rb
CHANGED
|
@@ -12,6 +12,10 @@ shared_examples_for "Page" do
|
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
it "should provide transparent access to the response headers" do
|
|
15
|
-
@page.content_type.should == @page.
|
|
15
|
+
@page.content_type.should == @page.response['Content-Type']
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it "should allow content-types" do
|
|
19
|
+
@page.content_types.should_not be_empty
|
|
16
20
|
end
|
|
17
21
|
end
|
data/spec/page_spec.rb
CHANGED
|
@@ -78,4 +78,34 @@ describe Page do
|
|
|
78
78
|
@page.title.should be_nil
|
|
79
79
|
end
|
|
80
80
|
end
|
|
81
|
+
|
|
82
|
+
describe "cookies" do
|
|
83
|
+
before(:all) do
|
|
84
|
+
@page = get_page('http://twitter.com/login')
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it "should provide access to the raw Cookie" do
|
|
88
|
+
cookie = @page.cookie
|
|
89
|
+
|
|
90
|
+
cookie.should_not be_nil
|
|
91
|
+
cookie.should_not be_empty
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
it "should provide access to the Cookies" do
|
|
95
|
+
cookies = @page.cookies
|
|
96
|
+
|
|
97
|
+
cookies.should_not be_empty
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
it "should provide access to the key->value pairs within the Cookie" do
|
|
101
|
+
params = @page.cookie_params
|
|
102
|
+
|
|
103
|
+
params.should_not be_empty
|
|
104
|
+
|
|
105
|
+
params.each do |key,value|
|
|
106
|
+
key.should_not be_empty
|
|
107
|
+
value.should_not be_empty
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
|
81
111
|
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
require 'spidr/sanitizers'
|
|
2
|
+
require 'spidr/agent'
|
|
3
|
+
|
|
4
|
+
require 'spec_helper'
|
|
5
|
+
|
|
6
|
+
# Examples covering Sanitizers#sanitize_url as mixed into Agent.
describe Sanitizers do
  describe "sanitize_url" do
    before(:all) do
      @url = 'http://host.com'
    end

    it "should sanitize URLs" do
      agent = Agent.new
      clean_url = agent.sanitize_url(URI(@url))

      clean_url.host.should == 'host.com'
    end

    it "should sanitize URLs given as Strings" do
      agent = Agent.new
      clean_url = agent.sanitize_url(@url)

      clean_url.host.should == 'host.com'
    end
  end

  describe "strip_fragments" do
    before(:all) do
      @url = URI("http://host.com/page#lol")
    end

    it "should strip fragment components by default" do
      agent = Agent.new
      clean_url = agent.sanitize_url(@url)

      clean_url.fragment.should be_nil
    end

    it "should allow preserving fragment components" do
      agent = Agent.new(:strip_fragments => false)
      clean_url = agent.sanitize_url(@url)

      clean_url.fragment.should == 'lol'
    end
  end

  describe "strip_query" do
    before(:all) do
      @url = URI("http://host.com/page?x=1")
    end

    it "should not strip query components by default" do
      agent = Agent.new
      clean_url = agent.sanitize_url(@url)

      clean_url.query.should == 'x=1'
    end

    it "should allow stripping of query components" do
      agent = Agent.new(:strip_query => true)
      clean_url = agent.sanitize_url(@url)

      clean_url.query.should be_nil
    end
  end
end
|