spidr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
data/lib/spidr/sanitizers.rb ADDED
@@ -0,0 +1,59 @@
1
+ require 'uri'
2
+
3
+ module Spidr
4
+ module Sanitizers
5
+ def self.included(base)
6
+ base.module_eval do
7
+ # Specifies whether the Agent will strip URI fragments
8
+ attr_accessor :strip_fragments
9
+
10
+ # Specifies whether the Agent will strip URI queries
11
+ attr_accessor :strip_query
12
+ end
13
+ end
14
+
15
+ #
16
+ # Initializes the sanitization rules.
17
+ #
18
+ # @param [Hash] options
19
+ # Additional options.
20
+ #
21
+ # @option options [Boolean] :strip_fragments (true)
22
+ # Specifies whether or not to strip the fragment component from URLs.
23
+ #
24
+ # @option options [Boolean] :strip_query (false)
25
+ # Specifies whether or not to strip the query component from URLs.
26
+ #
27
+ # @since 0.2.2
28
+ #
29
+ def initialize(options={})
30
+ @strip_fragments = true
31
+
32
+ if options.has_key?(:strip_fragments)
33
+ @strip_fragments = options[:strip_fragments]
34
+ end
35
+
36
+ @strip_query = (options[:strip_query] || false)
37
+ end
38
+
39
+ #
40
+ # Sanitizes a URL based on filtering options.
41
+ #
42
+ # @param [URI::HTTP, URI::HTTPS, String] url
43
+ # The URL to be sanitized
44
+ #
45
+ # @return [URI::HTTP, URI::HTTPS]
46
+ # The new sanitized URL.
47
+ #
48
+ # @since 0.2.2
49
+ #
50
+ def sanitize_url(url)
51
+ url = URI(url.to_s)
52
+
53
+ url.fragment = nil if @strip_fragments
54
+ url.query = nil if @strip_query
55
+
56
+ return url
57
+ end
58
+ end
59
+ end
data/lib/spidr/session_cache.rb ADDED
@@ -0,0 +1,119 @@
1
+ require 'spidr/spidr'
2
+
3
+ require 'net/http'
4
+
5
+ module Spidr
6
+ class SessionCache
7
+
8
+ # Proxy to use
9
+ attr_accessor :proxy
10
+
11
+ #
12
+ # Creates a new session cache.
13
+ #
14
+ # @param [Hash] proxy (Spidr.proxy)
15
+ # Proxy options.
16
+ #
17
+ # @option proxy [String] :host
18
+ # The host the proxy is running on.
19
+ #
20
+ # @option proxy [Integer] :port
21
+ # The port the proxy is running on.
22
+ #
23
+ # @option proxy [String] :user
24
+ # The user to authenticate as with the proxy.
25
+ #
26
+ # @option proxy [String] :password
27
+ # The password to authenticate with.
28
+ #
29
+ # @since 0.2.2
30
+ #
31
+ def initialize(proxy=Spidr.proxy)
32
+ @proxy = proxy
33
+ @sessions = {}
34
+ end
35
+
36
+ #
37
+ # Provides an active HTTP session for a given URL.
38
+ #
39
+ # @param [URI::HTTP, String] url
40
+ # The URL which will be requested later.
41
+ #
42
+ # @return [Net::HTTP]
43
+ # The active HTTP session object.
44
+ #
45
+ def [](url)
46
+ # normalize the url
47
+ url = URI(url.to_s) unless url.kind_of?(URI)
48
+
49
+ key = [url.scheme, url.host, url.port]
50
+
51
+ unless @sessions[key]
52
+ session = Net::HTTP::Proxy(
53
+ @proxy[:host],
54
+ @proxy[:port],
55
+ @proxy[:user],
56
+ @proxy[:password]
57
+ ).new(url.host,url.port)
58
+
59
+ if url.scheme == 'https'
60
+ session.use_ssl = true
61
+ session.verify_mode = OpenSSL::SSL::VERIFY_NONE
62
+ end
63
+
64
+ @sessions[key] = session
65
+ end
66
+
67
+ return @sessions[key]
68
+ end
69
+
70
+ #
71
+ # Destroys an HTTP session for the given scheme, host and port.
72
+ #
73
+ # @param [URI::HTTP, String] url
74
+ # The URL of the requested session.
75
+ #
76
+ # @return [nil]
77
+ #
78
+ # @since 0.2.2
79
+ #
80
+ def kill!(url)
81
+ # normalize the url
82
+ url = URI(url.to_s) unless url.kind_of?(URI)
83
+
84
+ key = [url.scheme, url.host, url.port]
85
+
86
+ if (sess = @sessions[key])
87
+
88
+ begin
89
+ sess.finish
90
+ rescue IOError
91
+ end
92
+
93
+ @sessions.delete(key)
94
+ end
95
+ end
96
+
97
+ #
98
+ # Clears the session cache.
99
+ #
100
+ # @return [SessionCache]
101
+ # The cleared session cache.
102
+ #
103
+ # @since 0.2.2
104
+ #
105
+ def clear
106
+ @sessions.each_value do |sess|
107
+ begin
108
+ sess.finish
109
+ rescue IOError
110
+ nil
111
+ end
112
+ end
113
+
114
+ @sessions.clear
115
+ return self
116
+ end
117
+
118
+ end
119
+ end
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.1'
3
+ VERSION = '0.2.2'
4
4
  end
data/spec/agent_spec.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  require 'spidr/agent'
2
2
 
3
3
  require 'spec_helper'
4
- require 'helpers/course'
4
+ require 'helpers/wsoc'
5
5
 
6
6
  describe Agent do
7
- include Helpers::Course
7
+ include Helpers::WSOC
8
8
 
9
9
  before(:all) do
10
10
  @agent = run_course
data/spec/helpers/history.rb ADDED
@@ -0,0 +1,34 @@
1
+ module Helpers
2
+ module History
3
+ def visited_once?(url)
4
+ return @agent.visited_urls.select { |visited_url|
5
+ visited_url == url
6
+ }.length == 1
7
+ end
8
+
9
+ def visited_link?(url)
10
+ @agent.visited?(url)
11
+ end
12
+
13
+ def visit_failed?(url)
14
+ @agent.failed?(url)
15
+ end
16
+
17
+ def should_visit_link(url)
18
+ visited_link?(url).should == true
19
+ end
20
+
21
+ def should_ignore_link(url)
22
+ visited_link?(url).should == false
23
+ end
24
+
25
+ def should_visit_once(url)
26
+ visited_once?(url).should == true
27
+ end
28
+
29
+ def should_fail_link(url)
30
+ visited_link?(url).should == false
31
+ visit_failed?(url).should == true
32
+ end
33
+ end
34
+ end
data/spec/helpers/wsoc.rb ADDED
@@ -0,0 +1,83 @@
1
+ require 'wsoc/config'
2
+ require 'open-uri'
3
+ require 'json'
4
+
5
+ require 'helpers/history'
6
+
7
+ module Helpers
8
+ module WSOC
9
+ include History
10
+
11
+ SERVER_URL = URI::HTTP.build(
12
+ :host => (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
13
+ :port => (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
14
+ )
15
+
16
+ SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
17
+
18
+ COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
19
+
20
+ COURSE_METADATA = {}
21
+
22
+ def self.included(base)
23
+ hash = JSON.parse(open(SPECS_URL).read)
24
+ metadata = hash['metadata']
25
+ specs = hash['specs']
26
+
27
+ if metadata.kind_of?(Hash)
28
+ COURSE_METADATA.merge!(metadata)
29
+ end
30
+
31
+ if specs.kind_of?(Array)
32
+ specs.each do |spec|
33
+ message = spec['message'].dump
34
+ url = spec['url'].dump
35
+
36
+ case spec['behavior']
37
+ when 'visit'
38
+ base.module_eval %{
39
+ it #{message} do
40
+ should_visit_link(#{url})
41
+ end
42
+ }
43
+ when 'ignore'
44
+ base.module_eval %{
45
+ it #{message} do
46
+ should_ignore_link(#{url})
47
+ end
48
+ }
49
+ when 'fail'
50
+ base.module_eval %{
51
+ it #{message} do
52
+ should_fail_link(#{url})
53
+ end
54
+ }
55
+ end
56
+ end
57
+ end
58
+ end
59
+
60
+ def course
61
+ WSOC::COURSE_METADATA
62
+ end
63
+
64
+ def course_auth_store
65
+ course['auth_store']
66
+ end
67
+
68
+ def run_course
69
+ Agent.start_at(COURSE_URL) do |agent|
70
+ course_auth_store.each do |path,auth|
71
+ agent.authorized.add(
72
+ COURSE_URL.merge(path),
73
+ auth['user'],
74
+ auth['password']
75
+ )
76
+ end
77
+
78
+ agent.every_failed_url { |url| puts "[FAILED] #{url}" }
79
+ agent.every_url { |url| puts url }
80
+ end
81
+ end
82
+ end
83
+ end
data/spec/page_examples.rb CHANGED
@@ -12,6 +12,10 @@ shared_examples_for "Page" do
12
12
  end
13
13
 
14
14
  it "should provide transparent access to the response headers" do
15
- @page.content_type.should == @page.content_type
15
+ @page.content_type.should == @page.response['Content-Type']
16
+ end
17
+
18
+ it "should allow content-types" do
19
+ @page.content_types.should_not be_empty
16
20
  end
17
21
  end
data/spec/page_spec.rb CHANGED
@@ -78,4 +78,34 @@ describe Page do
78
78
  @page.title.should be_nil
79
79
  end
80
80
  end
81
+
82
+ describe "cookies" do
83
+ before(:all) do
84
+ @page = get_page('http://twitter.com/login')
85
+ end
86
+
87
+ it "should provide access to the raw Cookie" do
88
+ cookie = @page.cookie
89
+
90
+ cookie.should_not be_nil
91
+ cookie.should_not be_empty
92
+ end
93
+
94
+ it "should provide access to the Cookies" do
95
+ cookies = @page.cookies
96
+
97
+ cookies.should_not be_empty
98
+ end
99
+
100
+ it "should provide access to the key->value pairs within the Cookie" do
101
+ params = @page.cookie_params
102
+
103
+ params.should_not be_empty
104
+
105
+ params.each do |key,value|
106
+ key.should_not be_empty
107
+ value.should_not be_empty
108
+ end
109
+ end
110
+ end
81
111
  end
data/spec/sanitizers_spec.rb ADDED
@@ -0,0 +1,67 @@
1
+ require 'spidr/sanitizers'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
+ describe Sanitizers do
7
+ describe "sanitize_url" do
8
+ before(:all) do
9
+ @agent = Agent.new
10
+ @url = 'http://host.com'
11
+ end
12
+
13
+ it "should sanitize URLs" do
14
+ agent = Agent.new
15
+ clean_url = agent.sanitize_url(URI(@url))
16
+
17
+ clean_url.host.should == 'host.com'
18
+ end
19
+
20
+ it "should sanitize URLs given as Strings" do
21
+ agent = Agent.new
22
+ clean_url = agent.sanitize_url(@url)
23
+
24
+ clean_url.host.should == 'host.com'
25
+ end
26
+ end
27
+
28
+ describe "strip_fragments" do
29
+ before(:all) do
30
+ @url = URI("http://host.com/page#lol")
31
+ end
32
+
33
+ it "should strip fragment components by default" do
34
+ agent = Agent.new
35
+ clean_url = agent.sanitize_url(@url)
36
+
37
+ clean_url.fragment.should be_nil
38
+ end
39
+
40
+ it "should allow perserving fragment components" do
41
+ agent = Agent.new(:strip_fragments => false)
42
+ clean_url = agent.sanitize_url(@url)
43
+
44
+ clean_url.fragment.should == 'lol'
45
+ end
46
+ end
47
+
48
+ describe "strip_query" do
49
+ before(:all) do
50
+ @url = URI("http://host.com/page?x=1")
51
+ end
52
+
53
+ it "should not strip query components by default" do
54
+ agent = Agent.new
55
+ clean_url = agent.sanitize_url(@url)
56
+
57
+ clean_url.query.should == 'x=1'
58
+ end
59
+
60
+ it "should allow stripping of query components" do
61
+ agent = Agent.new(:strip_query => true)
62
+ clean_url = agent.sanitize_url(@url)
63
+
64
+ clean_url.query.should be_nil
65
+ end
66
+ end
67
+ end
data/tasks/yard.rb CHANGED
@@ -4,7 +4,7 @@ YARD::Rake::YardocTask.new do |t|
4
4
  t.files = ['lib/**/*.rb']
5
5
  t.options = [
6
6
  '--protected',
7
- '--files', 'History.txt',
7
+ '--files', 'History.rdoc',
8
8
  '--title', 'Spidr'
9
9
  ]
10
10
  end