spidr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.rdoc +191 -0
- data/Manifest.txt +10 -34
- data/{README.txt → README.rdoc} +3 -1
- data/Rakefile +6 -4
- data/lib/spidr/agent.rb +137 -97
- data/lib/spidr/auth_credential.rb +25 -0
- data/lib/spidr/auth_store.rb +157 -0
- data/lib/spidr/cookie_jar.rb +166 -0
- data/lib/spidr/filters.rb +2 -0
- data/lib/spidr/page.rb +75 -11
- data/lib/spidr/sanitizers.rb +59 -0
- data/lib/spidr/session_cache.rb +119 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +2 -2
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +5 -1
- data/spec/page_spec.rb +30 -0
- data/spec/sanitizers_spec.rb +67 -0
- data/tasks/yard.rb +1 -1
- metadata +24 -40
- metadata.gz.sig +0 -0
- data/History.txt +0 -167
- data/spec/helpers/course.rb +0 -95
- data/static/course/absolute/index.html +0 -10
- data/static/course/absolute/next.html +0 -9
- data/static/course/absolute/start.html +0 -19
- data/static/course/empty/index.html +0 -10
- data/static/course/empty/start.html +0 -23
- data/static/course/fail.html +0 -14
- data/static/course/frames/frame.html +0 -15
- data/static/course/frames/frame_next.html +0 -9
- data/static/course/frames/iframe.html +0 -15
- data/static/course/frames/iframe_next.html +0 -9
- data/static/course/frames/index.html +0 -10
- data/static/course/frames/start.html +0 -15
- data/static/course/index.html +0 -10
- data/static/course/javascript/index.html +0 -10
- data/static/course/javascript/start.html +0 -19
- data/static/course/loop/index.html +0 -10
- data/static/course/loop/next.html +0 -13
- data/static/course/loop/start.html +0 -19
- data/static/course/relative/current_directory.html +0 -9
- data/static/course/relative/index.html +0 -10
- data/static/course/relative/normal.html +0 -9
- data/static/course/relative/same_directory.html +0 -9
- data/static/course/relative/start.html +0 -27
- data/static/course/remote/index.html +0 -10
- data/static/course/remote/next.html +0 -9
- data/static/course/remote/start.html +0 -27
- data/static/course/scripts/course.js +0 -29
- data/static/course/scripts/jquery-1.2.6.min.js +0 -32
- data/static/course/specs.json +0 -1
- data/static/course/start.html +0 -27
- data/tasks/course.rb +0 -63
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Spidr
|
|
2
|
+
class AuthCredential
|
|
3
|
+
|
|
4
|
+
# The username
|
|
5
|
+
attr_reader :username
|
|
6
|
+
|
|
7
|
+
# The password
|
|
8
|
+
attr_reader :password
|
|
9
|
+
|
|
10
|
+
#
|
|
11
|
+
# Creates a new credential used for authentication.
|
|
12
|
+
#
|
|
13
|
+
# @param [String] username
|
|
14
|
+
# The username for the credential.
|
|
15
|
+
#
|
|
16
|
+
# @param [String] password
|
|
17
|
+
# The password for the credential.
|
|
18
|
+
#
|
|
19
|
+
def initialize(username,password)
|
|
20
|
+
@username = username
|
|
21
|
+
@password = password
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
require 'spidr/extensions/uri'
|
|
2
|
+
require 'spidr/auth_credential'
|
|
3
|
+
require 'spidr/page'
|
|
4
|
+
|
|
5
|
+
require 'base64'
|
|
6
|
+
|
|
7
|
+
module Spidr
|
|
8
|
+
class AuthStore
|
|
9
|
+
|
|
10
|
+
#
|
|
11
|
+
# Creates a new auth store.
|
|
12
|
+
#
|
|
13
|
+
# @since 0.2.2
|
|
14
|
+
#
|
|
15
|
+
def initialize
|
|
16
|
+
@credentials = {}
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
#
|
|
20
|
+
# Given a URL, return the most specific matching auth credential.
|
|
21
|
+
#
|
|
22
|
+
# @param [URI] url
|
|
23
|
+
# A fully qualified URL, including an optional path.
|
|
24
|
+
#
|
|
25
|
+
# @return [AuthCredential, nil]
|
|
26
|
+
# Closest matching {AuthCredential} values for the URL,
|
|
27
|
+
# or +nil+ if nothing matches.
|
|
28
|
+
#
|
|
29
|
+
# @since 0.2.2
|
|
30
|
+
#
|
|
31
|
+
def [](url)
|
|
32
|
+
# normalize the url
|
|
33
|
+
url = URI(url) unless url.kind_of?(URI)
|
|
34
|
+
|
|
35
|
+
key = [url.scheme, url.host, url.port]
|
|
36
|
+
paths = @credentials[key]
|
|
37
|
+
|
|
38
|
+
return nil unless paths
|
|
39
|
+
|
|
40
|
+
# longest path first
|
|
41
|
+
ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
|
|
42
|
+
|
|
43
|
+
# directories of the path
|
|
44
|
+
path_dirs = URI.expand_path(url.path).split('/')
|
|
45
|
+
|
|
46
|
+
ordered_paths.each do |path|
|
|
47
|
+
return paths[path] if path_dirs[0,path.length] == path
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
return nil
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
#
|
|
54
|
+
# Add an auth credential to the store for supplied base URL.
|
|
55
|
+
#
|
|
56
|
+
# @param [URI] url_base
|
|
57
|
+
# A URL pattern to associate with a set of auth credentials.
|
|
58
|
+
#
|
|
59
|
+
# @param [AuthCredential] auth
|
|
60
|
+
# The auth credential for this URL pattern.
|
|
61
|
+
#
|
|
62
|
+
# @return [AuthCredential]
|
|
63
|
+
# The newly added auth credential.
|
|
64
|
+
#
|
|
65
|
+
# @since 0.2.2
|
|
66
|
+
#
|
|
67
|
+
def []=(url, auth)
|
|
68
|
+
# normalize the url
|
|
69
|
+
url = URI(url) unless url.kind_of?(URI)
|
|
70
|
+
|
|
71
|
+
# normalize the URL path
|
|
72
|
+
path = URI.expand_path(url.path)
|
|
73
|
+
|
|
74
|
+
key = [url.scheme, url.host, url.port]
|
|
75
|
+
|
|
76
|
+
@credentials[key] ||= {}
|
|
77
|
+
@credentials[key][path.split('/')] = auth
|
|
78
|
+
return auth
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
#
|
|
82
|
+
# Convenience method to add username and password credentials
|
|
83
|
+
# for a named URL.
|
|
84
|
+
#
|
|
85
|
+
# @param [URI] url
|
|
86
|
+
# The base URL that requires authorization.
|
|
87
|
+
#
|
|
88
|
+
# @param [String] username
|
|
89
|
+
# The username required to access the URL.
|
|
90
|
+
#
|
|
91
|
+
# @param [String] password
|
|
92
|
+
# The password required to access the URL.
|
|
93
|
+
#
|
|
94
|
+
# @return [AuthCredential]
|
|
95
|
+
# The newly added auth credential.
|
|
96
|
+
#
|
|
97
|
+
# @since 0.2.2
|
|
98
|
+
#
|
|
99
|
+
def add(url, username, password)
|
|
100
|
+
self[url] = AuthCredential.new(username, password)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
#
|
|
104
|
+
# Returns the base64 encoded authorization string for the URL
|
|
105
|
+
# or +nil+ if no authorization exists.
|
|
106
|
+
#
|
|
107
|
+
# @param [URI] url
|
|
108
|
+
# The url.
|
|
109
|
+
#
|
|
110
|
+
# @return [String, nil]
|
|
111
|
+
# The base64 encoded authorization string or +nil+.
|
|
112
|
+
#
|
|
113
|
+
# @since 0.2.2
|
|
114
|
+
#
|
|
115
|
+
def for_url(url)
|
|
116
|
+
if (auth = self[url])
|
|
117
|
+
return Base64.encode64("#{auth.username}:#{auth.password}")
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
#
|
|
122
|
+
# Clear the contents of the auth store.
|
|
123
|
+
#
|
|
124
|
+
# @return [AuthStore]
|
|
125
|
+
# The cleared auth store.
|
|
126
|
+
#
|
|
127
|
+
# @since 0.2.2
|
|
128
|
+
#
|
|
129
|
+
def clear!
|
|
130
|
+
@credentials.clear
|
|
131
|
+
return self
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
#
|
|
135
|
+
# Size of the current auth store (number of URL paths stored).
|
|
136
|
+
#
|
|
137
|
+
# @return [Integer]
|
|
138
|
+
# The size of the auth store.
|
|
139
|
+
#
|
|
140
|
+
# @since 0.2.2
|
|
141
|
+
#
|
|
142
|
+
def size
|
|
143
|
+
@credentials.inject(0) { |res, arr| res + arr[1].length }
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
#
|
|
147
|
+
# Inspects the auth store.
|
|
148
|
+
#
|
|
149
|
+
# @return [String]
|
|
150
|
+
# The inspected version of the auth store.
|
|
151
|
+
#
|
|
152
|
+
def inspect
|
|
153
|
+
"#<#{self.class}: #{@credentials.inspect}>"
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
end
|
|
157
|
+
end
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
require 'spidr/page'
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module Spidr
|
|
6
|
+
class CookieJar
|
|
7
|
+
|
|
8
|
+
include Enumerable
|
|
9
|
+
|
|
10
|
+
#
|
|
11
|
+
# Creates a new Cookie Jar object.
|
|
12
|
+
#
|
|
13
|
+
# @since 0.2.2
|
|
14
|
+
#
|
|
15
|
+
def initialize
|
|
16
|
+
@params = {}
|
|
17
|
+
|
|
18
|
+
@dirty = Set[]
|
|
19
|
+
@cookies = {}
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
#
|
|
23
|
+
# Enumerates over the host-name and cookie value pairs in the
|
|
24
|
+
# cookie jar.
|
|
25
|
+
#
|
|
26
|
+
# @yield [host, cookie]
|
|
27
|
+
# If a block is given, it will be passed each host-name and cookie
|
|
28
|
+
# value pair.
|
|
29
|
+
#
|
|
30
|
+
# @yieldparam [String] host
|
|
31
|
+
# The host-name that the cookie is bound to.
|
|
32
|
+
#
|
|
33
|
+
# @yieldparam [String] cookie
|
|
34
|
+
# The cookie value.
|
|
35
|
+
#
|
|
36
|
+
# @since 0.2.2
|
|
37
|
+
#
|
|
38
|
+
def each(&block)
|
|
39
|
+
@params.each(&block)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
#
|
|
43
|
+
# Return all relevant cookies in a single string for the
|
|
44
|
+
# named host or domain (in browser request format).
|
|
45
|
+
#
|
|
46
|
+
# @param [String] host
|
|
47
|
+
# Host or domain name for cookies.
|
|
48
|
+
#
|
|
49
|
+
# @return [String, nil]
|
|
50
|
+
# The cookie values or +nil+ if the host does not have a cookie in the
|
|
51
|
+
# jar.
|
|
52
|
+
#
|
|
53
|
+
# @since 0.2.2
|
|
54
|
+
#
|
|
55
|
+
def [](host)
|
|
56
|
+
@params[host] ||= {}
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
#
|
|
60
|
+
# Add a cookie to the jar for a particular domain.
|
|
61
|
+
#
|
|
62
|
+
# @param [String] host
|
|
63
|
+
# Host or domain name to associate with the cookie.
|
|
64
|
+
#
|
|
65
|
+
# @param [Hash{String => String}] cookies
|
|
66
|
+
# Cookie params.
|
|
67
|
+
#
|
|
68
|
+
# @since 0.2.2
|
|
69
|
+
#
|
|
70
|
+
def []=(host,cookies)
|
|
71
|
+
collected = self[host]
|
|
72
|
+
|
|
73
|
+
cookies.each do |key,value|
|
|
74
|
+
if collected[key] != value
|
|
75
|
+
collected.merge!(cookies)
|
|
76
|
+
@dirty << host
|
|
77
|
+
|
|
78
|
+
break
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
return cookies
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
#
|
|
86
|
+
# Retrieve cookies for a domain from a page response header.
|
|
87
|
+
#
|
|
88
|
+
# @param [Page] page
|
|
89
|
+
# The response page from which to extract cookie data.
|
|
90
|
+
#
|
|
91
|
+
# @return [Boolean]
|
|
92
|
+
# Specifies whether cookies were added from the page.
|
|
93
|
+
#
|
|
94
|
+
# @since 0.2.2
|
|
95
|
+
#
|
|
96
|
+
def from_page(page)
|
|
97
|
+
cookies = page.cookie_params
|
|
98
|
+
|
|
99
|
+
unless cookies.empty?
|
|
100
|
+
self[page.url.host] = cookies
|
|
101
|
+
return true
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
return false
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
#
|
|
108
|
+
# Returns the pre-encoded Cookie for a given host.
|
|
109
|
+
#
|
|
110
|
+
# @param [String] host
|
|
111
|
+
# The name of the host.
|
|
112
|
+
#
|
|
113
|
+
# @return [String]
|
|
114
|
+
# The encoded Cookie.
|
|
115
|
+
#
|
|
116
|
+
# @since 0.2.2
|
|
117
|
+
#
|
|
118
|
+
def for_host(host)
|
|
119
|
+
if @dirty.include?(host)
|
|
120
|
+
values = []
|
|
121
|
+
|
|
122
|
+
@params[host].each do |name,value|
|
|
123
|
+
values << "#{name}=#{value}"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
@cookies[host] = values.join('; ')
|
|
127
|
+
@dirty.delete(host)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
return @cookies[host]
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
#
|
|
134
|
+
# Clear out the jar, removing all stored cookies.
|
|
135
|
+
#
|
|
136
|
+
# @since 0.2.2
|
|
137
|
+
#
|
|
138
|
+
def clear!
|
|
139
|
+
@params.clear
|
|
140
|
+
|
|
141
|
+
@dirty.clear
|
|
142
|
+
@cookies.clear
|
|
143
|
+
return self
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
#
|
|
147
|
+
# Size of the current cookie jar store.
|
|
148
|
+
#
|
|
149
|
+
# @since 0.2.2
|
|
150
|
+
#
|
|
151
|
+
def size
|
|
152
|
+
@params.size
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
#
|
|
156
|
+
# Inspects the cookie jar.
|
|
157
|
+
#
|
|
158
|
+
# @return [String]
|
|
159
|
+
# The inspected version of the cookie jar.
|
|
160
|
+
#
|
|
161
|
+
def inspect
|
|
162
|
+
"#<#{self.class}: #{@params.inspect}>"
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
end
|
|
166
|
+
end
|
data/lib/spidr/filters.rb
CHANGED
data/lib/spidr/page.rb
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
require 'spidr/extensions/uri'
|
|
2
2
|
|
|
3
|
+
require 'set'
|
|
3
4
|
require 'uri'
|
|
4
5
|
require 'nokogiri'
|
|
5
6
|
|
|
6
7
|
module Spidr
|
|
7
8
|
class Page
|
|
8
9
|
|
|
10
|
+
# Reserved names used within Cookie strings
|
|
11
|
+
RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
|
|
12
|
+
|
|
9
13
|
# URL of the page
|
|
10
14
|
attr_reader :url
|
|
11
15
|
|
|
@@ -141,6 +145,18 @@ module Spidr
|
|
|
141
145
|
@response['Content-Type']
|
|
142
146
|
end
|
|
143
147
|
|
|
148
|
+
#
|
|
149
|
+
# The content types of the page.
|
|
150
|
+
#
|
|
151
|
+
# @return [Array<String>]
|
|
152
|
+
# The values within the Content-Type header.
|
|
153
|
+
#
|
|
154
|
+
# @since 0.2.2
|
|
155
|
+
#
|
|
156
|
+
def content_types
|
|
157
|
+
@headers['content-type']
|
|
158
|
+
end
|
|
159
|
+
|
|
144
160
|
#
|
|
145
161
|
# Determines if the page is plain-text.
|
|
146
162
|
#
|
|
@@ -148,7 +164,7 @@ module Spidr
|
|
|
148
164
|
# Specifies whether the page is plain-text.
|
|
149
165
|
#
|
|
150
166
|
def plain_text?
|
|
151
|
-
(
|
|
167
|
+
content_types.include?('text/plain')
|
|
152
168
|
end
|
|
153
169
|
|
|
154
170
|
alias txt? plain_text?
|
|
@@ -160,7 +176,7 @@ module Spidr
|
|
|
160
176
|
# Specifies whether the page is HTML document.
|
|
161
177
|
#
|
|
162
178
|
def html?
|
|
163
|
-
(
|
|
179
|
+
content_types.include?('text/html')
|
|
164
180
|
end
|
|
165
181
|
|
|
166
182
|
#
|
|
@@ -170,7 +186,7 @@ module Spidr
|
|
|
170
186
|
# Specifies whether the page is XML document.
|
|
171
187
|
#
|
|
172
188
|
def xml?
|
|
173
|
-
(
|
|
189
|
+
content_types.include?('text/xml')
|
|
174
190
|
end
|
|
175
191
|
|
|
176
192
|
#
|
|
@@ -180,7 +196,7 @@ module Spidr
|
|
|
180
196
|
# Specifies whether the page is XML Stylesheet (XSL).
|
|
181
197
|
#
|
|
182
198
|
def xsl?
|
|
183
|
-
(
|
|
199
|
+
content_types.include?('text/xsl')
|
|
184
200
|
end
|
|
185
201
|
|
|
186
202
|
#
|
|
@@ -190,7 +206,8 @@ module Spidr
|
|
|
190
206
|
# Specifies whether the page is JavaScript.
|
|
191
207
|
#
|
|
192
208
|
def javascript?
|
|
193
|
-
(
|
|
209
|
+
content_types.include?('text/javascript') || \
|
|
210
|
+
content_types.include?('application/javascript')
|
|
194
211
|
end
|
|
195
212
|
|
|
196
213
|
#
|
|
@@ -200,7 +217,7 @@ module Spidr
|
|
|
200
217
|
# Specifies whether the page is a CSS stylesheet.
|
|
201
218
|
#
|
|
202
219
|
def css?
|
|
203
|
-
(
|
|
220
|
+
content_types.include?('text/css')
|
|
204
221
|
end
|
|
205
222
|
|
|
206
223
|
#
|
|
@@ -210,7 +227,8 @@ module Spidr
|
|
|
210
227
|
# Specifies whether the page is a RSS feed.
|
|
211
228
|
#
|
|
212
229
|
def rss?
|
|
213
|
-
(
|
|
230
|
+
content_types.include?('application/rss+xml') || \
|
|
231
|
+
content_types.include?('application/rdf+xml')
|
|
214
232
|
end
|
|
215
233
|
|
|
216
234
|
#
|
|
@@ -220,7 +238,7 @@ module Spidr
|
|
|
220
238
|
# Specifies whether the page is an Atom feed.
|
|
221
239
|
#
|
|
222
240
|
def atom?
|
|
223
|
-
(
|
|
241
|
+
content_types.include?('application/atom+xml')
|
|
224
242
|
end
|
|
225
243
|
|
|
226
244
|
#
|
|
@@ -230,7 +248,7 @@ module Spidr
|
|
|
230
248
|
# Specifies whether the page is a MS Word document.
|
|
231
249
|
#
|
|
232
250
|
def ms_word?
|
|
233
|
-
(
|
|
251
|
+
content_types.include?('application/msword')
|
|
234
252
|
end
|
|
235
253
|
|
|
236
254
|
#
|
|
@@ -240,7 +258,7 @@ module Spidr
|
|
|
240
258
|
# Specifies whether the page is a PDF document.
|
|
241
259
|
#
|
|
242
260
|
def pdf?
|
|
243
|
-
(
|
|
261
|
+
content_types.include?('application/pdf')
|
|
244
262
|
end
|
|
245
263
|
|
|
246
264
|
#
|
|
@@ -250,7 +268,53 @@ module Spidr
|
|
|
250
268
|
# Specifies whether the page is a ZIP archive.
|
|
251
269
|
#
|
|
252
270
|
def zip?
|
|
253
|
-
(
|
|
271
|
+
content_types.include?('application/zip')
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
#
|
|
275
|
+
# The raw Cookie String sent along with the page.
|
|
276
|
+
#
|
|
277
|
+
# @return [String]
|
|
278
|
+
# The raw Cookie from the response.
|
|
279
|
+
#
|
|
280
|
+
# @since 0.2.2
|
|
281
|
+
#
|
|
282
|
+
def cookie
|
|
283
|
+
(@response['Set-Cookie'] || '')
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
#
|
|
287
|
+
# The Cookie values sent along with the page.
|
|
288
|
+
#
|
|
289
|
+
# @return [Array<String>]
|
|
290
|
+
# The Cookies from the response.
|
|
291
|
+
#
|
|
292
|
+
# @since 0.2.2
|
|
293
|
+
#
|
|
294
|
+
def cookies
|
|
295
|
+
(@headers['set-cookie'] || [])
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
#
|
|
299
|
+
# The Cookie key -> value pairs returned with the response.
|
|
300
|
+
#
|
|
301
|
+
# @return [Hash{String => String}]
|
|
302
|
+
# The cookie keys and values.
|
|
303
|
+
#
|
|
304
|
+
# @since 0.2.2
|
|
305
|
+
#
|
|
306
|
+
def cookie_params
|
|
307
|
+
params = {}
|
|
308
|
+
|
|
309
|
+
cookies.each do |key_value|
|
|
310
|
+
key, value = key_value.split('=',2)
|
|
311
|
+
|
|
312
|
+
next if RESERVED_COOKIE_NAMES.include?(key)
|
|
313
|
+
|
|
314
|
+
params[key] = (value || '')
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
return params
|
|
254
318
|
end
|
|
255
319
|
|
|
256
320
|
#
|