spidr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
@@ -0,0 +1,25 @@
1
module Spidr
  #
  # A username / password pair used for authentication.
  #
  class AuthCredential

    # The username of the credential
    attr_reader :username

    # The password of the credential
    attr_reader :password

    #
    # Builds a credential from the given username and password.
    #
    # @param [String] username
    #   The username to store in the credential.
    #
    # @param [String] password
    #   The password to store in the credential.
    #
    def initialize(username,password)
      @username, @password = username, password
    end

  end
end
@@ -0,0 +1,157 @@
1
+ require 'spidr/extensions/uri'
2
+ require 'spidr/auth_credential'
3
+ require 'spidr/page'
4
+
5
+ require 'base64'
6
+
7
module Spidr
  #
  # Stores {AuthCredential} objects, indexed first by the scheme, host and
  # port of a URL and then by the directories of the URL path.
  #
  class AuthStore

    #
    # Creates a new, empty auth store.
    #
    # @since 0.2.2
    #
    def initialize
      @credentials = {}
    end

    #
    # Given a URL, return the most specific matching auth credential.
    #
    # @param [URI, String] url
    #   A fully qualified URL, including an optional path. Anything that
    #   is not already a URI is passed through `URI()`.
    #
    # @return [AuthCredential, nil]
    #   The credential stored under the longest path that is a prefix of
    #   the URL path, or +nil+ if nothing matches.
    #
    # @since 0.2.2
    #
    def [](url)
      # normalize the url
      url = URI(url) unless url.kind_of?(URI)

      paths = @credentials[[url.scheme, url.host, url.port]]
      return nil unless paths

      # directories of the requested path
      path_dirs = URI.expand_path(url.path).split('/')

      # longest (most specific) stored path first
      ordered_paths = paths.keys.sort_by { |dirs| dirs.length }.reverse

      match = ordered_paths.find do |dirs|
        path_dirs[0,dirs.length] == dirs
      end

      return (match ? paths[match] : nil)
    end

    #
    # Add an auth credential to the store for the supplied base URL.
    #
    # @param [URI, String] url
    #   The base URL to associate with the auth credential. Its scheme,
    #   host and port select the bucket; its path is the prefix to match.
    #
    # @param [AuthCredential] auth
    #   The auth credential for this base URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def []=(url, auth)
      # normalize the url
      url = URI(url) unless url.kind_of?(URI)

      # the stored key is the Array of directories of the normalized path
      path_dirs = URI.expand_path(url.path).split('/')
      key = [url.scheme, url.host, url.port]

      (@credentials[key] ||= {})[path_dirs] = auth
      return auth
    end

    #
    # Convenience method to add username and password credentials
    # for a named URL.
    #
    # @param [URI, String] url
    #   The base URL that requires authorization.
    #
    # @param [String] username
    #   The username required to access the URL.
    #
    # @param [String] password
    #   The password required to access the URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def add(url, username, password)
      self[url] = AuthCredential.new(username, password)
    end

    #
    # Returns the base64 encoded authorization string for the URL
    # or +nil+ if no authorization exists.
    #
    # @param [URI, String] url
    #   The url.
    #
    # @return [String, nil]
    #   The base64 encoded `username:password` string, without any
    #   line breaks, or +nil+.
    #
    # @since 0.2.2
    #
    def for_url(url)
      auth = self[url]
      return nil unless auth

      # Base64.encode64 appends "\n" and wraps the output every 60
      # characters; line breaks must not end up in a header value.
      return Base64.encode64("#{auth.username}:#{auth.password}").delete("\n")
    end

    #
    # Clear the contents of the auth store.
    #
    # @return [AuthStore]
    #   The cleared auth store.
    #
    # @since 0.2.2
    #
    def clear!
      @credentials.clear
      return self
    end

    #
    # Size of the current auth store (number of URL paths stored).
    #
    # @return [Integer]
    #   The total number of paths stored across all scheme/host/port keys.
    #
    # @since 0.2.2
    #
    def size
      @credentials.inject(0) { |total,(_key,paths)| total + paths.length }
    end

    #
    # Inspects the auth store.
    #
    # @return [String]
    #   The inspected version of the auth store.
    #
    def inspect
      "#<#{self.class}: #{@credentials.inspect}>"
    end

  end
end
@@ -0,0 +1,166 @@
1
+ require 'spidr/page'
2
+
3
+ require 'set'
4
+
5
module Spidr
  #
  # Collects cookie params per host and caches the encoded Cookie string
  # for each host until its params change.
  #
  class CookieJar

    include Enumerable

    #
    # Creates a new, empty Cookie Jar.
    #
    # @since 0.2.2
    #
    def initialize
      # host => { cookie name => cookie value }
      @params = {}

      # host => encoded Cookie string, rebuilt lazily by #for_host
      @cookies = {}

      # hosts whose encoded Cookie string is out of date
      @dirty = Set[]
    end

    #
    # Enumerates over the host-name and cookie params pairs in the
    # cookie jar.
    #
    # @yield [host, cookies]
    #   If a block is given, it will be passed each host-name and the
    #   cookie params stored for it.
    #
    # @yieldparam [String] host
    #   The host-name that the cookies are bound to.
    #
    # @yieldparam [Hash] cookies
    #   The cookie params stored for the host.
    #
    # @since 0.2.2
    #
    def each(&block)
      @params.each(&block)
    end

    #
    # Returns the cookie params stored for the named host or domain.
    # An empty entry is created for a host that is not yet in the jar.
    #
    # @param [String] host
    #   Host or domain name for cookies.
    #
    # @return [Hash]
    #   The cookie params of the host.
    #
    # @since 0.2.2
    #
    def [](host)
      @params[host] || (@params[host] = {})
    end

    #
    # Add cookies to the jar for a particular domain.
    #
    # @param [String] host
    #   Host or domain name to associate with the cookies.
    #
    # @param [Hash{String => String}] cookies
    #   Cookie params.
    #
    # @since 0.2.2
    #
    def []=(host,cookies)
      stored = self[host]

      # only merge, and invalidate the cached string, when something differs
      changed = cookies.any? { |name,value| stored[name] != value }

      if changed
        stored.merge!(cookies)
        @dirty << host
      end

      return cookies
    end

    #
    # Retrieve cookies for a domain from a page response header.
    #
    # @param [Page] page
    #   The response page from which to extract cookie data.
    #
    # @return [Boolean]
    #   +true+ if the page carried any cookie params, +false+ otherwise.
    #
    # @since 0.2.2
    #
    def from_page(page)
      cookies = page.cookie_params
      return false if cookies.empty?

      self[page.url.host] = cookies
      return true
    end

    #
    # Returns the pre-encoded Cookie for a given host.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [String, nil]
    #   The encoded Cookie, or +nil+ if none has been built for the host.
    #
    # @since 0.2.2
    #
    def for_host(host)
      if @dirty.include?(host)
        pairs = @params[host].map { |name,value| "#{name}=#{value}" }

        @cookies[host] = pairs.join('; ')
        @dirty.delete(host)
      end

      return @cookies[host]
    end

    #
    # Clear out the jar, removing all stored cookies.
    #
    # @return [CookieJar]
    #   The cleared cookie jar.
    #
    # @since 0.2.2
    #
    def clear!
      [@params, @dirty, @cookies].each { |store| store.clear }
      return self
    end

    #
    # Size of the current cookie jar store (number of hosts).
    #
    # @since 0.2.2
    #
    def size
      @params.size
    end

    #
    # Inspects the cookie jar.
    #
    # @return [String]
    #   The inspected version of the cookie jar.
    #
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end

  end
end
data/lib/spidr/filters.rb CHANGED
@@ -47,6 +47,8 @@ module Spidr
47
47
  # The patterns which match the URI path extensions to not visit.
48
48
  #
49
49
  def initialize(options={})
50
+ super(options)
51
+
50
52
  @schemes = []
51
53
 
52
54
  if options[:schemes]
data/lib/spidr/page.rb CHANGED
@@ -1,11 +1,15 @@
1
1
  require 'spidr/extensions/uri'
2
2
 
3
+ require 'set'
3
4
  require 'uri'
4
5
  require 'nokogiri'
5
6
 
6
7
  module Spidr
7
8
  class Page
8
9
 
10
+ # Reserved names used within Cookie strings
11
+ RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
12
+
9
13
  # URL of the page
10
14
  attr_reader :url
11
15
 
@@ -141,6 +145,18 @@ module Spidr
141
145
  @response['Content-Type']
142
146
  end
143
147
 
148
#
# The content types of the page.
#
# @return [Array<String>]
#   The values stored under 'content-type' in the headers, or an empty
#   Array when there is no such entry.
#
# @since 0.2.2
#
def content_types
  # fall back to an empty Array so that callers can always enumerate
  # the result
  @headers['content-type'] || []
end
159
+
144
160
  #
145
161
  # Determines if the page is plain-text.
146
162
  #
@@ -148,7 +164,7 @@ module Spidr
148
164
  # Specifies whether the page is plain-text.
149
165
  #
150
166
  def plain_text?
151
- (content_type =~ /text\/plain/) == 0
167
+ content_types.include?('text/plain')
152
168
  end
153
169
 
154
170
  alias txt? plain_text?
@@ -160,7 +176,7 @@ module Spidr
160
176
  # Specifies whether the page is HTML document.
161
177
  #
162
178
  def html?
163
- (content_type =~ /text\/html/) == 0
179
+ content_types.include?('text/html')
164
180
  end
165
181
 
166
182
  #
@@ -170,7 +186,7 @@ module Spidr
170
186
  # Specifies whether the page is XML document.
171
187
  #
172
188
  def xml?
173
- (content_type =~ /text\/xml/) == 0
189
+ content_types.include?('text/xml')
174
190
  end
175
191
 
176
192
  #
@@ -180,7 +196,7 @@ module Spidr
180
196
  # Specifies whether the page is XML Stylesheet (XSL).
181
197
  #
182
198
  def xsl?
183
- (content_type =~ /text\/xsl/) == 0
199
+ content_types.include?('text/xsl')
184
200
  end
185
201
 
186
202
  #
@@ -190,7 +206,8 @@ module Spidr
190
206
  # Specifies whether the page is JavaScript.
191
207
  #
192
208
  def javascript?
193
- (content_type =~ /(text|application)\/javascript/) == 0
209
+ content_types.include?('text/javascript') || \
210
+ content_types.include?('application/javascript')
194
211
  end
195
212
 
196
213
  #
@@ -200,7 +217,7 @@ module Spidr
200
217
  # Specifies whether the page is a CSS stylesheet.
201
218
  #
202
219
  def css?
203
- (content_type =~ /text\/css/) == 0
220
+ content_types.include?('text/css')
204
221
  end
205
222
 
206
223
  #
@@ -210,7 +227,8 @@ module Spidr
210
227
  # Specifies whether the page is a RSS feed.
211
228
  #
212
229
  def rss?
213
- (content_type =~ /application\/(rss|rdf)\+xml/) == 0
230
+ content_types.include?('application/rss+xml') || \
231
+ content_types.include?('application/rdf+xml')
214
232
  end
215
233
 
216
234
  #
@@ -220,7 +238,7 @@ module Spidr
220
238
  # Specifies whether the page is an Atom feed.
221
239
  #
222
240
  def atom?
223
- (content_type =~ /application\/atom\+xml/) == 0
241
+ content_types.include?('application/atom+xml')
224
242
  end
225
243
 
226
244
  #
@@ -230,7 +248,7 @@ module Spidr
230
248
  # Specifies whether the page is a MS Word document.
231
249
  #
232
250
  def ms_word?
233
- (content_type =~ /application\/msword/) == 0
251
+ content_types.include?('application/msword')
234
252
  end
235
253
 
236
254
  #
@@ -240,7 +258,7 @@ module Spidr
240
258
  # Specifies whether the page is a PDF document.
241
259
  #
242
260
  def pdf?
243
- (content_type =~ /application\/pdf/) == 0
261
+ content_types.include?('application/pdf')
244
262
  end
245
263
 
246
264
  #
@@ -250,7 +268,53 @@ module Spidr
250
268
  # Specifies whether the page is a ZIP archive.
251
269
  #
252
270
  def zip?
253
- (content_type =~ /application\/zip/) == 0
271
+ content_types.include?('application/zip')
272
+ end
273
+
274
#
# The raw Cookie String sent along with the page.
#
# @return [String]
#   The 'Set-Cookie' value of the response, or an empty String
#   if the response has none.
#
# @since 0.2.2
#
def cookie
  raw = @response['Set-Cookie']

  raw ? raw : ''
end
285
+
286
#
# The Cookie values sent along with the page.
#
# @return [Array<String>]
#   The values stored under 'set-cookie' in the headers, or an empty
#   Array if there is no such entry.
#
# @since 0.2.2
#
def cookies
  values = @headers['set-cookie']

  values ? values : []
end
297
+
298
#
# The Cookie key -> value pairs returned with the response.
#
# Only the leading `name=value` pair of each entry in {#cookies} is
# used. Anything after the first `;` (attributes such as `path=/`,
# `expires=...` or `HttpOnly`) is ignored instead of being folded into
# the cookie value.
#
# @return [Hash{String => String}]
#   The cookie keys and values. A cookie without a value maps to an
#   empty String.
#
# @since 0.2.2
#
def cookie_params
  params = {}

  cookies.each do |set_cookie|
    # keep the cookie pair, drop the attributes that follow it
    pair = set_cookie.to_s.split(';',2).first
    next if pair.nil?

    key, value = pair.strip.split('=',2)

    # skip blank entries and entries without a cookie name
    next if (key.nil? || key.empty?)
    next if RESERVED_COOKIE_NAMES.include?(key)

    params[key] = (value || '')
  end

  return params
end
255
319
 
256
320
  #