spidr 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
data.tar.gz.sig CHANGED
Binary file
data/History.rdoc ADDED
@@ -0,0 +1,191 @@
1
+ === 0.2.2 / 2010-01-06
2
+
3
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
4
+ * Integrated the new WSOC into the specs.
5
+ * Removed the built-in Web Spider Obstacle Course.
6
+ * Added {Spidr::Page#content_types}.
7
+ * Added {Spidr::Page#cookie}.
8
+ * Added {Spidr::Page#cookies}.
9
+ * Added {Spidr::Page#cookie_params}.
10
+ * Added {Spidr::Sanitizers}.
11
+ * Added {Spidr::SessionCache}.
12
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
13
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
14
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
15
+ * Renamed Spidr::Agent#get_session to {Spidr::SessionCache#[]}.
16
+ * Renamed Spidr::Agent#kill_session to {Spidr::SessionCache#kill!}.
17
+
18
+ === 0.2.1 / 2009-11-25
19
+
20
+ * Added {Spidr::Events#every_ok_page}.
21
+ * Added {Spidr::Events#every_redirect_page}.
22
+ * Added {Spidr::Events#every_timedout_page}.
23
+ * Added {Spidr::Events#every_bad_request_page}.
24
+ * Added {Spidr::Events#every_unauthorized_page}.
25
+ * Added {Spidr::Events#every_forbidden_page}.
26
+ * Added {Spidr::Events#every_missing_page}.
27
+ * Added {Spidr::Events#every_internal_server_error_page}.
28
+ * Added {Spidr::Events#every_txt_page}.
29
+ * Added {Spidr::Events#every_html_page}.
30
+ * Added {Spidr::Events#every_xml_page}.
31
+ * Added {Spidr::Events#every_xsl_page}.
32
+ * Added {Spidr::Events#every_doc}.
33
+ * Added {Spidr::Events#every_html_doc}.
34
+ * Added {Spidr::Events#every_xml_doc}.
35
+ * Added {Spidr::Events#every_xsl_doc}.
36
+ * Added {Spidr::Events#every_rss_doc}.
37
+ * Added {Spidr::Events#every_atom_doc}.
38
+ * Added {Spidr::Events#every_javascript_page}.
39
+ * Added {Spidr::Events#every_css_page}.
40
+ * Added {Spidr::Events#every_rss_page}.
41
+ * Added {Spidr::Events#every_atom_page}.
42
+ * Added {Spidr::Events#every_ms_word_page}.
43
+ * Added {Spidr::Events#every_pdf_page}.
44
+ * Added {Spidr::Events#every_zip_page}.
45
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
46
+ requesting pages.
47
+ * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
48
+
49
+ === 0.2.0 / 2009-10-10
50
+
51
+ * Added {URI.expand_path}.
52
+ * Added {Spidr::Page#search}.
53
+ * Added {Spidr::Page#at}.
54
+ * Added {Spidr::Page#title}.
55
+ * Added {Spidr::Agent#failures=}.
56
+ * Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
57
+ * Added Spidr::Agent#get_session.
58
+ * Added Spidr::Agent#kill_session.
59
+ * Added {Spidr.proxy=}.
60
+ * Added {Spidr.disable_proxy!}.
61
+ * Aliased Spidr::Page#txt? to {Spidr::Page#plain_text?}.
62
+ * Aliased Spidr::Page#ok? to {Spidr::Page#is_ok?}.
63
+ * Aliased Spidr::Page#redirect? to {Spidr::Page#is_redirect?}.
64
+ * Aliased Spidr::Page#unauthorized? to {Spidr::Page#is_unauthorized?}.
65
+ * Aliased Spidr::Page#forbidden? to {Spidr::Page#is_forbidden?}.
66
+ * Aliased Spidr::Page#missing? to {Spidr::Page#is_missing?}.
67
+ * Split URL filtering code out of {Spidr::Agent} and into
68
+ {Spidr::Filters}.
69
+ * Split URL / Page event code out of {Spidr::Agent} and into
70
+ {Spidr::Events}.
71
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
72
+ {Spidr::Agent} and into {Spidr::Actions}.
73
+ * Fixed a bug in {Spidr::Page#code}, where it was not returning an Integer.
74
+ * Make sure {Spidr::Page#doc} returns Nokogiri::XML::Document objects for
75
+ RSS/RDF/Atom pages as well.
76
+ * Fixed the handling of the Location header in {Spidr::Page#links}
77
+ (thanks falter).
78
+ * Fixed a bug in {Spidr::Page#to_absolute} where trailing '/' characters on
79
+ URI paths were not being preserved (thanks falter).
80
+ * Fixed a bug where the URI query was not being sent with the request
81
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
82
+ * Fixed a bug where SSL sessions were not being properly set up
83
+ (thanks falter).
84
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
85
+ of the history (thanks falter).
86
+ * Switched {Spidr::Agent#failures} to a Set.
87
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
88
+ pages visited.
89
+ * Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks
90
+ to {Spidr::Agent#run}.
91
+ * Made {Spidr::Agent#visit_page} public.
92
+ * Moved to YARD based documentation.
93
+
94
+ === 0.1.9 / 2009-06-13
95
+
96
+ * Upgraded to Hoe 2.0.0.
97
+ * Use Hoe.spec instead of Hoe.new.
98
+ * Use the Hoe signing task for signed gems.
99
+ * Added the Spidr::Agent#schemes and Spidr::Agent#schemes= methods.
100
+ * Added a warning message if 'net/https' cannot be loaded.
101
+ * Allow the list of acceptable URL schemes to be passed into
102
+ {Spidr::Agent#initialize}.
103
+ * Allow history and queue information to be passed into
104
+ {Spidr::Agent#initialize}.
105
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
106
+ * Fixed a bug in the sanitization of semi-escaped URLs.
107
+ * Fixed a bug where https URLs would be followed even if 'net/https'
108
+ could not be loaded.
109
+ * Removed Spidr::Agent::SCHEMES.
110
+
111
+ === 0.1.8 / 2009-05-27
112
+
113
+ * Added the Spidr::Agent#pause! and Spidr::Agent#continue! methods.
114
+ * Added the Spidr::Agent#running? and Spidr::Agent#paused? methods.
115
+ * Added an alias for pending_urls to the queue methods.
116
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
117
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
118
+ queue and history.
119
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
120
+ history.
121
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
122
+ * Added more specs.
123
+
124
+ === 0.1.7 / 2009-04-24
125
+
126
+ * Added Spidr::Agent#all_headers.
127
+ * Fixed a bug where Page#headers was always +nil+.
128
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
129
+ 301, 302, 303 and 307 Redirects.
130
+ * {Spidr::Agent} will now follow iframe and frame tags.
131
+
132
+ === 0.1.6 / 2009-04-14
133
+
134
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
135
+ * Added {Spidr::Agent#failed?}.
136
+ * Added Spidr::Agent#every_failed_url.
137
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
138
+ lists.
139
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
140
+ * If a Network or HTTP error is encountered, the URL will be added to
141
+ the failures list and the next URL will be visited.
142
+ * Fixed a typo in Spidr::Agent#ignore_exts_like.
143
+ * Updated the Web Spider Obstacle Course with links that always fail to be
144
+ visited.
145
+
146
+ === 0.1.5 / 2009-03-22
147
+
148
+ * Catch malformed URIs in {Spidr::Page#to_absolute} and return +nil+.
149
+ * Filter out +nil+ URIs in {Spidr::Page#urls}.
150
+
151
+ === 0.1.4 / 2009-01-15
152
+
153
+ * Use Nokogiri for HTML and XML parsing.
154
+
155
+ === 0.1.3 / 2009-01-10
156
+
157
+ * Added the :host option to {Spidr::Agent#initialize}.
158
+ * Added the Web Spider Obstacle Course files to the Manifest.
159
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
160
+
161
+ === 0.1.2 / 2008-11-06
162
+
163
+ * Fixed a bug in {Spidr::Page#to_absolute} where URLs with no path were not
164
+ receiving a default path of <tt>/</tt>.
165
+ * Fixed a bug in {Spidr::Page#to_absolute} where URL paths were not being
166
+ expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
167
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
168
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
169
+ * Added RSpec spec tests.
170
+ * Created a Web-Spider Obstacle Course
171
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
172
+ tests.
173
+
174
+ === 0.1.1 / 2008-10-04
175
+
176
+ * Added a reader method for the response instance variable in Page.
177
+ * Fixed a bug in {Spidr::Page#method_missing}.
178
+
179
+ === 0.1.0 / 2008-05-23
180
+
181
+ * Initial release.
182
+ * Black-list or white-list URLs based upon:
183
+ * Host name
184
+ * Port number
185
+ * Full link
186
+ * URL extension
187
+ * Provides call-backs for:
188
+ * Every visited Page.
189
+ * Every visited URL.
190
+ * Every visited URL that matches a specified pattern.
191
+
data/Manifest.txt CHANGED
@@ -1,11 +1,12 @@
1
- History.txt
1
+ History.rdoc
2
2
  Manifest.txt
3
- README.txt
3
+ README.rdoc
4
4
  Rakefile
5
5
  lib/spidr.rb
6
6
  lib/spidr/extensions.rb
7
7
  lib/spidr/extensions/uri.rb
8
8
  lib/spidr/page.rb
9
+ lib/spidr/sanitizers.rb
9
10
  lib/spidr/rules.rb
10
11
  lib/spidr/filters.rb
11
12
  lib/spidr/events.rb
@@ -16,50 +17,25 @@ lib/spidr/actions/exceptions/paused.rb
16
17
  lib/spidr/actions/exceptions/skip_link.rb
17
18
  lib/spidr/actions/exceptions/skip_page.rb
18
19
  lib/spidr/actions/actions.rb
20
+ lib/spidr/session_cache.rb
21
+ lib/spidr/cookie_jar.rb
22
+ lib/spidr/auth_credential.rb
23
+ lib/spidr/auth_store.rb
19
24
  lib/spidr/agent.rb
20
25
  lib/spidr/spidr.rb
21
26
  lib/spidr/version.rb
22
27
  tasks/spec.rb
23
28
  tasks/yard.rb
24
- tasks/course.rb
25
29
  spec/spec_helper.rb
26
- spec/helpers/course.rb
30
+ spec/helpers/history.rb
31
+ spec/helpers/wsoc.rb
27
32
  spec/helpers/page.rb
28
33
  spec/extensions/uri_spec.rb
29
34
  spec/page_examples.rb
30
35
  spec/page_spec.rb
31
36
  spec/rules_spec.rb
37
+ spec/sanitizers_spec.rb
32
38
  spec/filters_spec.rb
33
39
  spec/actions_spec.rb
34
40
  spec/agent_spec.rb
35
41
  spec/spidr_spec.rb
36
- static/course/index.html
37
- static/course/start.html
38
- static/course/fail.html
39
- static/course/scripts/jquery-1.2.6.min.js
40
- static/course/scripts/course.js
41
- static/course/empty/index.html
42
- static/course/empty/start.html
43
- static/course/javascript/index.html
44
- static/course/javascript/start.html
45
- static/course/loop/index.html
46
- static/course/loop/start.html
47
- static/course/loop/next.html
48
- static/course/relative/index.html
49
- static/course/relative/start.html
50
- static/course/relative/normal.html
51
- static/course/relative/current_directory.html
52
- static/course/relative/same_directory.html
53
- static/course/absolute/index.html
54
- static/course/absolute/start.html
55
- static/course/absolute/next.html
56
- static/course/remote/index.html
57
- static/course/remote/start.html
58
- static/course/remote/next.html
59
- static/course/frames/index.html
60
- static/course/frames/start.html
61
- static/course/frames/iframe.html
62
- static/course/frames/iframe_next.html
63
- static/course/frames/frame.html
64
- static/course/frames/frame_next.html
65
- static/course/specs.json
@@ -18,7 +18,9 @@ and easy to use.
18
18
  * a tags.
19
19
  * iframe tags.
20
20
  * frame tags.
21
+ * Cookie protected links.
21
22
  * HTTP 300, 301, 302, 303 and 307 Redirects.
23
+ * HTTP Basic Auth protected links.
22
24
  * Black-list or white-list URLs based upon:
23
25
  * URL scheme.
24
26
  * Host name
@@ -156,7 +158,7 @@ and easy to use.
156
158
 
157
159
  The MIT License
158
160
 
159
- Copyright (c) 2008-2009 Hal Brodigan
161
+ Copyright (c) 2008-2010 Hal Brodigan
160
162
 
161
163
  Permission is hereby granted, free of charge, to any person obtaining
162
164
  a copy of this software and associated documentation files (the
data/Rakefile CHANGED
@@ -5,20 +5,22 @@ require 'hoe'
5
5
  require 'hoe/signing'
6
6
  require './tasks/spec.rb'
7
7
  require './tasks/yard.rb'
8
- require './tasks/course.rb'
9
- require './lib/spidr/version.rb'
10
8
 
11
9
  Hoe.spec('spidr') do
12
- self.rubyforge_name = 'spidr'
13
10
  self.developer('Postmodern', 'postmodern.mod3@gmail.com')
11
+
12
+ self.readme_file = 'README.rdoc'
13
+ self.history_file = 'History.rdoc'
14
14
  self.remote_rdoc_dir = 'docs'
15
+
15
16
  self.extra_deps = [
16
17
  ['nokogiri', '>=1.2.0']
17
18
  ]
18
19
 
19
20
  self.extra_dev_deps = [
20
21
  ['rspec', '>=1.2.8'],
21
- ['yard', '>=0.4.0']
22
+ ['yard', '>=0.4.0'],
23
+ ['wsoc', '>=0.1.1']
22
24
  ]
23
25
 
24
26
  self.spec_extras = {:has_rdoc => 'yard'}
data/lib/spidr/agent.rb CHANGED
@@ -1,7 +1,11 @@
1
+ require 'spidr/sanitizers'
1
2
  require 'spidr/filters'
2
3
  require 'spidr/events'
3
4
  require 'spidr/actions'
4
5
  require 'spidr/page'
6
+ require 'spidr/session_cache'
7
+ require 'spidr/cookie_jar'
8
+ require 'spidr/auth_store'
5
9
  require 'spidr/spidr'
6
10
 
7
11
  require 'net/http'
@@ -10,16 +14,17 @@ require 'set'
10
14
  module Spidr
11
15
  class Agent
12
16
 
17
+ include Sanitizers
13
18
  include Filters
14
19
  include Events
15
20
  include Actions
16
21
 
17
- # Proxy to use
18
- attr_accessor :proxy
19
-
20
22
  # User-Agent to use
21
23
  attr_accessor :user_agent
22
24
 
25
+ # HTTP Authentication credentials
26
+ attr_accessor :authorized
27
+
23
28
  # Referer to use
24
29
  attr_accessor :referer
25
30
 
@@ -35,6 +40,9 @@ module Spidr
35
40
  # Queue of URLs to visit
36
41
  attr_reader :queue
37
42
 
43
+ # Cached cookies
44
+ attr_reader :cookies
45
+
38
46
  #
39
47
  # Creates a new Agent object.
40
48
  #
@@ -79,18 +87,19 @@ module Spidr
79
87
  # The newly created agent.
80
88
  #
81
89
  def initialize(options={},&block)
82
- @proxy = (options[:proxy] || Spidr.proxy)
83
90
  @user_agent = (options[:user_agent] || Spidr.user_agent)
84
91
  @referer = options[:referer]
85
92
 
93
+ @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
94
+ @cookies = CookieJar.new
95
+ @authorized = AuthStore.new
96
+
86
97
  @running = false
87
98
  @delay = (options[:delay] || 0)
88
99
  @history = Set[]
89
100
  @failures = Set[]
90
101
  @queue = []
91
102
 
92
- @sessions = {}
93
-
94
103
  super(options)
95
104
 
96
105
  block.call(self) if block
@@ -222,14 +231,6 @@ module Spidr
222
231
 
223
232
  @running = false
224
233
 
225
- @sessions.each_value do |sess|
226
- begin
227
- sess.finish
228
- rescue IOError
229
- nil
230
- end
231
- end
232
-
233
234
  @sessions.clear
234
235
  return self
235
236
  end
@@ -244,6 +245,37 @@ module Spidr
244
245
  @running == true
245
246
  end
246
247
 
248
+ #
249
+ # The proxy information the agent uses.
250
+ #
251
+ # @return [Hash]
252
+ # The proxy information.
253
+ #
254
+ # @see SessionCache#proxy
255
+ #
256
+ # @since 0.2.2
257
+ #
258
+ def proxy
259
+ @sessions.proxy
260
+ end
261
+
262
+ #
263
+ # Sets the proxy information that the agent uses.
264
+ #
265
+ # @param [Hash] new_proxy
266
+ # The new proxy information.
267
+ #
268
+ # @return [Hash]
269
+ # The new proxy information.
270
+ #
271
+ # @see SessionCache#proxy=
272
+ #
273
+ # @since 0.2.2
274
+ #
275
+ def proxy=(new_proxy)
276
+ @sessions.proxy = new_proxy
277
+ end
278
+
247
279
  #
248
280
  # Sets the history of URLs that were previously visited.
249
281
  #
@@ -400,10 +432,11 @@ module Spidr
400
432
  # Specifies whether the URL was enqueued, or ignored.
401
433
  #
402
434
  def enqueue(url)
403
- link = url.to_s
404
- url = URI(link) unless url.kind_of?(URI)
435
+ url = sanitize_url(url)
405
436
 
406
437
  if (!(queued?(url)) && visit?(url))
438
+ link = url.to_s
439
+
407
440
  begin
408
441
  @every_url_blocks.each { |block| block.call(url) }
409
442
 
@@ -443,37 +476,51 @@ module Spidr
443
476
  # The page for the response, or +nil+ if the request failed.
444
477
  #
445
478
  def get_page(url,&block)
446
- url = URI(url.to_s) unless url.kind_of?(URI)
479
+ url = URI(url.to_s)
447
480
 
448
- host = url.host
449
- port = url.port
481
+ prepare_request(url) do |session,path,headers|
482
+ new_page = Page.new(url,session.get(path,headers))
450
483
 
451
- unless url.path.empty?
452
- path = url.path
453
- else
454
- path = '/'
455
- end
484
+ # save any new cookies
485
+ @cookies.from_page(new_page)
456
486
 
457
- # append the URL query to the path
458
- path += "?#{url.query}" if url.query
487
+ block.call(new_page) if block
488
+ return new_page
489
+ end
490
+ end
459
491
 
460
- begin
461
- sleep(@delay) if @delay > 0
492
+ #
493
+ # Posts supplied form data and creates a new Page object from a given URL.
494
+ #
495
+ # @param [URI::HTTP] url
496
+ # The URL to request.
497
+ #
498
+ # @param [String] post_data
499
+ # Form option data.
500
+ #
501
+ # @yield [page]
502
+ # If a block is given, it will be passed the page that represents the
503
+ # response.
504
+ #
505
+ # @yieldparam [Page] page
506
+ # The page for the response.
507
+ #
508
+ # @return [Page, nil]
509
+ # The page for the response, or +nil+ if the request failed.
510
+ #
511
+ # @since 0.2.2
512
+ #
513
+ def post_page(url,post_data='',&block)
514
+ url = URI(url.to_s)
462
515
 
463
- get_session(url.scheme,host,port) do |sess|
464
- headers = {}
465
- headers['User-Agent'] = @user_agent if @user_agent
466
- headers['Referer'] = @referer if @referer
516
+ prepare_request(url) do |session,path,headers|
517
+ new_page = Page.new(url,session.post(path,post_data,headers))
467
518
 
468
- new_page = Page.new(url,sess.get(path,headers))
519
+ # save any new cookies
520
+ @cookies.from_page(new_page)
469
521
 
470
- block.call(new_page) if block
471
- return new_page
472
- end
473
- rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
474
- failed(url)
475
- kill_session(url.scheme,host,port)
476
- return nil
522
+ block.call(new_page) if block
523
+ return new_page
477
524
  end
478
525
  end
479
526
 
@@ -529,73 +576,66 @@ module Spidr
529
576
  protected
530
577
 
531
578
  #
532
- # Provides an active HTTP session for the given scheme, host
533
- # and port.
534
- #
535
- # @param [String] scheme
536
- # The scheme of the URL, which will be requested later.
537
- #
538
- # @param [String] host
539
- # The host that the session is needed with.
579
+ # Normalizes the request path and grabs a session to handle page
580
+ # get and post requests.
540
581
  #
541
- # @param [Integer] port
542
- # The port that the session is needed for.
582
+ # @param [URI::HTTP] url
583
+ # The URL to request.
543
584
  #
544
- # @yield [session]
545
- # If a block is given, it will be passed the active HTTP session.
585
+ # @yield [request]
586
+ # A block whose purpose is to make a page request.
546
587
  #
547
588
  # @yieldparam [Net::HTTP] session
548
- # The active HTTP session object.
549
- #
550
- def get_session(scheme,host,port,&block)
551
- key = [scheme,host,port]
552
-
553
- unless @sessions[key]
554
- session = Net::HTTP::Proxy(
555
- @proxy[:host],
556
- @proxy[:port],
557
- @proxy[:user],
558
- @proxy[:password]
559
- ).new(host,port)
560
-
561
- if scheme == 'https'
562
- session.use_ssl = true
563
- session.verify_mode = OpenSSL::SSL::VERIFY_NONE
564
- end
565
-
566
- @sessions[key] = session
567
- end
568
-
569
- session = @sessions[key]
570
- block.call(session) if block
571
- return session
572
- end
573
-
574
- #
575
- # Destroys an HTTP session for the given scheme, host and port.
589
+ # An HTTP session object.
576
590
  #
577
- # @param [String] scheme
578
- # The scheme of the URL, which was requested through the session.
591
+ # @yieldparam [String] path
592
+ # Normalized URL string.
579
593
  #
580
- # @param [String] host
581
- # The host that the session was connected with.
594
+ # @yieldparam [Hash] headers
595
+ # A Hash of request header options.
582
596
  #
583
- # @param [Integer] port
584
- # The port that the session was connected to.
597
+ # @since 0.2.2
585
598
  #
586
- def kill_session(scheme,host,port,&block)
587
- key = [scheme,host,port]
588
- sess = @sessions[key]
599
+ def prepare_request(url,&block)
600
+ host = url.host
601
+ port = url.port
589
602
 
590
- begin
591
- sess.finish
592
- rescue IOError
593
- nil
603
+ unless url.path.empty?
604
+ path = url.path
605
+ else
606
+ path = '/'
594
607
  end
595
608
 
596
- @sessions.delete(key)
597
- block.call if block
598
- return nil
609
+ # append the URL query to the path
610
+ path += "?#{url.query}" if url.query
611
+
612
+ begin
613
+ sleep(@delay) if @delay > 0
614
+
615
+ headers = {}
616
+ headers['User-Agent'] = @user_agent if @user_agent
617
+ headers['Referer'] = @referer if @referer
618
+
619
+ if (authorization = @authorized.for_url(url))
620
+ headers['Authorization'] = "Basic #{authorization}"
621
+ end
622
+
623
+ if (header_cookies = @cookies.for_host(url.host))
624
+ headers['Cookie'] = header_cookies
625
+ end
626
+
627
+ block.call(@sessions[url],path,headers)
628
+ rescue SystemCallError,
629
+ Timeout::Error,
630
+ SocketError,
631
+ Net::HTTPBadResponse,
632
+ IOError
633
+
634
+ @sessions.kill!(url)
635
+
636
+ failed(url)
637
+ return nil
638
+ end
599
639
  end
600
640
 
601
641
  #
@@ -633,8 +673,8 @@ module Spidr
633
673
  # The URL to add to the failures list.
634
674
  #
635
675
  def failed(url)
636
- @every_failed_url_blocks.each { |block| block.call(url) }
637
676
  @failures << url
677
+ @every_failed_url_blocks.each { |block| block.call(url) }
638
678
  return true
639
679
  end
640
680