spidr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.rdoc +191 -0
  3. data/Manifest.txt +10 -34
  4. data/{README.txt → README.rdoc} +3 -1
  5. data/Rakefile +6 -4
  6. data/lib/spidr/agent.rb +137 -97
  7. data/lib/spidr/auth_credential.rb +25 -0
  8. data/lib/spidr/auth_store.rb +157 -0
  9. data/lib/spidr/cookie_jar.rb +166 -0
  10. data/lib/spidr/filters.rb +2 -0
  11. data/lib/spidr/page.rb +75 -11
  12. data/lib/spidr/sanitizers.rb +59 -0
  13. data/lib/spidr/session_cache.rb +119 -0
  14. data/lib/spidr/version.rb +1 -1
  15. data/spec/agent_spec.rb +2 -2
  16. data/spec/helpers/history.rb +34 -0
  17. data/spec/helpers/wsoc.rb +83 -0
  18. data/spec/page_examples.rb +5 -1
  19. data/spec/page_spec.rb +30 -0
  20. data/spec/sanitizers_spec.rb +67 -0
  21. data/tasks/yard.rb +1 -1
  22. metadata +24 -40
  23. metadata.gz.sig +0 -0
  24. data/History.txt +0 -167
  25. data/spec/helpers/course.rb +0 -95
  26. data/static/course/absolute/index.html +0 -10
  27. data/static/course/absolute/next.html +0 -9
  28. data/static/course/absolute/start.html +0 -19
  29. data/static/course/empty/index.html +0 -10
  30. data/static/course/empty/start.html +0 -23
  31. data/static/course/fail.html +0 -14
  32. data/static/course/frames/frame.html +0 -15
  33. data/static/course/frames/frame_next.html +0 -9
  34. data/static/course/frames/iframe.html +0 -15
  35. data/static/course/frames/iframe_next.html +0 -9
  36. data/static/course/frames/index.html +0 -10
  37. data/static/course/frames/start.html +0 -15
  38. data/static/course/index.html +0 -10
  39. data/static/course/javascript/index.html +0 -10
  40. data/static/course/javascript/start.html +0 -19
  41. data/static/course/loop/index.html +0 -10
  42. data/static/course/loop/next.html +0 -13
  43. data/static/course/loop/start.html +0 -19
  44. data/static/course/relative/current_directory.html +0 -9
  45. data/static/course/relative/index.html +0 -10
  46. data/static/course/relative/normal.html +0 -9
  47. data/static/course/relative/same_directory.html +0 -9
  48. data/static/course/relative/start.html +0 -27
  49. data/static/course/remote/index.html +0 -10
  50. data/static/course/remote/next.html +0 -9
  51. data/static/course/remote/start.html +0 -27
  52. data/static/course/scripts/course.js +0 -29
  53. data/static/course/scripts/jquery-1.2.6.min.js +0 -32
  54. data/static/course/specs.json +0 -1
  55. data/static/course/start.html +0 -27
  56. data/tasks/course.rb +0 -63
data.tar.gz.sig CHANGED
Binary file
data/History.rdoc ADDED
@@ -0,0 +1,191 @@
1
+ === 0.2.2 / 2010-01-06
2
+
3
+ * Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
4
+ * Integrated the new WSOC into the specs.
5
+ * Removed the built-in Web Spider Obstacle Course.
6
+ * Added {Spidr::Page#content_types}.
7
+ * Added {Spidr::Page#cookie}.
8
+ * Added {Spidr::Page#cookies}.
9
+ * Added {Spidr::Page#cookie_params}.
10
+ * Added {Spidr::Sanitizers}.
11
+ * Added {Spidr::SessionCache}.
12
+ * Added {Spidr::CookieJar} (thanks Nick Plante).
13
+ * Added {Spidr::AuthStore} (thanks Nick Plante).
14
+ * Added {Spidr::Agent#post_page} (thanks Nick Plante).
15
+ * Renamed Spidr::Agent#get_session to {Spidr::SessionCache#[]}.
16
+ * Renamed Spidr::Agent#kill_session to {Spidr::SessionCache#kill!}.
17
+
18
+ === 0.2.1 / 2009-11-25
19
+
20
+ * Added {Spidr::Events#every_ok_page}.
21
+ * Added {Spidr::Events#every_redirect_page}.
22
+ * Added {Spidr::Events#every_timedout_page}.
23
+ * Added {Spidr::Events#every_bad_request_page}.
24
+ * Added {Spidr::Events#every_unauthorized_page}.
25
+ * Added {Spidr::Events#every_forbidden_page}.
26
+ * Added {Spidr::Events#every_missing_page}.
27
+ * Added {Spidr::Events#every_internal_server_error_page}.
28
+ * Added {Spidr::Events#every_txt_page}.
29
+ * Added {Spidr::Events#every_html_page}.
30
+ * Added {Spidr::Events#every_xml_page}.
31
+ * Added {Spidr::Events#every_xsl_page}.
32
+ * Added {Spidr::Events#every_doc}.
33
+ * Added {Spidr::Events#every_html_doc}.
34
+ * Added {Spidr::Events#every_xml_doc}.
35
+ * Added {Spidr::Events#every_xsl_doc}.
36
+ * Added {Spidr::Events#every_rss_doc}.
37
+ * Added {Spidr::Events#every_atom_doc}.
38
+ * Added {Spidr::Events#every_javascript_page}.
39
+ * Added {Spidr::Events#every_css_page}.
40
+ * Added {Spidr::Events#every_rss_page}.
41
+ * Added {Spidr::Events#every_atom_page}.
42
+ * Added {Spidr::Events#every_ms_word_page}.
43
+ * Added {Spidr::Events#every_pdf_page}.
44
+ * Added {Spidr::Events#every_zip_page}.
45
+ * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
46
+ requesting pages.
47
+ * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
48
+
49
+ === 0.2.0 / 2009-10-10
50
+
51
+ * Added {URI.expand_path}.
52
+ * Added {Spidr::Page#search}.
53
+ * Added {Spidr::Page#at}.
54
+ * Added {Spidr::Page#title}.
55
+ * Added {Spidr::Agent#failures=}.
56
+ * Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
57
+ * Added Spidr::Agent#get_session.
58
+ * Added Spidr::Agent#kill_session.
59
+ * Added {Spidr.proxy=}.
60
+ * Added {Spidr.disable_proxy!}.
61
+ * Aliased Spidr::Page#txt? to {Spidr::Page#plain_text?}.
62
+ * Aliased Spidr::Page#ok? to {Spidr::Page#is_ok?}.
63
+ * Aliased Spidr::Page#redirect? to {Spidr::Page#is_redirect?}.
64
+ * Aliased Spidr::Page#unauthorized? to {Spidr::Page#is_unauthorized?}.
65
+ * Aliased Spidr::Page#forbidden? to {Spidr::Page#is_forbidden?}.
66
+ * Aliased Spidr::Page#missing? to {Spidr::Page#is_missing?}.
67
+ * Split URL filtering code out of {Spidr::Agent} and into
68
+ {Spidr::Filters}.
69
+ * Split URL / Page event code out of {Spidr::Agent} and into
70
+ {Spidr::Events}.
71
+ * Split pause! / continue! / skip_link! / skip_page! methods out of
72
+ {Spidr::Agent} and into {Spidr::Actions}.
73
+ * Fixed a bug in {Spidr::Page#code}, where it was not returning an Integer.
74
+ * Make sure {Spidr::Page#doc} returns Nokogiri::XML::Document objects for
75
+ RSS/RDF/Atom pages as well.
76
+ * Fixed the handling of the Location header in {Spidr::Page#links}
77
+ (thanks falter).
78
+ * Fixed a bug in {Spidr::Page#to_absolute} where trailing '/' characters on
79
+ URI paths were not being preserved (thanks falter).
80
+ * Fixed a bug where the URI query was not being sent with the request
81
+ in {Spidr::Agent#get_page} (thanks Damian Steer).
82
+ * Fixed a bug where SSL sessions were not being properly set up
83
+ (thanks falter).
84
+ * Switched {Spidr::Agent#history} to be a Set, to improve search-time
85
+ of the history (thanks falter).
86
+ * Switched {Spidr::Agent#failures} to a Set.
87
+ * Allow a block to be passed to {Spidr::Agent#run}, which will receive all
88
+ pages visited.
89
+ * Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks
90
+ to {Spidr::Agent#run}.
91
+ * Made {Spidr::Agent#visit_page} public.
92
+ * Moved to YARD based documentation.
93
+
94
+ === 0.1.9 / 2009-06-13
95
+
96
+ * Upgraded to Hoe 2.0.0.
97
+ * Use Hoe.spec instead of Hoe.new.
98
+ * Use the Hoe signing task for signed gems.
99
+ * Added the Spidr::Agent#schemes and Spidr::Agent#schemes= methods.
100
+ * Added a warning message if 'net/https' cannot be loaded.
101
+ * Allow the list of acceptable URL schemes to be passed into
102
+ {Spidr::Agent#initialize}.
103
+ * Allow history and queue information to be passed into
104
+ {Spidr::Agent#initialize}.
105
+ * {Spidr::Agent#start_at} no longer clears the history or the queue.
106
+ * Fixed a bug in the sanitization of semi-escaped URLs.
107
+ * Fixed a bug where https URLs would be followed even if 'net/https'
108
+ could not be loaded.
109
+ * Removed Spidr::Agent::SCHEMES.
110
+
111
+ === 0.1.8 / 2009-05-27
112
+
113
+ * Added the Spidr::Agent#pause! and Spidr::Agent#continue! methods.
114
+ * Added the Spidr::Agent#running? and Spidr::Agent#paused? methods.
115
+ * Added an alias for pending_urls to the queue methods.
116
+ * Added {Spidr::Agent#queue} to provide read access to the queue.
117
+ * Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
118
+ queue and history.
119
+ * Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
120
+ history.
121
+ * Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
122
+ * Added more specs.
123
+
124
+ === 0.1.7 / 2009-04-24
125
+
126
+ * Added Spidr::Agent#all_headers.
127
+ * Fixed a bug where Page#headers was always +nil+.
128
+ * {Spidr::Agent} will now follow the Location header in HTTP 300,
129
+ 301, 302, 303 and 307 Redirects.
130
+ * {Spidr::Agent} will now follow iframe and frame tags.
131
+
132
+ === 0.1.6 / 2009-04-14
133
+
134
+ * Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
135
+ * Added {Spidr::Agent#failed?}.
136
+ * Added Spidr::Agent#every_failed_url.
137
+ * Added {Spidr::Agent#clear}, which clears the history and failures URL
138
+ lists.
139
+ * Improved fault tolerance in {Spidr::Agent#get_page}.
140
+ * If a Network or HTTP error is encountered, the URL will be added to
141
+ the failures list and the next URL will be visited.
142
+ * Fixed a typo in Spidr::Agent#ignore_exts_like.
143
+ * Updated the Web Spider Obstacle Course with links that always fail to be
144
+ visited.
145
+
146
+ === 0.1.5 / 2009-03-22
147
+
148
+ * Catch malformed URIs in {Spidr::Page#to_absolute} and return +nil+.
149
+ * Filter out +nil+ URIs in {Spidr::Page#urls}.
150
+
151
+ === 0.1.4 / 2009-01-15
152
+
153
+ * Use Nokogiri for HTML and XML parsing.
154
+
155
+ === 0.1.3 / 2009-01-10
156
+
157
+ * Added the :host options to {Spidr::Agent#initialize}.
158
+ * Added the Web Spider Obstacle Course files to the Manifest.
159
+ * Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
160
+
161
+ === 0.1.2 / 2008-11-06
162
+
163
+ * Fixed a bug in {Spidr::Page#to_absolute} where URLs with no path were not
164
+ receiving a default path of <tt>/</tt>.
165
+ * Fixed a bug in {Spidr::Page#to_absolute} where URL paths were not being
166
+ expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
167
+ * Fixed a bug where absolute URLs could have a blank path, thus causing
168
+ {Spidr::Agent#get_page} to crash when it performed the HTTP request.
169
+ * Added RSpec spec tests.
170
+ * Created a Web-Spider Obstacle Course
171
+ (http://spidr.rubyforge.org/course/start.html) which is used in the spec
172
+ tests.
173
+
174
+ === 0.1.1 / 2008-10-04
175
+
176
+ * Added a reader method for the response instance variable in Page.
177
+ * Fixed a bug in {Spidr::Page#method_missing}.
178
+
179
+ === 0.1.0 / 2008-05-23
180
+
181
+ * Initial release.
182
+ * Black-list or white-list URLs based upon:
183
+ * Host name
184
+ * Port number
185
+ * Full link
186
+ * URL extension
187
+ * Provides call-backs for:
188
+ * Every visited Page.
189
+ * Every visited URL.
190
+ * Every visited URL that matches a specified pattern.
191
+
data/Manifest.txt CHANGED
@@ -1,11 +1,12 @@
1
- History.txt
1
+ History.rdoc
2
2
  Manifest.txt
3
- README.txt
3
+ README.rdoc
4
4
  Rakefile
5
5
  lib/spidr.rb
6
6
  lib/spidr/extensions.rb
7
7
  lib/spidr/extensions/uri.rb
8
8
  lib/spidr/page.rb
9
+ lib/spidr/sanitizers.rb
9
10
  lib/spidr/rules.rb
10
11
  lib/spidr/filters.rb
11
12
  lib/spidr/events.rb
@@ -16,50 +17,25 @@ lib/spidr/actions/exceptions/paused.rb
16
17
  lib/spidr/actions/exceptions/skip_link.rb
17
18
  lib/spidr/actions/exceptions/skip_page.rb
18
19
  lib/spidr/actions/actions.rb
20
+ lib/spidr/session_cache.rb
21
+ lib/spidr/cookie_jar.rb
22
+ lib/spidr/auth_credential.rb
23
+ lib/spidr/auth_store.rb
19
24
  lib/spidr/agent.rb
20
25
  lib/spidr/spidr.rb
21
26
  lib/spidr/version.rb
22
27
  tasks/spec.rb
23
28
  tasks/yard.rb
24
- tasks/course.rb
25
29
  spec/spec_helper.rb
26
- spec/helpers/course.rb
30
+ spec/helpers/history.rb
31
+ spec/helpers/wsoc.rb
27
32
  spec/helpers/page.rb
28
33
  spec/extensions/uri_spec.rb
29
34
  spec/page_examples.rb
30
35
  spec/page_spec.rb
31
36
  spec/rules_spec.rb
37
+ spec/sanitizers_spec.rb
32
38
  spec/filters_spec.rb
33
39
  spec/actions_spec.rb
34
40
  spec/agent_spec.rb
35
41
  spec/spidr_spec.rb
36
- static/course/index.html
37
- static/course/start.html
38
- static/course/fail.html
39
- static/course/scripts/jquery-1.2.6.min.js
40
- static/course/scripts/course.js
41
- static/course/empty/index.html
42
- static/course/empty/start.html
43
- static/course/javascript/index.html
44
- static/course/javascript/start.html
45
- static/course/loop/index.html
46
- static/course/loop/start.html
47
- static/course/loop/next.html
48
- static/course/relative/index.html
49
- static/course/relative/start.html
50
- static/course/relative/normal.html
51
- static/course/relative/current_directory.html
52
- static/course/relative/same_directory.html
53
- static/course/absolute/index.html
54
- static/course/absolute/start.html
55
- static/course/absolute/next.html
56
- static/course/remote/index.html
57
- static/course/remote/start.html
58
- static/course/remote/next.html
59
- static/course/frames/index.html
60
- static/course/frames/start.html
61
- static/course/frames/iframe.html
62
- static/course/frames/iframe_next.html
63
- static/course/frames/frame.html
64
- static/course/frames/frame_next.html
65
- static/course/specs.json
@@ -18,7 +18,9 @@ and easy to use.
18
18
  * a tags.
19
19
  * iframe tags.
20
20
  * frame tags.
21
+ * Cookie protected links.
21
22
  * HTTP 300, 301, 302, 303 and 307 Redirects.
23
+ * HTTP Basic Auth protected links.
22
24
  * Black-list or white-list URLs based upon:
23
25
  * URL scheme.
24
26
  * Host name
@@ -156,7 +158,7 @@ and easy to use.
156
158
 
157
159
  The MIT License
158
160
 
159
- Copyright (c) 2008-2009 Hal Brodigan
161
+ Copyright (c) 2008-2010 Hal Brodigan
160
162
 
161
163
  Permission is hereby granted, free of charge, to any person obtaining
162
164
  a copy of this software and associated documentation files (the
data/Rakefile CHANGED
@@ -5,20 +5,22 @@ require 'hoe'
5
5
  require 'hoe/signing'
6
6
  require './tasks/spec.rb'
7
7
  require './tasks/yard.rb'
8
- require './tasks/course.rb'
9
- require './lib/spidr/version.rb'
10
8
 
11
9
  Hoe.spec('spidr') do
12
- self.rubyforge_name = 'spidr'
13
10
  self.developer('Postmodern', 'postmodern.mod3@gmail.com')
11
+
12
+ self.readme_file = 'README.rdoc'
13
+ self.history_file = 'History.rdoc'
14
14
  self.remote_rdoc_dir = 'docs'
15
+
15
16
  self.extra_deps = [
16
17
  ['nokogiri', '>=1.2.0']
17
18
  ]
18
19
 
19
20
  self.extra_dev_deps = [
20
21
  ['rspec', '>=1.2.8'],
21
- ['yard', '>=0.4.0']
22
+ ['yard', '>=0.4.0'],
23
+ ['wsoc', '>=0.1.1']
22
24
  ]
23
25
 
24
26
  self.spec_extras = {:has_rdoc => 'yard'}
data/lib/spidr/agent.rb CHANGED
@@ -1,7 +1,11 @@
1
+ require 'spidr/sanitizers'
1
2
  require 'spidr/filters'
2
3
  require 'spidr/events'
3
4
  require 'spidr/actions'
4
5
  require 'spidr/page'
6
+ require 'spidr/session_cache'
7
+ require 'spidr/cookie_jar'
8
+ require 'spidr/auth_store'
5
9
  require 'spidr/spidr'
6
10
 
7
11
  require 'net/http'
@@ -10,16 +14,17 @@ require 'set'
10
14
  module Spidr
11
15
  class Agent
12
16
 
17
+ include Sanitizers
13
18
  include Filters
14
19
  include Events
15
20
  include Actions
16
21
 
17
- # Proxy to use
18
- attr_accessor :proxy
19
-
20
22
  # User-Agent to use
21
23
  attr_accessor :user_agent
22
24
 
25
+ # HTTP Authentication credentials
26
+ attr_accessor :authorized
27
+
23
28
  # Referer to use
24
29
  attr_accessor :referer
25
30
 
@@ -35,6 +40,9 @@ module Spidr
35
40
  # Queue of URLs to visit
36
41
  attr_reader :queue
37
42
 
43
+ # Cached cookies
44
+ attr_reader :cookies
45
+
38
46
  #
39
47
  # Creates a new Agent object.
40
48
  #
@@ -79,18 +87,19 @@ module Spidr
79
87
  # The newly created agent.
80
88
  #
81
89
  def initialize(options={},&block)
82
- @proxy = (options[:proxy] || Spidr.proxy)
83
90
  @user_agent = (options[:user_agent] || Spidr.user_agent)
84
91
  @referer = options[:referer]
85
92
 
93
+ @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
94
+ @cookies = CookieJar.new
95
+ @authorized = AuthStore.new
96
+
86
97
  @running = false
87
98
  @delay = (options[:delay] || 0)
88
99
  @history = Set[]
89
100
  @failures = Set[]
90
101
  @queue = []
91
102
 
92
- @sessions = {}
93
-
94
103
  super(options)
95
104
 
96
105
  block.call(self) if block
@@ -222,14 +231,6 @@ module Spidr
222
231
 
223
232
  @running = false
224
233
 
225
- @sessions.each_value do |sess|
226
- begin
227
- sess.finish
228
- rescue IOError
229
- nil
230
- end
231
- end
232
-
233
234
  @sessions.clear
234
235
  return self
235
236
  end
@@ -244,6 +245,37 @@ module Spidr
244
245
  @running == true
245
246
  end
246
247
 
248
+ #
249
+ # The proxy information the agent uses.
250
+ #
251
+ # @return [Hash]
252
+ # The proxy information.
253
+ #
254
+ # @see SessionCache#proxy
255
+ #
256
+ # @since 0.2.2
257
+ #
258
+ def proxy
259
+ @sessions.proxy
260
+ end
261
+
262
+ #
263
+ # Sets the proxy information that the agent uses.
264
+ #
265
+ # @param [Hash] new_proxy
266
+ # The new proxy information.
267
+ #
268
+ # @return [Hash]
269
+ # The new proxy information.
270
+ #
271
+ # @see SessionCache#proxy=
272
+ #
273
+ # @since 0.2.2
274
+ #
275
+ def proxy=(new_proxy)
276
+ @sessions.proxy = new_proxy
277
+ end
278
+
247
279
  #
248
280
  # Sets the history of URLs that were previously visited.
249
281
  #
@@ -400,10 +432,11 @@ module Spidr
400
432
  # Specifies whether the URL was enqueued, or ignored.
401
433
  #
402
434
  def enqueue(url)
403
- link = url.to_s
404
- url = URI(link) unless url.kind_of?(URI)
435
+ url = sanitize_url(url)
405
436
 
406
437
  if (!(queued?(url)) && visit?(url))
438
+ link = url.to_s
439
+
407
440
  begin
408
441
  @every_url_blocks.each { |block| block.call(url) }
409
442
 
@@ -443,37 +476,51 @@ module Spidr
443
476
  # The page for the response, or +nil+ if the request failed.
444
477
  #
445
478
  def get_page(url,&block)
446
- url = URI(url.to_s) unless url.kind_of?(URI)
479
+ url = URI(url.to_s)
447
480
 
448
- host = url.host
449
- port = url.port
481
+ prepare_request(url) do |session,path,headers|
482
+ new_page = Page.new(url,session.get(path,headers))
450
483
 
451
- unless url.path.empty?
452
- path = url.path
453
- else
454
- path = '/'
455
- end
484
+ # save any new cookies
485
+ @cookies.from_page(new_page)
456
486
 
457
- # append the URL query to the path
458
- path += "?#{url.query}" if url.query
487
+ block.call(new_page) if block
488
+ return new_page
489
+ end
490
+ end
459
491
 
460
- begin
461
- sleep(@delay) if @delay > 0
492
+ #
493
+ # Posts supplied form data and creates a new Page object from a given URL.
494
+ #
495
+ # @param [URI::HTTP] url
496
+ # The URL to request.
497
+ #
498
+ # @param [String] post_data
499
+ # Form option data.
500
+ #
501
+ # @yield [page]
502
+ # If a block is given, it will be passed the page that represents the
503
+ # response.
504
+ #
505
+ # @yieldparam [Page] page
506
+ # The page for the response.
507
+ #
508
+ # @return [Page, nil]
509
+ # The page for the response, or +nil+ if the request failed.
510
+ #
511
+ # @since 0.2.2
512
+ #
513
+ def post_page(url,post_data='',&block)
514
+ url = URI(url.to_s)
462
515
 
463
- get_session(url.scheme,host,port) do |sess|
464
- headers = {}
465
- headers['User-Agent'] = @user_agent if @user_agent
466
- headers['Referer'] = @referer if @referer
516
+ prepare_request(url) do |session,path,headers|
517
+ new_page = Page.new(url,session.post(path,post_data,headers))
467
518
 
468
- new_page = Page.new(url,sess.get(path,headers))
519
+ # save any new cookies
520
+ @cookies.from_page(new_page)
469
521
 
470
- block.call(new_page) if block
471
- return new_page
472
- end
473
- rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
474
- failed(url)
475
- kill_session(url.scheme,host,port)
476
- return nil
522
+ block.call(new_page) if block
523
+ return new_page
477
524
  end
478
525
  end
479
526
 
@@ -529,73 +576,66 @@ module Spidr
529
576
  protected
530
577
 
531
578
  #
532
- # Provides an active HTTP session for the given scheme, host
533
- # and port.
534
- #
535
- # @param [String] scheme
536
- # The scheme of the URL, which will be requested later.
537
- #
538
- # @param [String] host
539
- # The host that the session is needed with.
579
+ # Normalizes the request path and grabs a session to handle page
580
+ # get and post requests.
540
581
  #
541
- # @param [Integer] port
542
- # The port that the session is needed for.
582
+ # @param [URI::HTTP] url
583
+ # The URL to request.
543
584
  #
544
- # @yield [session]
545
- # If a block is given, it will be passed the active HTTP session.
585
+ # @yield [request]
586
+ # A block whose purpose is to make a page request.
546
587
  #
547
588
  # @yieldparam [Net::HTTP] session
548
- # The active HTTP session object.
549
- #
550
- def get_session(scheme,host,port,&block)
551
- key = [scheme,host,port]
552
-
553
- unless @sessions[key]
554
- session = Net::HTTP::Proxy(
555
- @proxy[:host],
556
- @proxy[:port],
557
- @proxy[:user],
558
- @proxy[:password]
559
- ).new(host,port)
560
-
561
- if scheme == 'https'
562
- session.use_ssl = true
563
- session.verify_mode = OpenSSL::SSL::VERIFY_NONE
564
- end
565
-
566
- @sessions[key] = session
567
- end
568
-
569
- session = @sessions[key]
570
- block.call(session) if block
571
- return session
572
- end
573
-
574
- #
575
- # Destroys an HTTP session for the given scheme, host and port.
589
+ # An HTTP session object.
576
590
  #
577
- # @param [String] scheme
578
- # The scheme of the URL, which was requested through the session.
591
+ # @yieldparam [String] path
592
+ # Normalized URL string.
579
593
  #
580
- # @param [String] host
581
- # The host that the session was connected with.
594
+ # @yieldparam [Hash] headers
595
+ # A Hash of request header options.
582
596
  #
583
- # @param [Integer] port
584
- # The port that the session was connected to.
597
+ # @since 0.2.2
585
598
  #
586
- def kill_session(scheme,host,port,&block)
587
- key = [scheme,host,port]
588
- sess = @sessions[key]
599
+ def prepare_request(url,&block)
600
+ host = url.host
601
+ port = url.port
589
602
 
590
- begin
591
- sess.finish
592
- rescue IOError
593
- nil
603
+ unless url.path.empty?
604
+ path = url.path
605
+ else
606
+ path = '/'
594
607
  end
595
608
 
596
- @sessions.delete(key)
597
- block.call if block
598
- return nil
609
+ # append the URL query to the path
610
+ path += "?#{url.query}" if url.query
611
+
612
+ begin
613
+ sleep(@delay) if @delay > 0
614
+
615
+ headers = {}
616
+ headers['User-Agent'] = @user_agent if @user_agent
617
+ headers['Referer'] = @referer if @referer
618
+
619
+ if (authorization = @authorized.for_url(url))
620
+ headers['Authorization'] = "Basic #{authorization}"
621
+ end
622
+
623
+ if (header_cookies = @cookies.for_host(url.host))
624
+ headers['Cookie'] = header_cookies
625
+ end
626
+
627
+ block.call(@sessions[url],path,headers)
628
+ rescue SystemCallError,
629
+ Timeout::Error,
630
+ SocketError,
631
+ Net::HTTPBadResponse,
632
+ IOError
633
+
634
+ @sessions.kill!(url)
635
+
636
+ failed(url)
637
+ return nil
638
+ end
599
639
  end
600
640
 
601
641
  #
@@ -633,8 +673,8 @@ module Spidr
633
673
  # The URL to add to the failures list.
634
674
  #
635
675
  def failed(url)
636
- @every_failed_url_blocks.each { |block| block.call(url) }
637
676
  @failures << url
677
+ @every_failed_url_blocks.each { |block| block.call(url) }
638
678
  return true
639
679
  end
640
680