mechanize 0.7.8 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (96)
  1. data/History.txt +14 -0
  2. data/Manifest.txt +30 -5
  3. data/README.txt +5 -5
  4. data/Rakefile +6 -0
  5. data/{eg → examples}/flickr_upload.rb +0 -0
  6. data/{eg → examples}/mech-dump.rb +0 -0
  7. data/{eg → examples}/proxy_req.rb +0 -0
  8. data/{eg → examples}/rubyforge.rb +0 -0
  9. data/{eg → examples}/spider.rb +0 -0
  10. data/lib/www/mechanize.rb +183 -404
  11. data/lib/www/mechanize/chain.rb +34 -0
  12. data/lib/www/mechanize/chain/auth_headers.rb +79 -0
  13. data/lib/www/mechanize/chain/body_decoding_handler.rb +43 -0
  14. data/lib/www/mechanize/chain/connection_resolver.rb +78 -0
  15. data/lib/www/mechanize/chain/custom_headers.rb +23 -0
  16. data/lib/www/mechanize/chain/handler.rb +9 -0
  17. data/lib/www/mechanize/chain/header_resolver.rb +47 -0
  18. data/lib/www/mechanize/chain/parameter_resolver.rb +23 -0
  19. data/lib/www/mechanize/chain/post_connect_hook.rb +0 -0
  20. data/lib/www/mechanize/chain/pre_connect_hook.rb +22 -0
  21. data/lib/www/mechanize/chain/request_resolver.rb +28 -0
  22. data/lib/www/mechanize/chain/response_body_parser.rb +40 -0
  23. data/lib/www/mechanize/chain/response_header_handler.rb +50 -0
  24. data/lib/www/mechanize/chain/response_reader.rb +41 -0
  25. data/lib/www/mechanize/chain/ssl_resolver.rb +36 -0
  26. data/lib/www/mechanize/chain/uri_resolver.rb +56 -0
  27. data/lib/www/mechanize/cookie.rb +1 -1
  28. data/lib/www/mechanize/file_response.rb +60 -0
  29. data/lib/www/mechanize/form.rb +12 -4
  30. data/lib/www/mechanize/form/field.rb +2 -2
  31. data/lib/www/mechanize/form/file_upload.rb +1 -1
  32. data/lib/www/mechanize/form/option.rb +1 -1
  33. data/lib/www/mechanize/list.rb +4 -0
  34. data/lib/www/mechanize/page.rb +20 -10
  35. data/lib/www/mechanize/util.rb +29 -0
  36. data/mechanize.gemspec +4 -4
  37. data/test/chain/test_argument_validator.rb +14 -0
  38. data/test/chain/test_custom_headers.rb +18 -0
  39. data/test/chain/test_parameter_resolver.rb +35 -0
  40. data/test/chain/test_request_resolver.rb +29 -0
  41. data/test/chain/test_response_reader.rb +24 -0
  42. data/test/helper.rb +3 -1
  43. data/test/servlets.rb +43 -0
  44. data/test/test_authenticate.rb +13 -12
  45. data/test/test_bad_links.rb +1 -1
  46. data/test/test_blank_form.rb +1 -1
  47. data/test/test_checkboxes.rb +1 -1
  48. data/test/test_content_type.rb +1 -1
  49. data/test/test_cookie_class.rb +1 -1
  50. data/test/test_cookie_jar.rb +1 -1
  51. data/test/test_cookies.rb +1 -1
  52. data/test/test_encoded_links.rb +1 -1
  53. data/test/test_errors.rb +1 -1
  54. data/test/test_follow_meta.rb +1 -1
  55. data/test/test_form_action.rb +1 -1
  56. data/test/test_form_as_hash.rb +1 -1
  57. data/test/test_form_button.rb +22 -17
  58. data/test/test_form_no_inputname.rb +1 -1
  59. data/test/test_forms.rb +2 -1
  60. data/test/test_frames.rb +1 -1
  61. data/test/test_get_headers.rb +1 -1
  62. data/test/test_gzipping.rb +1 -1
  63. data/test/test_hash_api.rb +17 -14
  64. data/test/test_history.rb +1 -1
  65. data/test/test_history_added.rb +1 -1
  66. data/test/test_html_unscape_forms.rb +1 -1
  67. data/test/test_if_modified_since.rb +1 -1
  68. data/test/test_keep_alive.rb +1 -1
  69. data/test/test_links.rb +1 -1
  70. data/test/test_mech.rb +18 -11
  71. data/test/test_mechanize_file.rb +1 -1
  72. data/test/test_multi_select.rb +1 -1
  73. data/test/test_no_attributes.rb +1 -1
  74. data/test/test_option.rb +2 -1
  75. data/test/test_page.rb +1 -1
  76. data/test/test_pluggable_parser.rb +1 -1
  77. data/test/test_post_form.rb +1 -1
  78. data/test/test_pretty_print.rb +1 -1
  79. data/test/test_radiobutton.rb +1 -1
  80. data/test/test_redirect_limit_reached.rb +1 -1
  81. data/test/test_referer.rb +1 -1
  82. data/test/test_relative_links.rb +1 -1
  83. data/test/test_response_code.rb +7 -1
  84. data/test/test_save_file.rb +1 -1
  85. data/test/test_scheme.rb +44 -0
  86. data/test/test_select.rb +1 -1
  87. data/test/test_select_all.rb +1 -1
  88. data/test/test_select_none.rb +1 -1
  89. data/test/test_select_noopts.rb +1 -1
  90. data/test/test_set_fields.rb +1 -1
  91. data/test/test_ssl_server.rb +1 -1
  92. data/test/test_subclass.rb +4 -11
  93. data/test/test_textarea.rb +1 -1
  94. data/test/test_upload.rb +1 -1
  95. data/test/test_verbs.rb +22 -0
  96. metadata +39 -7
data/History.txt CHANGED
@@ -1,5 +1,19 @@
1
1
  = Mechanize CHANGELOG
2
2
 
3
+ === 0.8.0
4
+
5
+ * New Features:
6
+ * Lifecycle hooks. Mechanize#pre_connect_hooks, Mechanize#post_connect_hooks
7
+ * file:/// urls are now supported
8
+ * Added Mechanize::Page#link_with, frame_with for searching for links using
9
+ +criteria+.
10
+ * Implementing PUT, DELETE, and HEAD requests
11
+
12
+ * Bug Fixes:
13
+ * Fixed an infinite loop when content-length and body length don't match.
14
+ * Only setting headers once
15
+ * Adding IIS authentication support
16
+
3
17
  === 0.7.8
4
18
 
5
19
  * Bug Fixes:
data/Manifest.txt CHANGED
@@ -7,17 +7,34 @@ Manifest.txt
7
7
  NOTES.txt
8
8
  README.txt
9
9
  Rakefile
10
- eg/flickr_upload.rb
11
- eg/mech-dump.rb
12
- eg/proxy_req.rb
13
- eg/rubyforge.rb
14
- eg/spider.rb
10
+ examples/flickr_upload.rb
11
+ examples/mech-dump.rb
12
+ examples/proxy_req.rb
13
+ examples/rubyforge.rb
14
+ examples/spider.rb
15
15
  lib/mechanize.rb
16
16
  lib/www/mechanize.rb
17
+ lib/www/mechanize/chain.rb
18
+ lib/www/mechanize/chain/auth_headers.rb
19
+ lib/www/mechanize/chain/body_decoding_handler.rb
20
+ lib/www/mechanize/chain/connection_resolver.rb
21
+ lib/www/mechanize/chain/custom_headers.rb
22
+ lib/www/mechanize/chain/handler.rb
23
+ lib/www/mechanize/chain/header_resolver.rb
24
+ lib/www/mechanize/chain/parameter_resolver.rb
25
+ lib/www/mechanize/chain/post_connect_hook.rb
26
+ lib/www/mechanize/chain/pre_connect_hook.rb
27
+ lib/www/mechanize/chain/request_resolver.rb
28
+ lib/www/mechanize/chain/response_body_parser.rb
29
+ lib/www/mechanize/chain/response_header_handler.rb
30
+ lib/www/mechanize/chain/response_reader.rb
31
+ lib/www/mechanize/chain/ssl_resolver.rb
32
+ lib/www/mechanize/chain/uri_resolver.rb
17
33
  lib/www/mechanize/content_type_error.rb
18
34
  lib/www/mechanize/cookie.rb
19
35
  lib/www/mechanize/cookie_jar.rb
20
36
  lib/www/mechanize/file.rb
37
+ lib/www/mechanize/file_response.rb
21
38
  lib/www/mechanize/file_saver.rb
22
39
  lib/www/mechanize/form.rb
23
40
  lib/www/mechanize/form/button.rb
@@ -43,7 +60,13 @@ lib/www/mechanize/pluggable_parsers.rb
43
60
  lib/www/mechanize/redirect_limit_reached_error.rb
44
61
  lib/www/mechanize/response_code_error.rb
45
62
  lib/www/mechanize/unsupported_scheme_error.rb
63
+ lib/www/mechanize/util.rb
46
64
  mechanize.gemspec
65
+ test/chain/test_argument_validator.rb
66
+ test/chain/test_custom_headers.rb
67
+ test/chain/test_parameter_resolver.rb
68
+ test/chain/test_request_resolver.rb
69
+ test/chain/test_response_reader.rb
47
70
  test/data/htpasswd
48
71
  test/data/server.crt
49
72
  test/data/server.csr
@@ -132,6 +155,7 @@ test/test_referer.rb
132
155
  test/test_relative_links.rb
133
156
  test/test_response_code.rb
134
157
  test/test_save_file.rb
158
+ test/test_scheme.rb
135
159
  test/test_select.rb
136
160
  test/test_select_all.rb
137
161
  test/test_select_none.rb
@@ -141,3 +165,4 @@ test/test_ssl_server.rb
141
165
  test/test_subclass.rb
142
166
  test/test_textarea.rb
143
167
  test/test_upload.rb
168
+ test/test_verbs.rb
data/README.txt CHANGED
@@ -1,6 +1,7 @@
1
1
  = WWW::Mechanize
2
2
 
3
3
  http://mechanize.rubyforge.org/
4
+ http://github.com/tenderlove/mechanize/tree/master
4
5
 
5
6
  == DESCRIPTION
6
7
 
@@ -15,8 +16,6 @@ a history.
15
16
  * ruby 1.8.4
16
17
  * hpricot[http://code.whytheluckystiff.net/hpricot/]
17
18
 
18
- Note that the files in the net-overrides/ directory are taken from Ruby 1.9.0.
19
-
20
19
 
21
20
  == Examples
22
21
 
@@ -25,11 +24,12 @@ Also, check out the EXAMPLES[link://files/EXAMPLES_txt.html] file.
25
24
 
26
25
  == Authors
27
26
 
28
- Original Code:
29
27
  Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
30
28
 
31
- New Code:
32
- Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org)
29
+ Copyright (c) 2006-2008:
30
+
31
+ * {Aaron Patterson}[http://tenderlovemaking.com] (aaronp@rubyforge.org)
32
+ * Mike Dalessio (mike@csa.net)
33
33
 
34
34
  This library comes with a shameless plug for employing me
35
35
  (Aaron[http://tenderlovemaking.com/]) programming
data/Rakefile CHANGED
@@ -33,3 +33,9 @@ namespace :gem do
33
33
  end
34
34
  end
35
35
  end
36
+
37
+ desc "Run code-coverage analysis"
38
+ task :coverage do
39
+ rm_rf "coverage"
40
+ sh "rcov -x Library -I lib:test #{Dir[*HOE.test_globs].join(' ')}"
41
+ end
File without changes
File without changes
File without changes
File without changes
File without changes
data/lib/www/mechanize.rb CHANGED
@@ -9,6 +9,7 @@ require 'fileutils'
9
9
  require 'hpricot'
10
10
  require 'forwardable'
11
11
 
12
+ require 'www/mechanize/util'
12
13
  require 'www/mechanize/content_type_error'
13
14
  require 'www/mechanize/response_code_error'
14
15
  require 'www/mechanize/unsupported_scheme_error'
@@ -19,7 +20,9 @@ require 'www/mechanize/history'
19
20
  require 'www/mechanize/list'
20
21
  require 'www/mechanize/form'
21
22
  require 'www/mechanize/pluggable_parsers'
23
+ require 'www/mechanize/file_response'
22
24
  require 'www/mechanize/inspect'
25
+ require 'www/mechanize/chain'
23
26
  require 'www/mechanize/monkey_patch'
24
27
 
25
28
  module WWW
@@ -43,7 +46,7 @@ module WWW
43
46
  class Mechanize
44
47
  ##
45
48
  # The version of Mechanize you are using.
46
- VERSION = '0.7.8'
49
+ VERSION = '0.8.0'
47
50
 
48
51
  ##
49
52
  # User Agent aliases
@@ -61,7 +64,6 @@ module WWW
61
64
  }
62
65
 
63
66
  attr_accessor :cookie_jar
64
- attr_accessor :log
65
67
  attr_accessor :open_timeout, :read_timeout
66
68
  attr_accessor :user_agent
67
69
  attr_accessor :watch_for_set
@@ -84,10 +86,8 @@ module WWW
84
86
 
85
87
  alias :follow_redirect? :redirect_ok
86
88
 
87
- @@nonce_count = -1
88
- CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
89
89
  @html_parser = Hpricot
90
- class << self; attr_accessor :html_parser end
90
+ class << self; attr_accessor :html_parser, :log end
91
91
 
92
92
  def initialize
93
93
  # attr_accessors
@@ -118,7 +118,6 @@ module WWW
118
118
  @password = nil # Auth Password
119
119
  @digest = nil # DigestAuth Digest
120
120
  @auth_hash = {} # Keep track of urls for sending auth
121
- @digest_response = nil
122
121
 
123
122
  # Proxy settings
124
123
  @proxy_addr = nil
@@ -144,14 +143,29 @@ module WWW
144
143
  @scheme_handlers['http'] = lambda { |link, page| link }
145
144
  @scheme_handlers['https'] = @scheme_handlers['http']
146
145
  @scheme_handlers['relative'] = @scheme_handlers['http']
146
+ @scheme_handlers['file'] = @scheme_handlers['http']
147
+
148
+ @pre_connect_hook = Chain::PreConnectHook.new
149
+ @post_connect_hook = Chain::PostConnectHook.new
147
150
 
148
151
  yield self if block_given?
149
152
  end
150
153
 
151
- def max_history=(length); @history.max_size = length; end
152
- def max_history; @history.max_size; end
154
+ def max_history=(length); @history.max_size = length end
155
+ def max_history; @history.max_size end
156
+ def log=(l); self.class.log = l end
157
+ def log; self.class.log end
158
+
159
+ def pre_connect_hooks
160
+ @pre_connect_hook.hooks
161
+ end
162
+
163
+ def post_connect_hooks
164
+ @post_connect_hook.hooks
165
+ end
153
166
 
154
167
  # Sets the proxy address, port, user, and password
168
+ # +addr+ should be a host, with no "http://"
155
169
  def set_proxy(addr, port, user = nil, pass = nil)
156
170
  @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
157
171
  end
@@ -167,15 +181,12 @@ module WWW
167
181
  @cookie_jar.to_a
168
182
  end
169
183
 
170
- # Sets the user and password to be used for basic authentication.
171
- def basic_auth(user, password)
172
- auth(user, password)
173
- end
174
-
184
+ # Sets the user and password to be used for authentication.
175
185
  def auth(user, password)
176
186
  @user = user
177
187
  @password = password
178
188
  end
189
+ alias :basic_auth :auth
179
190
 
180
191
  # Fetches the URL passed in and returns a page.
181
192
  def get(options, parameters = [], referer = nil)
@@ -202,41 +213,67 @@ module WWW
202
213
  Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
203
214
  Page.new(referer, {'content-type' => 'text/html'})
204
215
  end
205
- abs_uri = to_absolute_uri(url, referer)
206
216
 
207
- if parameters.length > 0
208
- abs_uri.query ||= ''
209
- abs_uri.query << '&' if abs_uri.query.length > 0
210
- abs_uri.query << self.class.build_query_string(parameters)
211
- end
217
+ # fetch the page
218
+ page = fetch_page( :uri => url,
219
+ :referer => referer,
220
+ :headers => headers || {},
221
+ :params => parameters
222
+ )
223
+ add_to_history(page)
224
+ yield page if block_given?
225
+ page
226
+ end
212
227
 
228
+ ####
229
+ # PUT to +url+ with +query_params+, and setting +options+:
230
+ #
231
+ # put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
232
+ #
233
+ def put(url, query_params = {}, options = {})
234
+ options = {
235
+ :uri => url,
236
+ :headers => {},
237
+ :params => query_params,
238
+ :verb => :put
239
+ }.merge(options)
213
240
  # fetch the page
214
- request = fetch_request(abs_uri)
215
- page = fetch_page(:uri => abs_uri, :request => request, :page => referer, :headers => headers)
241
+ page = fetch_page(options)
216
242
  add_to_history(page)
217
243
  yield page if block_given?
218
244
  page
219
245
  end
246
+
247
+ ####
248
+ # DELETE to +url+ with +query_params+, and setting +options+:
249
+ #
250
+ # delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
251
+ #
252
+ def delete(url, query_params = {}, options = {})
253
+ put(url, query_params, options.merge({:verb => :delete}))
254
+ end
255
+
256
+ ####
257
+ # HEAD to +url+ with +query_params+, and setting +options+:
258
+ #
259
+ # head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
260
+ #
261
+ def head(url, query_params = {}, options = {})
262
+ put(url, query_params, options.merge({:verb => :head}))
263
+ end
220
264
 
221
265
  # Fetch a file and return the contents of the file.
222
266
  def get_file(url)
223
267
  get(url).body
224
268
  end
225
269
 
226
-
227
270
  # Clicks the WWW::Mechanize::Link object passed in and returns the
228
271
  # page fetched.
229
272
  def click(link)
230
- referer =
231
- begin
232
- link.page
233
- rescue
234
- nil
235
- end
236
- href = link.respond_to?(:has_attribute?) ?
237
- (link['href'] || link['src']) : link.href
238
- uri = to_absolute_uri(href, referer || current_page())
239
- get(uri, referer)
273
+ referer = link.page rescue referer = nil
274
+ href = link.respond_to?(:href) ? link.href :
275
+ (link['href'] || link['src'])
276
+ get(:url => href, :referer => (referer || current_page()))
240
277
  end
241
278
 
242
279
  # Equivalent to the browser back button. Returns the most recent page
@@ -282,13 +319,14 @@ module WWW
282
319
  # agent.submit(page.forms.first, page.forms.first.buttons.first)
283
320
  def submit(form, button=nil)
284
321
  form.add_button_to_query(button) if button
285
- uri = to_absolute_uri(form.action, form.page)
286
322
  case form.method.upcase
287
323
  when 'POST'
288
- post_form(uri, form)
324
+ post_form(form.action, form)
289
325
  when 'GET'
290
- uri.query = WWW::Mechanize.build_query_string(form.build_query)
291
- get(uri)
326
+ get( :url => form.action.gsub(/\?[^\?]*$/, ''),
327
+ :params => form.build_query,
328
+ :referer => form.page
329
+ )
292
330
  else
293
331
  raise "unsupported method: #{form.method.upcase}"
294
332
  end
@@ -309,7 +347,7 @@ module WWW
309
347
  if url.respond_to? :href
310
348
  url = url.href
311
349
  end
312
- @history.visited_page(to_absolute_uri(url))
350
+ @history.visited_page(resolve(url))
313
351
  end
314
352
 
315
353
  # Runs given block, then resets the page history as it was before. self is
@@ -325,166 +363,14 @@ module WWW
325
363
 
326
364
  alias :page :current_page
327
365
 
328
- class << self
329
- def html_unescape(s)
330
- return s unless s
331
- s.gsub(/&(\w+|#[0-9]+);/) { |match|
332
- number = case match
333
- when /&(\w+);/
334
- Mechanize.html_parser::NamedCharacters[$1]
335
- when /&#([0-9]+);/
336
- $1.to_i
337
- end
338
-
339
- number ? ([number].pack('U') rescue match) : match
340
- }
341
- end
342
- end
343
-
344
- protected
345
- def set_headers(uri, request, options)
346
- unless options.is_a? Hash
347
- cur_page = options
348
- else
349
- raise ArgumentError.new("cur_page must be specified") unless cur_page = options[:page]
350
- headers = options[:headers]
351
- end
352
- if @keep_alive
353
- request.add_field('Connection', 'keep-alive')
354
- request.add_field('Keep-Alive', keep_alive_time.to_s)
355
- else
356
- request.add_field('Connection', 'close')
357
- end
358
- request.add_field('Accept-Encoding', 'gzip,identity')
359
- request.add_field('Accept-Language', 'en-us,en;q=0.5')
360
- request.add_field('Host', uri.host)
361
- request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
362
-
363
- unless @cookie_jar.empty?(uri)
364
- cookies = @cookie_jar.cookies(uri)
365
- cookie = cookies.length > 0 ? cookies.join("; ") : nil
366
- if log
367
- cookies.each do |c|
368
- log.debug("using cookie: #{c}")
369
- end
370
- end
371
- request.add_field('Cookie', cookie)
372
- end
373
-
374
- # Add Referer header to request
375
- unless cur_page.uri.nil?
376
- request.add_field('Referer', cur_page.uri.to_s)
377
- end
378
-
379
- # Add User-Agent header to request
380
- request.add_field('User-Agent', @user_agent) if @user_agent
381
-
382
- # Add If-Modified-Since if page is in history
383
- if @conditional_requests
384
- if( (page = visited_page(uri)) && page.response['Last-Modified'] )
385
- request.add_field('If-Modified-Since', page.response['Last-Modified'])
386
- end
387
- end
388
-
389
- if( @auth_hash[uri.host] )
390
- case @auth_hash[uri.host]
391
- when :basic
392
- request.basic_auth(@user, @password)
393
- when :digest
394
- @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
395
- request.add_field('Authorization', @digest_response) if @digest_response
396
- end
397
- end
398
-
399
- if headers
400
- headers.each do |k,v|
401
- case k
402
- when :etag then request.add_field("ETag", v)
403
- when :if_modified_since then request.add_field("If-Modified-Since", v)
404
- else
405
- raise ArgumentError.new("unknown header symbol #{k}") if k.is_a? Symbol
406
- request.add_field(k,v)
407
- end
408
- end
409
- end
410
-
411
- request
412
- end
413
-
414
- def gen_auth_header(uri, request, auth_header, is_IIS = false)
415
- @@nonce_count += 1
416
-
417
- user = @digest_user
418
- password = @digest_password
419
-
420
- auth_header =~ /^(\w+) (.*)/
421
-
422
- params = {}
423
- $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }
424
-
425
- a_1 = "#{@user}:#{params['realm']}:#{@password}"
426
- a_2 = "#{request.method}:#{uri.path}"
427
- request_digest = ''
428
- request_digest << Digest::MD5.hexdigest(a_1)
429
- request_digest << ':' << params['nonce']
430
- request_digest << ':' << ('%08x' % @@nonce_count)
431
- request_digest << ':' << CNONCE
432
- request_digest << ':' << params['qop']
433
- request_digest << ':' << Digest::MD5.hexdigest(a_2)
434
-
435
- header = ''
436
- header << "Digest username=\"#{@user}\", "
437
- header << "realm=\"#{params['realm']}\", "
438
- if is_IIS then
439
- header << "qop=\"#{params['qop']}\", "
440
- else
441
- header << "qop=#{params['qop']}, "
442
- end
443
- header << "uri=\"#{uri.path}\", "
444
- header << "algorithm=MD5, "
445
- header << "nonce=\"#{params['nonce']}\", "
446
- header << "nc=#{'%08x' % @@nonce_count}, "
447
- header << "cnonce=\"#{CNONCE}\", "
448
- header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
449
-
450
- return header
451
- end
452
-
453
366
  private
454
367
 
455
- def to_absolute_uri(url, cur_page=current_page())
456
- unless url.is_a? URI
457
- url = url.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/) { |match|
458
- sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
459
- }
460
-
461
- url = URI.parse(
462
- Mechanize.html_unescape(
463
- url.split(/(?:%[0-9A-Fa-f]{2})+|#/).zip(
464
- url.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
465
- ).map { |x,y|
466
- "#{URI.escape(x)}#{y}"
467
- }.join('')
468
- )
469
- )
470
- end
471
-
472
- url = @scheme_handlers[url.relative? ? 'relative' : url.scheme.downcase].call(url, cur_page)
473
- url.path = '/' if url.path.length == 0
474
-
475
- # construct an absolute uri
476
- if url.relative?
477
- raise 'no history. please specify an absolute URL' unless cur_page.uri
478
- base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
479
- url = ((base && base.uri && base.uri.absolute?) ?
480
- base.uri :
481
- cur_page.uri) + url
482
- url = cur_page.uri + url
483
- # Strip initial "/.." bits from the path
484
- url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
485
- end
486
-
487
- return url
368
+ def resolve(url, referer = current_page())
369
+ hash = { :uri => url, :referer => referer }
370
+ chain = Chain.new([
371
+ Chain::URIResolver.new(@scheme_handlers)
372
+ ]).handle(hash)
373
+ hash[:uri].to_s
488
374
  end
489
375
 
490
376
  def post_form(url, form)
@@ -493,218 +379,113 @@ module WWW
493
379
 
494
380
  request_data = form.request_data
495
381
 
496
- abs_url = to_absolute_uri(url, cur_page)
497
- request = fetch_request(abs_url, :post)
498
- request.add_field('Content-Type', form.enctype)
499
- request.add_field('Content-Length', request_data.size.to_s)
500
-
501
382
  log.debug("query: #{ request_data.inspect }") if log
502
383
 
503
384
  # fetch the page
504
- page = fetch_page(abs_url, request, cur_page, [request_data])
385
+ page = fetch_page( :uri => url,
386
+ :referer => cur_page,
387
+ :verb => :post,
388
+ :params => [request_data],
389
+ :headers => {
390
+ 'Content-Type' => form.enctype,
391
+ 'Content-Length' => request_data.size.to_s,
392
+ })
505
393
  add_to_history(page)
506
394
  page
507
395
  end
508
396
 
509
- # Creates a new request object based on the scheme and type
510
- def fetch_request(uri, type = :get)
511
- raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)
512
- if type == :get
513
- Net::HTTP::Get.new(uri.request_uri)
514
- else
515
- Net::HTTP::Post.new(uri.request_uri)
516
- end
517
- end
518
-
519
397
  # uri is an absolute URI
520
- def fetch_page(options, request=nil, cur_page=current_page(), request_data=[], redirects = 0)
521
- unless options.is_a? Hash
522
- raise ArgumentError.new("uri must be specified") unless uri = options
523
- raise ArgumentError.new("request must be specified") unless request
524
- else
525
- raise ArgumentError.new("uri must be specified") unless uri = options[:uri]
526
- raise ArgumentError.new("request must be specified") unless request = options[:request]
527
- cur_page = options[:page] || current_page()
528
- request_data = options[:request_data] || []
529
- headers = options[:headers]
530
- end
531
- raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)
532
-
533
- log.info("#{ request.class }: #{ request.path }") if log
534
-
535
- page = nil
536
-
537
- cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
538
- :connection => nil,
539
- :keep_alive_options => {},
540
- })
541
- http_obj = cache_obj[:connection]
542
- if http_obj.nil? || ! http_obj.started?
543
- http_obj = cache_obj[:connection] =
544
- Net::HTTP.new( uri.host,
545
- uri.port,
546
- @proxy_addr,
547
- @proxy_port,
548
- @proxy_user,
549
- @proxy_pass
550
- )
551
- cache_obj[:keep_alive_options] = {}
552
-
553
- # Specify timeouts if given
554
- http_obj.open_timeout = @open_timeout if @open_timeout
555
- http_obj.read_timeout = @read_timeout if @read_timeout
556
- end
557
-
558
- if uri.scheme == 'https' && ! http_obj.started?
559
- http_obj.use_ssl = true
560
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
561
- if @ca_file
562
- http_obj.ca_file = @ca_file
563
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
564
- http_obj.verify_callback = @verify_callback if @verify_callback
565
- end
566
- if @cert && @key
567
- http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
568
- http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
569
- end
570
- end
571
-
572
- # If we're keeping connections alive and the last request time is too
573
- # long ago, stop the connection. Or, if the max requests left is 1,
574
- # reset the connection.
575
- if @keep_alive && http_obj.started?
576
- opts = cache_obj[:keep_alive_options]
577
- if((opts[:timeout] &&
578
- Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
579
- opts[:max] && opts[:max].to_i == 1)
580
-
581
- log.debug('Finishing stale connection') if log
582
- http_obj.finish
583
-
584
- end
585
- end
586
-
398
+ def fetch_page(params)
399
+ options = {
400
+ :request => nil,
401
+ :response => nil,
402
+ :connection => nil,
403
+ :referer => current_page(),
404
+ :uri => nil,
405
+ :verb => :get,
406
+ :agent => self,
407
+ :redirects => 0,
408
+ :params => [],
409
+ :headers => {},
410
+ }.merge(params)
411
+
412
+ before_connect = Chain.new([
413
+ Chain::URIResolver.new(@scheme_handlers),
414
+ Chain::ParameterResolver.new,
415
+ Chain::RequestResolver.new,
416
+ Chain::ConnectionResolver.new(
417
+ @connection_cache,
418
+ @keep_alive,
419
+ @proxy_addr,
420
+ @proxy_port,
421
+ @proxy_user,
422
+ @proxy_pass
423
+ ),
424
+ Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
425
+ Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
426
+ Chain::HeaderResolver.new( @keep_alive,
427
+ @keep_alive_time,
428
+ @cookie_jar,
429
+ @user_agent),
430
+ Chain::CustomHeaders.new,
431
+ @pre_connect_hook,
432
+ ])
433
+ before_connect.handle(options)
434
+
435
+ uri = options[:uri]
436
+ request = options[:request]
437
+ cur_page = options[:referer]
438
+ request_data = options[:params]
439
+ redirects = options[:redirects]
440
+ http_obj = options[:connection]
441
+
442
+ # Add If-Modified-Since if page is in history
443
+ if( (page = visited_page(uri)) && cur_page.response['Last-Modified'] )
444
+ request['If-Modified-Since'] = cur_page.response['Last-Modified']
445
+ end if(@conditional_requests)
446
+
447
+ # Specify timeouts if given
448
+ http_obj.open_timeout = @open_timeout if @open_timeout
449
+ http_obj.read_timeout = @read_timeout if @read_timeout
587
450
  http_obj.start unless http_obj.started?
588
-
589
- if headers
590
- request = set_headers(uri, request, {:page => cur_page, :headers => headers})
591
- else
592
- request = set_headers(uri, request, cur_page)
593
- end
594
-
451
+
595
452
  # Log specified headers for the request
596
- if log
597
- request.each_header do |k, v|
598
- log.debug("request-header: #{ k } => #{ v }")
599
- end
600
- end
601
-
602
- cache_obj[:last_request_time] = Time.now.to_i
603
-
453
+ log.info("#{ request.class }: #{ request.path }") if log
454
+ request.each_header do |k, v|
455
+ log.debug("request-header: #{ k } => #{ v }")
456
+ end if log
457
+
604
458
  # Send the request
459
+ attempts = 0
605
460
  begin
606
- res_klass = nil
607
- response = http_obj.request(request, *request_data) {|response|
608
-
609
- body = StringIO.new
610
- total = 0
611
- response.read_body { |part|
612
- total += part.length
613
- body.write(part)
614
- log.debug("Read #{total} bytes") if log
615
- }
616
-
617
- res_klass = Net::HTTPResponse::CODE_TO_OBJ[response.code.to_s]
618
-
619
- # Net::HTTP ignores EOFError if Content-length is given, so we emulate it here.
620
- unless res_klass <= Net::HTTPRedirection
621
- raise EOFError if response.content_length() && response.content_length() != total
622
- end
623
- body.rewind
624
-
625
- response.each_header { |k,v|
626
- log.debug("response-header: #{ k } => #{ v }")
627
- } if log
628
-
629
- content_type = nil
630
- unless response['Content-Type'].nil?
631
- data = response['Content-Type'].match(/^([^;]*)/)
632
- content_type = data[1].downcase unless data.nil?
633
- end
634
-
635
- response_body =
636
- if encoding = response['Content-Encoding']
637
- case encoding.downcase
638
- when 'gzip'
639
- log.debug('gunzip body') if log
640
- if response['Content-Length'].to_i > 0 || body.length > 0
641
- begin
642
- Zlib::GzipReader.new(body).read
643
- rescue Zlib::BufError, Zlib::GzipFile::Error
644
- log.error('Caught a Zlib::BufError') if log
645
- body.rewind
646
- body.read(10)
647
- Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
648
- end
649
- else
650
- ''
651
- end
652
- when 'x-gzip'
653
- body.read
654
- else
655
- raise 'Unsupported content encoding'
656
- end
657
- else
658
- body.read
659
- end
660
-
661
- # Find our pluggable parser
662
- page = @pluggable_parser.parser(content_type).new(
663
- uri,
664
- response,
665
- response_body,
666
- response.code
667
- ) { |parser|
668
- parser.mech = self if parser.respond_to? :mech=
669
- if parser.respond_to?(:watch_for_set=) && @watch_for_set
670
- parser.watch_for_set = @watch_for_set
671
- end
672
- }
673
-
461
+ response = http_obj.request(request, *request_data) { |response|
462
+ connection_chain = Chain.new([
463
+ Chain::ResponseReader.new(response),
464
+ Chain::BodyDecodingHandler.new,
465
+ ])
466
+ connection_chain.handle(options)
674
467
  }
675
- rescue EOFError, Errno::ECONNRESET, Errno::EPIPE
468
+ rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
676
469
  log.error("Rescuing EOF error") if log
677
470
  http_obj.finish
471
+ raise x if attempts >= 2
678
472
  request.body = nil
679
473
  http_obj.start
474
+ attempts += 1
680
475
  retry
681
476
  end
682
-
683
- # If the server sends back keep alive options, save them
684
- if keep_alive_info = response['keep-alive']
685
- keep_alive_info.split(/,\s*/).each do |option|
686
- k, v = option.split(/=/)
687
- cache_obj[:keep_alive_options] ||= {}
688
- cache_obj[:keep_alive_options][k.intern] = v
689
- end
690
- end
691
-
692
- if page.is_a?(Page) && page.body =~ /Set-Cookie/
693
- page.search('//meta[@http-equiv="Set-Cookie"]').each do |meta|
694
- Cookie::parse(uri, meta['content'], log) { |c|
695
- log.debug("saved cookie: #{c}") if log
696
- @cookie_jar.add(uri, c)
697
- }
698
- end
699
- end
700
477
 
701
- (response.get_fields('Set-Cookie')||[]).each do |cookie|
702
- Cookie::parse(uri, cookie, log) { |c|
703
- log.debug("saved cookie: #{c}") if log
704
- @cookie_jar.add(uri, c)
705
- }
706
- end
707
-
478
+ after_connect = Chain.new([
479
+ @post_connect_hook,
480
+ Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
481
+ Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
482
+ ])
483
+ after_connect.handle(options)
484
+
485
+ res_klass = options[:res_klass]
486
+ response_body = options[:response_body]
487
+ page = options[:page]
488
+
708
489
  log.info("status: #{ page.code }") if log
709
490
 
710
491
  if follow_meta_refresh && page.respond_to?(:meta) &&
@@ -721,9 +502,12 @@ module WWW
721
502
  return page unless follow_redirect?
722
503
  log.info("follow redirect to: #{ response['Location'] }") if log
723
504
  from_uri = page.uri
724
- abs_uri = to_absolute_uri(response['Location'].to_s, page)
725
505
  raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
726
- page = fetch_page(abs_uri, fetch_request(abs_uri), page, [], redirects + 1)
506
+ page = fetch_page( :uri => response['Location'].to_s,
507
+ :referer => page,
508
+ :params => [],
509
+ :redirects => redirects + 1
510
+ )
727
511
  @history.push(page, from_uri)
728
512
  return page
729
513
  elsif res_klass <= Net::HTTPUnauthorized
@@ -731,31 +515,26 @@ module WWW
731
515
  raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
732
516
  if response['www-authenticate'] =~ /Digest/i
733
517
  @auth_hash[uri.host] = :digest
518
+ if response['server'] =~ /Microsoft-IIS/
519
+ @auth_hash[uri.host] = :iis_digest
520
+ end
734
521
  @digest = response['www-authenticate']
735
522
  else
736
523
  @auth_hash[uri.host] = :basic
737
524
  end
738
- # Copy the request headers for the second attempt
739
- req = fetch_request(uri, request.method.downcase.to_sym)
740
- request.each_header do |k,v|
741
- req[k] = v
742
- end
743
- return fetch_page(uri, req, cur_page, request_data)
525
+ return fetch_page( :uri => uri,
526
+ :referer => cur_page,
527
+ :verb => request.method.downcase.to_sym,
528
+ :params => request_data,
529
+ :headers => request.to_hash
530
+ )
744
531
  end
745
532
 
746
533
  raise ResponseCodeError.new(page), "Unhandled response", caller
747
534
  end
748
535
 
749
- def self.build_query_string(parameters)
750
- parameters.map { |k,v|
751
- k &&
752
- [WEBrick::HTTPUtils.escape_form(k.to_s),
753
- WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
754
- }.compact.join('&')
755
- end
756
-
757
536
  def add_to_history(page)
758
- @history.push(page, to_absolute_uri(page.uri))
537
+ @history.push(page, resolve(page.uri))
759
538
  history_added.call(page) if history_added
760
539
  end
761
540
  end