mechanize 0.6.11 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (91) hide show
  1. data/CHANGELOG.txt +8 -0
  2. data/Manifest.txt +31 -22
  3. data/lib/mechanize.rb +2 -652
  4. data/lib/www/mechanize.rb +635 -0
  5. data/lib/www/mechanize/content_type_error.rb +16 -0
  6. data/lib/www/mechanize/cookie.rb +64 -0
  7. data/lib/{mechanize/cookie.rb → www/mechanize/cookie_jar.rb} +0 -60
  8. data/lib/www/mechanize/file.rb +73 -0
  9. data/lib/www/mechanize/file_saver.rb +39 -0
  10. data/lib/{mechanize → www/mechanize}/form.rb +119 -137
  11. data/lib/www/mechanize/form/button.rb +8 -0
  12. data/lib/www/mechanize/form/check_box.rb +13 -0
  13. data/lib/www/mechanize/form/field.rb +28 -0
  14. data/lib/www/mechanize/form/file_upload.rb +24 -0
  15. data/lib/www/mechanize/form/image_button.rb +23 -0
  16. data/lib/www/mechanize/form/multi_select_list.rb +69 -0
  17. data/lib/www/mechanize/form/option.rb +51 -0
  18. data/lib/www/mechanize/form/radio_button.rb +38 -0
  19. data/lib/www/mechanize/form/select_list.rb +41 -0
  20. data/lib/www/mechanize/headers.rb +12 -0
  21. data/lib/{mechanize → www/mechanize}/history.rb +0 -0
  22. data/lib/{mechanize → www/mechanize}/inspect.rb +21 -28
  23. data/lib/{mechanize → www/mechanize}/list.rb +0 -0
  24. data/lib/{mechanize → www/mechanize}/monkey_patch.rb +19 -0
  25. data/lib/www/mechanize/page.rb +121 -0
  26. data/lib/www/mechanize/page/base.rb +10 -0
  27. data/lib/www/mechanize/page/frame.rb +22 -0
  28. data/lib/www/mechanize/page/link.rb +50 -0
  29. data/lib/www/mechanize/page/meta.rb +10 -0
  30. data/lib/www/mechanize/pluggable_parsers.rb +93 -0
  31. data/lib/{mechanize/errors.rb → www/mechanize/response_code_error.rb} +1 -13
  32. data/test/{test_includes.rb → helper.rb} +4 -18
  33. data/test/{test_servlets.rb → servlets.rb} +0 -0
  34. data/test/tc_authenticate.rb +1 -8
  35. data/test/tc_bad_links.rb +3 -10
  36. data/test/tc_blank_form.rb +1 -8
  37. data/test/tc_checkboxes.rb +1 -8
  38. data/test/tc_cookie_class.rb +1 -6
  39. data/test/tc_cookie_jar.rb +1 -7
  40. data/test/tc_cookies.rb +10 -17
  41. data/test/tc_encoded_links.rb +5 -12
  42. data/test/tc_errors.rb +4 -11
  43. data/test/tc_follow_meta.rb +1 -8
  44. data/test/tc_form_action.rb +6 -14
  45. data/test/tc_form_as_hash.rb +1 -9
  46. data/test/tc_form_button.rb +5 -8
  47. data/test/tc_form_no_inputname.rb +1 -8
  48. data/test/tc_forms.rb +16 -24
  49. data/test/tc_frames.rb +3 -10
  50. data/test/tc_gzipping.rb +2 -9
  51. data/test/tc_history.rb +5 -12
  52. data/test/tc_html_unscape_forms.rb +8 -15
  53. data/test/tc_if_modified_since.rb +1 -6
  54. data/test/tc_keep_alive.rb +1 -8
  55. data/test/tc_links.rb +12 -19
  56. data/test/tc_mech.rb +26 -34
  57. data/test/{test_mechanize_file.rb → tc_mechanize_file.rb} +1 -6
  58. data/test/tc_multi_select.rb +10 -17
  59. data/test/tc_no_attributes.rb +1 -8
  60. data/test/tc_page.rb +3 -10
  61. data/test/tc_pluggable_parser.rb +8 -15
  62. data/test/tc_post_form.rb +3 -10
  63. data/test/tc_pretty_print.rb +3 -10
  64. data/test/tc_radiobutton.rb +2 -9
  65. data/test/tc_referer.rb +13 -20
  66. data/test/tc_relative_links.rb +1 -8
  67. data/test/tc_response_code.rb +14 -21
  68. data/test/tc_save_file.rb +1 -9
  69. data/test/tc_select.rb +3 -10
  70. data/test/tc_select_all.rb +2 -10
  71. data/test/tc_select_none.rb +2 -10
  72. data/test/tc_select_noopts.rb +2 -9
  73. data/test/tc_set_fields.rb +2 -9
  74. data/test/tc_ssl_server.rb +5 -12
  75. data/test/tc_subclass.rb +2 -9
  76. data/test/tc_textarea.rb +2 -9
  77. data/test/tc_upload.rb +2 -9
  78. data/test/test_all.rb +4 -43
  79. metadata +96 -80
  80. data/lib/mechanize/form_elements.rb +0 -254
  81. data/lib/mechanize/net-overrides/net/http.rb +0 -2107
  82. data/lib/mechanize/net-overrides/net/https.rb +0 -172
  83. data/lib/mechanize/net-overrides/net/protocol.rb +0 -380
  84. data/lib/mechanize/page.rb +0 -138
  85. data/lib/mechanize/page_elements.rb +0 -77
  86. data/lib/mechanize/parsers/rexml_page.rb +0 -35
  87. data/lib/mechanize/pluggable_parsers.rb +0 -204
  88. data/lib/mechanize/rexml.rb +0 -236
  89. data/setup.rb +0 -1585
  90. data/test/tc_proxy.rb +0 -25
  91. data/test/tc_watches.rb +0 -32
@@ -0,0 +1,635 @@
1
+ require 'net/http'
2
+ require 'net/https'
3
+ require 'uri'
4
+ require 'webrick/httputils'
5
+ require 'zlib'
6
+ require 'stringio'
7
+ require 'digest/md5'
8
+
9
+ require 'www/mechanize/content_type_error'
10
+ require 'www/mechanize/response_code_error'
11
+ require 'www/mechanize/cookie'
12
+ require 'www/mechanize/cookie_jar'
13
+ require 'www/mechanize/history'
14
+ require 'www/mechanize/list'
15
+ require 'www/mechanize/form'
16
+ require 'www/mechanize/pluggable_parsers'
17
+ require 'www/mechanize/inspect'
18
+ require 'www/mechanize/monkey_patch'
19
+
20
+ module WWW
21
+ # = Synopsis
22
+ # The Mechanize library is used for automating interaction with a website. It
23
+ # can follow links, and submit forms. Form fields can be populated and
24
+ # submitted. A history of URLs is maintained and can be queried.
25
+ #
26
+ # == Example
27
+ # require 'rubygems'
28
+ # require 'mechanize'
29
+ # require 'logger'
30
+ #
31
+ # agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
32
+ # agent.user_agent_alias = 'Mac Safari'
33
+ # page = agent.get("http://www.google.com/")
34
+ # search_form = page.forms.name("f").first
35
+ # search_form.fields.name("q").value = "Hello"
36
+ # search_results = agent.submit(search_form)
37
+ # puts search_results.body
38
+ class Mechanize
39
+ ##
40
+ # The version of Mechanize you are using.
41
+ VERSION = '0.7.0'
42
+
43
+ ##
44
+ # User Agent aliases
45
+ AGENT_ALIASES = {
46
+ 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
47
+ 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
48
+ 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
49
+ 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
50
+ 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
51
+ 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
52
+ 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
53
+ 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
54
+ 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
55
+ }
56
+
57
+ attr_accessor :cookie_jar
58
+ attr_accessor :log
59
+ attr_accessor :open_timeout, :read_timeout
60
+ attr_accessor :user_agent
61
+ attr_accessor :watch_for_set
62
+ attr_accessor :ca_file
63
+ attr_accessor :key
64
+ attr_accessor :cert
65
+ attr_accessor :pass
66
+ attr_accessor :redirect_ok
67
+ attr_accessor :keep_alive_time
68
+ attr_accessor :keep_alive
69
+ attr_accessor :conditional_requests
70
+ attr_accessor :follow_meta_refresh
71
+ attr_accessor :verify_callback
72
+
73
+ attr_reader :history
74
+ attr_reader :pluggable_parser
75
+
76
+ alias :follow_redirect? :redirect_ok
77
+
78
+ @@nonce_count = -1
79
+ CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
80
+
81
# Builds an agent with default settings. If a block is given, the freshly
# configured agent is yielded to it last, so the block can override any
# of the defaults set here.
def initialize
  # Settings exposed through attr_accessor
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = @read_timeout = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil

  # OpenSSL configuration. @verify_callback is handed to the HTTP
  # connection when a CA file is configured; it can be used for debugging
  # or to ignore verification errors by always returning _true_.
  @ca_file         = nil   # server certificate file
  @verify_callback = nil
  @cert            = nil   # client certificate
  @key             = nil   # client private key
  @pass            = nil   # password for the private key
  @redirect_ok     = true  # follow redirects?

  # Values exposed through attr_reader
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication state
  @user            = nil   # auth user
  @password        = nil   # auth password
  @digest          = nil   # last digest challenge header
  @auth_hash       = {}    # host => auth scheme to send
  @digest_response = nil

  # Proxy settings (see set_proxy)
  @proxy_addr = @proxy_pass = @proxy_port = @proxy_user = nil

  @conditional_requests = true
  @follow_meta_refresh  = false

  # Connection cache and keep-alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  yield self if block_given?
end
128
+
129
# Sets the maximum size of the page history to +length+.
def max_history=(length)
  @history.max_size = length
end
130
# Returns the maximum size of the page history.
def max_history
  @history.max_size
end
131
+
132
+ # Sets the proxy address, port, user, and password
133
# Stores the proxy host, port and optional credentials. These values are
# passed to Net::HTTP.new when a connection is opened. Returns the four
# values as an array.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
136
+
137
+ # Set the user agent for the Mechanize object.
138
+ # See AGENT_ALIASES
139
# Looks +al+ up in AGENT_ALIASES and installs the matching User-Agent
# string. Raises a RuntimeError when the alias is not known.
def user_agent_alias=(al)
  agent_string = AGENT_ALIASES[al]
  raise("unknown agent alias") unless agent_string
  self.user_agent = agent_string
end
142
+
143
+ # Returns a list of cookies stored in the cookie jar.
144
# Returns the contents of the cookie jar converted with +to_a+.
def cookies
  jar = @cookie_jar
  jar.to_a
end
147
+
148
+ # Sets the user and password to be used for basic authentication.
149
# Convenience wrapper: forwards the credentials unchanged to #auth.
def basic_auth(user, password)
  auth(user, password)
end
152
+
153
# Remembers the credentials to use when a server asks for authentication.
# Returns +password+ (the value of the last assignment).
def auth(user, password)
  @user = user
  @password = password
end
157
+
158
+ # Fetches the URL passed in and returns a page.
159
# +url+ is resolved against +referer+ when given, otherwise against the
# current page; with no history at all an empty placeholder Page is used.
# The fetched page is pushed onto the history and returned.
def get(url, referer=nil, &block)
  origin = referer || current_page
  origin ||= Page.new( nil, {'content-type'=>'text/html'})

  target  = to_absolute_uri(url, origin)
  fetched = fetch_page(target, fetch_request(target), origin, &block)
  add_to_history(fetched)
  fetched
end
170
+
171
+ # Fetch a file and return the contents of the file.
172
# Fetches +url+ with #get and returns the +body+ of the resulting page.
def get_file(url)
  fetched = get(url)
  fetched.body
end
175
+
176
+
177
+ # Clicks the WWW::Mechanize::Link object passed in and returns the
178
+ # page fetched.
179
# The link target is taken from the 'href' attribute, then the 'src'
# attribute, then link.href. It is resolved against the page the link
# belongs to, or the current page when that page cannot be determined.
def click(link)
  # Any failure while asking the link for its page is treated as
  # "no referer".
  referer = begin
    link.page
  rescue
    nil
  end

  target = link.attributes['href'] || link.attributes['src'] || link.href
  uri = to_absolute_uri(target, referer || current_page())
  get(uri, referer)
end
192
+
193
+ # Equivalent to the browser back button. Returns the most recent page
194
+ # visited.
195
# Pops the newest entry off the history and returns it.
def back
  history = @history
  history.pop
end
198
+
199
+ # Posts to the given URL with the query parameters passed in. Query
200
+ # parameters can be passed as a hash, or as an array of arrays.
201
+ # Example:
202
+ # agent.post('http://example.com/', "foo" => "bar")
203
+ # or
204
+ # agent.post('http://example.com/', [ ["foo", "bar"] ])
205
# Builds a synthetic url-encoded POST form, adds one field per
# name/value pair in +query+, and submits it to +url+ via post_form.
def post(url, query={})
  node = Hpricot::Elem.new(Hpricot::STag.new('form'))
  node['method'] = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each do |name, value|
    form.fields << Form::Field.new(name, value)
  end

  post_form(url, form)
end
216
+
217
+ # Submit a form with an optional button.
218
+ # Without a button:
219
+ # page = agent.get('http://example.com')
220
+ # agent.submit(page.forms.first)
221
+ # With a button
222
+ # agent.submit(page.forms.first, page.forms.first.buttons.first)
223
# Dispatches on the form's HTTP method: POST forms go through post_form,
# GET forms have their fields encoded into the query string and are
# fetched with #get. Any other method raises a RuntimeError.
def submit(form, button=nil)
  form.add_button_to_query(button) if button
  uri = to_absolute_uri(form.action, form.page)

  verb = form.method.upcase
  if verb == 'POST'
    post_form(uri, form)
  elsif verb == 'GET'
    uri.query = WWW::Mechanize.build_query_string(form.build_query)
    get(uri)
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
236
+
237
+ # Returns the current page loaded by Mechanize
238
# Returns the last entry of the history.
def current_page
  history = @history
  history.last
end
241
+
242
+ # Returns whether or not a url has been visited
243
# True when visited_page finds a history entry for +url+, false otherwise.
def visited?(url)
  visited_page(url).nil? ? false : true
end
246
+
247
+ # Returns a visited page for the url passed in, otherwise nil
248
# +url+ may be anything to_absolute_uri accepts, or an object responding
# to +href+ (in which case its href is used). The lookup itself is
# delegated to the history.
def visited_page(url)
  url = url.href if url.respond_to? :href
  absolute = to_absolute_uri(url)
  @history.visited_page(absolute)
end
254
+
255
+ # Runs given block, then resets the page history as it was before. self is
256
+ # given as a parameter to the block. Returns the value of the block.
257
# A copy of the history is taken before yielding; it is put back in place
# afterwards even when the block raises.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
265
+
266
+ alias :page :current_page
267
+
268
class << self
  # Replaces HTML character references in +s+ with the characters they
  # stand for and returns the new string; +s+ itself is not modified.
  #
  # Handles named entities found in Hpricot::NamedCharacters (&amp;amp;),
  # decimal references (&amp;#65;) and hexadecimal references (&amp;#x41;).
  # References that are unknown, or whose code point cannot be packed
  # as UTF-8, are left in the output untouched.
  #
  # Returns +s+ unchanged when it is nil or false.
  def html_unescape(s)
    return s unless s
    s.gsub(/&(\w+|#[0-9]+|#[xX][0-9a-fA-F]+);/) { |match|
      # Capture the reference now: the case below runs further regex
      # matches, which overwrite $1.
      entity = $1
      number = case entity
               when /\A#[xX]([0-9a-fA-F]+)\z/
                 $1.to_i(16)
               when /\A#([0-9]+)\z/
                 $1.to_i
               else
                 Hpricot::NamedCharacters[entity]
               end

      if number
        begin
          [number].pack('U')
        rescue StandardError
          # Code point out of range for UTF-8: keep the original text.
          match
        end
      else
        match
      end
    }
  end
end
283
+
284
+ protected
285
# Adds the standard request headers to +request+ and returns it.
#
# uri::      absolute URI being requested; used to select cookies, the
#            cached page for conditional requests, and the auth scheme.
# request::  the Net::HTTP request object to decorate.
# cur_page:: page the request originates from; its uri (when not nil)
#            becomes the Referer header.
def set_headers(uri, request, cur_page)
  if @keep_alive
    request.add_field('Connection', 'keep-alive')
    request.add_field('Keep-Alive', keep_alive_time.to_s)
  else
    request.add_field('Connection', 'close')
  end
  request.add_field('Accept-Encoding', 'gzip,identity')
  request.add_field('Accept-Language', 'en-us,en;q=0.5')
  request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')

  unless @cookie_jar.empty?(uri)
    jar_cookies = @cookie_jar.cookies(uri)
    cookie = jar_cookies.length > 0 ? jar_cookies.join("; ") : nil
    if log
      jar_cookies.each do |c|
        log.debug("using cookie: #{c}")
      end
    end
    # Only send a Cookie header when there is something to send; the jar
    # may report non-empty yet return no cookies for this URI.
    request.add_field('Cookie', cookie) if cookie
  end

  # Add Referer header to request
  unless cur_page.uri.nil?
    request.add_field('Referer', cur_page.uri.to_s)
  end

  # Add User-Agent header to request
  request.add_field('User-Agent', @user_agent) if @user_agent

  # Add If-Modified-Since if page is in history
  if @conditional_requests
    if( (page = visited_page(uri)) && page.response['Last-Modified'] )
      request.add_field('If-Modified-Since', page.response['Last-Modified'])
    end
  end

  # Send credentials only to hosts recorded in @auth_hash
  case @auth_hash[uri.host]
  when :basic
    request.basic_auth(@user, @password)
  when :digest
    @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
    request.add_field('Authorization', @digest_response) if @digest_response
  end

  request
end
334
+
335
# Builds the value of an Authorization header answering an HTTP Digest
# challenge, using @user / @password as credentials.
#
# uri::         URI being requested; its path is used as the digest uri.
# request::     the request object; its HTTP method is part of the digest.
# auth_header:: the challenge, e.g. 'Digest realm="x", nonce="y", qop="auth"'.
#               Only quoted key="value" parameters are read.
# is_IIS::      when true the qop value is emitted quoted.
#
# Increments the class-wide nonce counter on every call. Raises a
# RuntimeError when +auth_header+ is not of the form "<scheme> <params>".
def gen_auth_header(uri, request, auth_header, is_IIS = false)
  @@nonce_count += 1

  challenge = auth_header.to_s.match(/^(\w+) (.*)/)
  unless challenge
    raise "malformed authentication challenge: #{auth_header.inspect}"
  end

  params = {}
  challenge[2].scan(/(\w+)="(.*?)"/) { |name, value| params[name] = value }

  nonce_count = '%08x' % @@nonce_count

  a_1 = "#{@user}:#{params['realm']}:#{@password}"
  a_2 = "#{request.method}:#{uri.path}"
  request_digest = ''
  request_digest << Digest::MD5.hexdigest(a_1)
  request_digest << ':' << params['nonce']
  request_digest << ':' << nonce_count
  request_digest << ':' << CNONCE
  request_digest << ':' << params['qop']
  request_digest << ':' << Digest::MD5.hexdigest(a_2)

  header = ''
  header << "Digest username=\"#{@user}\", "
  header << "realm=\"#{params['realm']}\", "
  if is_IIS then
    header << "qop=\"#{params['qop']}\", "
  else
    header << "qop=#{params['qop']}, "
  end
  header << "uri=\"#{uri.path}\", "
  header << "algorithm=MD5, "
  header << "nonce=\"#{params['nonce']}\", "
  header << "nc=#{nonce_count}, "
  header << "cnonce=\"#{CNONCE}\", "
  header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""

  return header
end
373
+
374
+ private
375
+
376
# Converts +url+ into a URI object and makes it absolute.
#
# url::      a URI, or anything whose +to_s+ is a URL string. Strings are
#            stripped, characters outside the accepted range are
#            percent-encoded, and HTML entities are unescaped before
#            parsing.
# cur_page:: page used to resolve relative URLs. When it responds to
#            +bases+ and its last base has an absolute uri, that uri is
#            the base; otherwise cur_page.uri is.
#
# Raises a RuntimeError when +url+ is relative and cur_page has no uri.
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
      sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
    }

    # Escape every segment between existing %XX sequences and '#'
    # characters, leaving those separators themselves untouched.
    url = URI.parse(
            Mechanize.html_unescape(
              url.split(/%[0-9A-Fa-f]{2}|#/).zip(
                url.scan(/%[0-9A-Fa-f]{2}|#/)
              ).map { |x,y|
                "#{URI.escape(x)}#{y}"
              }.join('')
            )
          )
  end

  # NOTE(review): this also turns a query-only or fragment-only reference
  # ("?a=1", "#top") into "/?a=1" / "/#top" before it is resolved --
  # confirm that is intended.
  url.path = '/' if url.path.length == 0

  # construct an absolute uri
  if url.relative?
    raise 'no history. please specify an absolute URL' unless cur_page.uri
    base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
    base_uri = (base && base.uri && base.uri.absolute?) ? base.uri : cur_page.uri
    url = base_uri + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  return url
end
409
+
410
# Sends +form+ to +url+ as a POST request, records the resulting page in
# the history and returns it. The referring page is the form's own page,
# else the current page, else an empty placeholder Page.
def post_form(url, form)
  origin = form.page || current_page
  origin ||= Page.new( nil, {'content-type'=>'text/html'})

  payload = form.request_data

  target  = to_absolute_uri(url, origin)
  request = fetch_request(target, :post)
  request.add_field('Content-Type', form.enctype)
  request.add_field('Content-Length', payload.size.to_s)

  log.debug("query: #{ payload.inspect }") if log

  # fetch the page
  result = fetch_page(target, request, origin, [payload])
  add_to_history(result)
  result
end
428
+
429
+ # Creates a new request object based on the scheme and type
430
# Returns a Net::HTTP::Get for type :get and a Net::HTTP::Post for any
# other type, built from the request URI of +uri+. Raises a RuntimeError
# for schemes other than http and https.
def fetch_request(uri, type = :get)
  raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

  request_class = (type == :get) ? Net::HTTP::Get : Net::HTTP::Post
  request_class.new(uri.request_uri)
end
438
+
439
+ # uri is an absolute URI
440
# Performs +request+ against +uri+ and returns the resulting page.
#
# uri::          absolute http or https URI.
# request::      Net::HTTP request object; headers are added by set_headers.
# cur_page::     referring page, passed through to set_headers.
# request_data:: extra arguments for Net::HTTP#request (the POST body).
#
# Connections are cached per "host:port" and reused while they are still
# started. After the response is read this method stores keep-alive
# options and cookies, then acts on the status: success returns the page,
# 304 returns the page from history, redirects are followed when
# follow_redirect? is true, and a 401 is retried once per host with the
# configured credentials. Everything else raises ResponseCodeError,
# including status codes Net::HTTP has no class for.
def fetch_page(uri, request, cur_page=current_page(), request_data=[])
  raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

  log.info("#{ request.class }: #{ request.path }") if log

  page = nil

  cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
    :connection         => nil,
    :keep_alive_options => {},
  })
  http_obj = cache_obj[:connection]
  if http_obj.nil? || ! http_obj.started?
    http_obj = cache_obj[:connection] =
      Net::HTTP.new( uri.host,
                     uri.port,
                     @proxy_addr,
                     @proxy_port,
                     @proxy_user,
                     @proxy_pass
                   )
    cache_obj[:keep_alive_options] = {}

    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
  end

  # SSL options can only be set before the connection is started.
  if uri.scheme == 'https' && ! http_obj.started?
    http_obj.use_ssl = true
    http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
    if @ca_file
      http_obj.ca_file = @ca_file
      http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http_obj.verify_callback = @verify_callback if @verify_callback
    end
    if @cert && @key
      http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
      http_obj.key  = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
    end
  end

  # If we're keeping connections alive and the last request time is too
  # long ago, stop the connection. Or, if the max requests left is 1,
  # reset the connection.
  if @keep_alive && http_obj.started?
    opts = cache_obj[:keep_alive_options]
    if((opts[:timeout] &&
        Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
       opts[:max] && opts[:max].to_i == 1)

      log.debug('Finishing stale connection') if log
      http_obj.finish

    end
  end

  http_obj.start unless http_obj.started?

  request = set_headers(uri, request, cur_page)

  # Log specified headers for the request
  if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end
  end

  cache_obj[:last_request_time] = Time.now.to_i

  # Send the request. The block reads the body as it arrives and builds
  # the page; +page+ is assigned from inside the block.
  response = http_obj.request(request, *request_data) { |res|

    body = StringIO.new
    total = 0
    res.read_body { |part|
      total += part.length
      body.write(part)
      log.debug("Read #{total} bytes") if log
    }
    body.rewind

    res.each_header { |k,v|
      log.debug("response-header: #{ k } => #{ v }")
    } if log

    content_type = nil
    unless res['Content-Type'].nil?
      data = res['Content-Type'].match(/^([^;]*)/)
      content_type = data[1].downcase unless data.nil?
    end

    response_body =
      if encoding = res['Content-Encoding']
        case encoding.downcase
        when 'gzip'
          log.debug('gunzip body') if log
          Zlib::GzipReader.new(body).read
        when 'x-gzip'
          body.read
        else
          raise 'Unsupported content encoding'
        end
      else
        body.read
      end

    # Find our pluggable parser
    page = @pluggable_parser.parser(content_type).new(
      uri,
      res,
      response_body,
      res.code
    ) { |parser|
      parser.mech = self if parser.respond_to? :mech=
      if parser.respond_to?(:watch_for_set=) && @watch_for_set
        parser.watch_for_set = @watch_for_set
      end
    }

  }

  # If the server sends back keep alive options, save them
  if keep_alive_info = response['keep-alive']
    keep_alive_info.split(/,\s*/).each do |option|
      k, v = option.split(/=/)
      cache_obj[:keep_alive_options] ||= {}
      cache_obj[:keep_alive_options][k.intern] = v
    end
  end

  (response.get_fields('Set-Cookie')||[]).each do |cookie|
    Cookie::parse(uri, cookie, log) { |c|
      log.debug("saved cookie: #{c}") if log
      @cookie_jar.add(uri, c)
    }
  end

  log.info("status: #{ page.code }") if log

  res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]

  if follow_meta_refresh && page.respond_to?(:meta) &&
     (redirect = page.meta.first)
    return redirect.click
  end

  # A status code without a registered response class cannot be
  # classified below; report it as a response-code error.
  if res_klass.nil?
    raise ResponseCodeError.new(page), "Unhandled response", caller
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri)
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri  = page.uri
    abs_uri   = to_absolute_uri(response['Location'].to_s, page)
    page = fetch_page(abs_uri, fetch_request(abs_uri), page)
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # Already tried credentials for this host: do not loop.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page(  uri,
                        fetch_request(uri, request.method.downcase.to_sym),
                        cur_page,
                        request_data
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
618
+
619
# Joins name/value pairs into a form-encoded query string
# ("a=1&b=2"). Pairs whose name is nil are skipped; values are converted
# with +to_s+ before escaping.
def self.build_query_string(parameters)
  encoded_pairs = parameters.map { |name, value|
    next if name.nil?
    escaped_name  = WEBrick::HTTPUtils.escape_form(name)
    escaped_value = WEBrick::HTTPUtils.escape_form(value.to_s)
    [escaped_name, escaped_value].join("=")
  }

  encoded_pairs.compact.join("&")
end
630
+
631
# Pushes +page+ onto the history, keyed by its absolute URI.
def add_to_history(page)
  absolute = to_absolute_uri(page.uri)
  @history.push(page, absolute)
end
634
+ end
635
+ end