mechanize 2.0.pre.2 → 2.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (46) hide show
  1. data.tar.gz.sig +0 -0
  2. data/CHANGELOG.rdoc +22 -0
  3. data/Manifest.txt +11 -8
  4. data/Rakefile +2 -2
  5. data/examples/flickr_upload.rb +6 -7
  6. data/examples/mech-dump.rb +0 -2
  7. data/examples/proxy_req.rb +0 -2
  8. data/examples/rubyforge.rb +1 -3
  9. data/examples/spider.rb +2 -3
  10. data/lib/mechanize.rb +228 -680
  11. data/lib/mechanize/form/field.rb +1 -1
  12. data/lib/mechanize/history.rb +23 -5
  13. data/lib/mechanize/http.rb +3 -0
  14. data/lib/mechanize/http/agent.rb +738 -0
  15. data/lib/mechanize/inspect.rb +2 -2
  16. data/lib/mechanize/page.rb +101 -42
  17. data/lib/mechanize/page/frame.rb +24 -17
  18. data/lib/mechanize/page/link.rb +72 -54
  19. data/lib/mechanize/page/meta_refresh.rb +56 -0
  20. data/lib/mechanize/response_read_error.rb +27 -0
  21. data/test/htdocs/frame_referer_test.html +10 -0
  22. data/test/htdocs/tc_referer.html +4 -0
  23. data/test/test_frames.rb +9 -0
  24. data/test/test_history.rb +74 -98
  25. data/test/test_mechanize.rb +334 -812
  26. data/test/test_mechanize_form.rb +32 -3
  27. data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
  28. data/test/test_mechanize_http_agent.rb +697 -0
  29. data/test/test_mechanize_link.rb +83 -0
  30. data/test/test_mechanize_page_encoding.rb +147 -0
  31. data/test/test_mechanize_page_link.rb +379 -0
  32. data/test/test_mechanize_page_meta_refresh.rb +115 -0
  33. data/test/test_pretty_print.rb +1 -1
  34. data/test/test_referer.rb +29 -5
  35. data/test/test_response_code.rb +21 -20
  36. data/test/test_robots.rb +13 -17
  37. data/test/test_scheme.rb +1 -1
  38. metadata +30 -31
  39. metadata.gz.sig +0 -0
  40. data/lib/mechanize/page/meta.rb +0 -48
  41. data/test/test_form_no_inputname.rb +0 -15
  42. data/test/test_links.rb +0 -146
  43. data/test/test_mechanize_page.rb +0 -224
  44. data/test/test_meta.rb +0 -67
  45. data/test/test_upload.rb +0 -109
  46. data/test/test_verbs.rb +0 -25
@@ -29,7 +29,7 @@ class Mechanize
29
29
  return -1 if Hash === other.node
30
30
  node <=> other.node
31
31
  end
32
-
32
+
33
33
  # This method is a shortcut to get field's DOM id.
34
34
  # Common usage: form.field_with(:dom_id => "foo")
35
35
  def dom_id
@@ -14,26 +14,40 @@ class Mechanize
14
14
  @history_index = orig.instance_variable_get(:@history_index).dup
15
15
  end
16
16
 
17
+ def inspect # :nodoc:
18
+ uris = map { |page| page.uri }.join ', '
19
+
20
+ "[#{uris}]"
21
+ end
22
+
17
23
  def push(page, uri = nil)
18
24
  super(page)
25
+
19
26
  @history_index[(uri ? uri : page.uri).to_s] = page
27
+
20
28
  if @max_size && self.length > @max_size
21
29
  while self.length > @max_size
22
30
  self.shift
23
31
  end
24
32
  end
33
+
25
34
  self
26
35
  end
27
36
  alias :<< :push
28
37
 
29
- def visited?(url)
30
- ! visited_page(url).nil?
31
- end
38
+ def visited_page(uri)
39
+ page = @history_index[uri.to_s]
32
40
 
33
- def visited_page(url)
34
- @history_index[(url.respond_to?(:uri) ? url.uri : url).to_s]
41
+ return page if page # HACK
42
+
43
+ uri = uri.dup
44
+ uri.path = '/' if uri.path.empty?
45
+
46
+ @history_index[uri.to_s]
35
47
  end
36
48
 
49
+ alias visited? visited_page
50
+
37
51
  def clear
38
52
  @history_index.clear
39
53
  super
@@ -43,7 +57,9 @@ class Mechanize
43
57
  return nil if length == 0
44
58
  page = self[0]
45
59
  self[0] = nil
60
+
46
61
  super
62
+
47
63
  remove_from_index(page)
48
64
  page
49
65
  end
@@ -56,10 +72,12 @@ class Mechanize
56
72
  end
57
73
 
58
74
  private
75
+
59
76
  def remove_from_index(page)
60
77
  @history_index.each do |k,v|
61
78
  @history_index.delete(k) if v == page
62
79
  end
63
80
  end
81
+
64
82
  end
65
83
  end
@@ -0,0 +1,3 @@
1
+ class Mechanize::HTTP
2
+ end
3
+
@@ -0,0 +1,738 @@
1
+ ##
2
+ # An HTTP (and local disk access) user agent
3
+
4
+ class Mechanize::HTTP::Agent
5
+
6
+ attr_reader :cookie_jar
7
+
8
+ # Enables If-Modified-Since conditional requests when true (the default)
9
+ attr_accessor :conditional_requests
10
+ attr_accessor :context
11
+
12
+ # Follow HTML meta refresh. If set to +:anywhere+ meta refresh tags outside
13
+ # of the head element will be followed.
14
+ attr_accessor :follow_meta_refresh
15
+ attr_accessor :gzip_enabled
16
+ attr_accessor :history
17
+
18
+ # Length of time to wait until a connection is opened in seconds
19
+ attr_accessor :open_timeout
20
+
21
+ attr_accessor :password
22
+ attr_reader :proxy_uri
23
+
24
+ # A list of hooks to call after retrieving a response. Hooks are called with
25
+ # the agent, the request URI, the response and the response body.
26
+
27
+ attr_reader :post_connect_hooks
28
+
29
+ # A list of hooks to call before making a request. Hooks are called with
30
+ # the agent and the request to be performed.
31
+
32
+ attr_reader :pre_connect_hooks
33
+
34
+ # Length of time to attempt to read data from the server
35
+ attr_accessor :read_timeout
36
+
37
+ # Controls how this agent deals with redirects. The following values are
38
+ # allowed:
39
+ #
40
+ # :all, true:: All 3xx redirects are followed (default)
41
+ # :permanent:: Only 301 Moved Permanently redirects are followed
42
+ # false:: No redirects are followed
43
+
44
+ attr_accessor :redirect_ok
45
+ attr_accessor :redirection_limit
46
+
47
+ # A hash of request headers to be used
48
+
49
+ attr_accessor :request_headers
50
+
51
+ # When true, this agent will consult the site's robots.txt for each access.
52
+
53
+ attr_reader :robots
54
+
55
+ attr_accessor :scheme_handlers
56
+
57
+ attr_accessor :user
58
+ attr_reader :user_agent
59
+
60
+ # Path to an OpenSSL server certificate file
61
+ attr_accessor :ca_file
62
+
63
+ # An OpenSSL private key or the path to a private key
64
+ attr_accessor :key
65
+
66
+ # An OpenSSL client certificate or the path to a certificate file.
67
+ attr_accessor :cert
68
+
69
+ # OpenSSL key password
70
+ attr_accessor :pass
71
+
72
+ # A callback for additional certificate verification. See
73
+ # OpenSSL::SSL::SSLContext#verify_callback
74
+ #
75
+ # The callback can be used for debugging or to ignore errors by always
76
+ # returning +true+. Specifying nil uses the default method that was valid
77
+ # when the SSLContext was created
78
+ attr_accessor :verify_callback
79
+
80
+ attr_reader :http # :nodoc:
81
+
82
+ def initialize
83
+ @auth_hash = {} # Keep track of urls for sending auth
84
+ @conditional_requests = true
85
+ @context = nil
86
+ @cookie_jar = Mechanize::CookieJar.new
87
+ @digest = nil # DigestAuth Digest
88
+ @digest_auth = Net::HTTP::DigestAuth.new
89
+ @follow_meta_refresh = false
90
+ @gzip_enabled = true
91
+ @history = Mechanize::History.new
92
+ @keep_alive_time = 300
93
+ @open_timeout = nil
94
+ @password = nil # HTTP auth password
95
+ @post_connect_hooks = []
96
+ @pre_connect_hooks = []
97
+ @proxy_uri = nil
98
+ @read_timeout = nil
99
+ @redirect_ok = true
100
+ @redirection_limit = 20
101
+ @request_headers = {}
102
+ @robots = false
103
+ @user = nil # HTTP auth user
104
+ @user_agent = nil
105
+ @webrobots = nil
106
+
107
+ @ca_file = nil # OpenSSL server certificate file
108
+ @cert = nil # OpenSSL Certificate
109
+ @key = nil # OpenSSL Private Key
110
+ @pass = nil # OpenSSL Password
111
+ @verify_callback = nil
112
+
113
+ @scheme_handlers = Hash.new { |h, scheme|
114
+ h[scheme] = lambda { |link, page|
115
+ raise Mechanize::UnsupportedSchemeError, scheme
116
+ }
117
+ }
118
+
119
+ @scheme_handlers['http'] = lambda { |link, page| link }
120
+ @scheme_handlers['https'] = @scheme_handlers['http']
121
+ @scheme_handlers['relative'] = @scheme_handlers['http']
122
+ @scheme_handlers['file'] = @scheme_handlers['http']
123
+ end
124
+
125
+ # Equivalent to the browser back button. Returns the most recent page
126
+ # visited.
127
+ def back
128
+ @history.pop
129
+ end
130
+
131
+ def certificate
132
+ @http.certificate
133
+ end
134
+
135
+ def connection_for uri
136
+ case uri.scheme.downcase
137
+ when 'http', 'https' then
138
+ return @http
139
+ when 'file' then
140
+ return Mechanize::FileConnection.new
141
+ end
142
+ end
143
+
144
+ ##
145
+ # Returns the latest page loaded by the agent
146
+
147
+ def current_page
148
+ @history.last
149
+ end
150
+
151
+ def enable_gzip request
152
+ request['accept-encoding'] = if @gzip_enabled
153
+ 'gzip,deflate,identity'
154
+ else
155
+ 'identity'
156
+ end
157
+ end
158
+
159
+ # uri is an absolute URI
160
+ def fetch uri, method = :get, headers = {}, params = [],
161
+ referer = current_page, redirects = 0
162
+ referer_uri = referer ? referer.uri : nil
163
+
164
+ uri = resolve uri, referer
165
+
166
+ uri, params = resolve_parameters uri, method, params
167
+
168
+ request = http_request uri, method, params
169
+
170
+ connection = connection_for uri
171
+
172
+ request_auth request, uri
173
+
174
+ enable_gzip request
175
+
176
+ request_language_charset request
177
+ request_cookies request, uri
178
+ request_host request, uri
179
+ request_referer request, uri, referer_uri
180
+ request_user_agent request
181
+ request_add_headers request, headers
182
+
183
+ pre_connect request
184
+
185
+ # Consult robots.txt
186
+ if robots && uri.is_a?(URI::HTTP)
187
+ robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
188
+ end
189
+
190
+ # Add If-Modified-Since if page is in history
191
+ if (page = visited_page(uri)) and page.response['Last-Modified']
192
+ request['If-Modified-Since'] = page.response['Last-Modified']
193
+ end if(@conditional_requests)
194
+
195
+ # Specify timeouts if given
196
+ connection.open_timeout = @open_timeout if @open_timeout
197
+ connection.read_timeout = @read_timeout if @read_timeout
198
+
199
+ request_log request
200
+
201
+ response_body_io = nil
202
+
203
+ # Send the request
204
+ response = connection.request(uri, request) { |res|
205
+ response_log res
206
+
207
+ response_body_io = response_read res, request
208
+
209
+ res
210
+ }
211
+
212
+ response_body = response_content_encoding response, response_body_io
213
+
214
+ post_connect uri, response, response_body
215
+
216
+ page = response_parse response, response_body, uri
217
+
218
+ response_cookies response, uri, page
219
+
220
+ meta = response_follow_meta_refresh response, uri, page, redirects
221
+ return meta if meta
222
+
223
+ case response
224
+ when Net::HTTPSuccess
225
+ if robots && page.is_a?(Mechanize::Page)
226
+ page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
227
+ end
228
+
229
+ page
230
+ when Mechanize::FileResponse
231
+ page
232
+ when Net::HTTPNotModified
233
+ log.debug("Got cached page") if log
234
+ visited_page(uri) || page
235
+ when Net::HTTPRedirection
236
+ response_redirect response, method, page, redirects
237
+ when Net::HTTPUnauthorized
238
+ response_authenticate(response, page, uri, request, headers, params,
239
+ referer)
240
+ else
241
+ raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
242
+ end
243
+ end
244
+
245
+ def max_history
246
+ @history.max_size
247
+ end
248
+
249
+ def max_history=(length)
250
+ @history.max_size = length
251
+ end
252
+
253
##
# Builds the request object for +uri+.
#
# For http and https URIs the Net::HTTP request class named after +method+
# is looked up (:get becomes Net::HTTP::Get) and instantiated with
# uri.request_uri. When +params+ is given its first element becomes the
# request body. For file URIs a Mechanize::FileRequest is returned. Any
# other scheme falls through the case and yields nil.

def http_request uri, method, params = nil
  case uri.scheme.downcase
  when 'http', 'https' then
    klass = Net::HTTP.const_get(method.to_s.capitalize)

    # plain assignment: the local never has a previous value to preserve
    request = klass.new(uri.request_uri)
    request.body = params.first if params

    request
  when 'file' then
    Mechanize::FileRequest.new uri
  end
end
266
+
267
+ def log
268
+ Mechanize.log
269
+ end
270
+
271
+ ##
272
+ # Invokes hooks added to post_connect_hooks after a +response+ is returned
273
+ # and the response +body+ is handled.
274
+ #
275
+ # Yields the agent, the +uri+ for the request, the +response+ and the
276
+ # response +body+.
277
+
278
+ def post_connect uri, response, body # :yields: agent, uri, response, body
279
+ @post_connect_hooks.each do |hook|
280
+ hook.call self, uri, response, body
281
+ end
282
+ end
283
+
284
+ ##
285
+ # Invokes hooks added to pre_connect_hooks before a +request+ is made.
286
+ # Yields the +agent+ and the +request+ that will be performed to each hook.
287
+
288
+ def pre_connect request # :yields: agent, request
289
+ @pre_connect_hooks.each do |hook|
290
+ hook.call self, request
291
+ end
292
+ end
293
+
294
##
# Adds credentials to +request+ when an authentication scheme has already
# been recorded for uri.host in @auth_hash; does nothing otherwise.
#
# :basic::               sets HTTP basic auth from @user and @password
# :digest, :iis_digest:: asks @digest_auth for an Authorization header built
#                        from the stored challenge in @digest
#
# The digest helper reads the credentials from the URI it is handed, so they
# are set on a copy. Writing them into +uri+ itself would leave the password
# embedded in the caller's URI for everything that uses it afterwards.

def request_auth request, uri
  auth_type = @auth_hash[uri.host]

  return unless auth_type

  case auth_type
  when :basic
    request.basic_auth @user, @password
  when :digest, :iis_digest
    auth_uri = uri.dup
    auth_uri.user = @user
    auth_uri.password = @password

    iis = auth_type == :iis_digest

    auth = @digest_auth.auth_header auth_uri, @digest, request.method, iis

    request['Authorization'] = auth
  end
end
313
+
314
+ def request_cookies request, uri
315
+ return if @cookie_jar.empty? uri
316
+
317
+ cookies = @cookie_jar.cookies uri
318
+
319
+ return if cookies.empty?
320
+
321
+ request.add_field 'Cookie', cookies.join('; ')
322
+ end
323
+
324
##
# Sets the Host header of +request+ from +uri+.
#
# The port is appended only when it differs from the default port of the
# URI's own scheme. Comparing against a fixed list of 80 and 443 would drop
# the port for http://host:443 and https://host:80, where it is required.

def request_host request, uri
  port = uri.port == uri.default_port ? nil : uri.port

  request['Host'] = [uri.host, port].compact.join ':'
end
330
+
331
+ def request_language_charset request
332
+ request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
333
+ request['accept-language'] = 'en-us,en;q=0.5'
334
+ end
335
+
336
+ # Log specified headers for the request
337
+ def request_log request
338
+ return unless log
339
+
340
+ log.info("#{request.class}: #{request.path}")
341
+
342
+ request.each_header do |k, v|
343
+ log.debug("request-header: #{k} => #{v}")
344
+ end
345
+ end
346
+
347
+ def request_add_headers request, headers = {}
348
+ @request_headers.each do |k,v|
349
+ request[k] = v
350
+ end
351
+
352
+ headers.each do |field, value|
353
+ case field
354
+ when :etag then request["ETag"] = value
355
+ when :if_modified_since then request["If-Modified-Since"] = value
356
+ when Symbol then
357
+ raise ArgumentError, "unknown header symbol #{field}"
358
+ else
359
+ request[field] = value
360
+ end
361
+ end
362
+ end
363
+
364
+ def request_referer request, uri, referer
365
+ return unless referer
366
+ return if 'https' == referer.scheme.downcase and
367
+ 'https' != uri.scheme.downcase
368
+
369
+ request['Referer'] = referer
370
+ end
371
+
372
+ def request_user_agent request
373
+ request['User-Agent'] = @user_agent if @user_agent
374
+ end
375
+
376
+ def resolve(uri, referer = current_page)
377
+ uri = uri.dup if uri.is_a?(URI)
378
+
379
+ unless uri.is_a?(URI)
380
+ uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
381
+ if RUBY_VERSION >= "1.9.0"
382
+ Mechanize::Util.uri_escape(match)
383
+ else
384
+ sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
385
+ end
386
+ }
387
+
388
+ unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
389
+ escaped = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
390
+
391
+ escaped_uri = Mechanize::Util.html_unescape(
392
+ unescaped.zip(escaped).map { |x,y|
393
+ "#{WEBrick::HTTPUtils.escape(x)}#{y}"
394
+ }.join('')
395
+ )
396
+
397
+ begin
398
+ uri = URI.parse(escaped_uri)
399
+ rescue
400
+ uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
401
+ end
402
+ end
403
+
404
+ scheme = uri.relative? ? 'relative' : uri.scheme.downcase
405
+ uri = @scheme_handlers[scheme].call(uri, referer)
406
+
407
+ if referer && referer.uri
408
+ if uri.path.length == 0 && uri.relative?
409
+ uri.path = referer.uri.path
410
+ end
411
+ end
412
+
413
+ uri.path = '/' if uri.path.length == 0
414
+
415
+ if uri.relative?
416
+ raise ArgumentError, "absolute URL needed (not #{uri})" unless
417
+ referer && referer.uri
418
+
419
+ base = nil
420
+ if referer.respond_to?(:bases) && referer.parser
421
+ base = referer.bases.last
422
+ end
423
+
424
+ uri = ((base && base.uri && base.uri.absolute?) ?
425
+ base.uri :
426
+ referer.uri) + uri
427
+ uri = referer.uri + uri
428
+ # Strip initial "/.." bits from the path
429
+ uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
430
+ end
431
+
432
+ unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
433
+ raise ArgumentError, "unsupported scheme: #{uri.scheme}"
434
+ end
435
+
436
+ uri
437
+ end
438
+
439
##
# Decides where request +parameters+ travel for the given HTTP +method+.
#
# For :head, :get, :delete and :trace the parameters are encoded with
# Mechanize::Util.build_query_string and appended to the query of +uri+
# (joined with "&" when a query already exists); the returned parameter
# list is then nil. For every other method +uri+ and +parameters+ are
# returned unchanged.
#
# The query is rebuilt as a new String and assigned back. Appending to
# uri.query in place would also change any other URI object that shares
# the same String, such as the one a shallow URI#dup was taken from.

def resolve_parameters uri, method, parameters
  case method
  when :head, :get, :delete, :trace then
    if parameters and parameters.length > 0
      query = uri.query.to_s.dup
      query << '&' unless query.empty?
      query << Mechanize::Util.build_query_string(parameters)

      uri.query = query
    end

    return uri, nil
  end

  return uri, parameters
end
453
+
454
##
# Decodes the body in +body_io+ according to the Content-Encoding header of
# +response+ and returns it as a String.
#
# nil, none, 7bit:: the body is returned as is
# deflate::         zlib inflate, retried as a raw deflate stream on failure
# gzip, x-gzip::    gunzip, retried as a raw deflate stream (skipping the
#                   first ten bytes) on failure
#
# An empty deflate or gzip body returns nil. A body that cannot be decoded
# by either attempt returns an empty String. Any other Content-Encoding
# raises Mechanize::Error.

def response_content_encoding response, body_io
  length = response.content_length || body_io.length

  case response['Content-Encoding']
  when nil, 'none', '7bit' then
    body_io.string
  when 'deflate' then
    log.debug('deflate body') if log

    return if length.zero?

    begin
      Zlib::Inflate.inflate body_io.string
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      begin
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.string)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to inflate page: #{$!}") if log
        ''
      end
    end
  when 'gzip', 'x-gzip' then
    log.debug('gzip body') if log

    return if length.zero?

    begin
      zio = Zlib::GzipReader.new body_io
      zio.read
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log

      # An error raised inside a rescue clause is not seen by the sibling
      # rescue clauses, so the fallback needs its own handler.
      begin
        body_io.rewind
        body_io.read 10
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.read)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to gunzip page: #{$!}") if log
        ''
      end
    rescue Zlib::DataError
      log.error("unable to gunzip page: #{$!}") if log
      ''
    ensure
      zio.close if zio and not zio.closed?
    end
  else
    raise Mechanize::Error,
          "Unsupported Content-Encoding: #{response['Content-Encoding']}"
  end
end
500
+
501
+ def response_cookies response, uri, page
502
+ if Mechanize::Page === page and page.body =~ /Set-Cookie/n
503
+ page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
504
+ Mechanize::Cookie.parse(uri, meta['content']) { |c|
505
+ log.debug("saved cookie: #{c}") if log
506
+ @cookie_jar.add(uri, c)
507
+ }
508
+ end
509
+ end
510
+
511
+ header_cookies = response.get_fields 'Set-Cookie'
512
+
513
+ return unless header_cookies
514
+
515
+ header_cookies.each do |cookie|
516
+ Mechanize::Cookie.parse(uri, cookie) { |c|
517
+ log.debug("saved cookie: #{c}") if log
518
+ @cookie_jar.add(uri, c)
519
+ }
520
+ end
521
+ end
522
+
523
##
# Follows a refresh request found on +page+ or in +response+ when
# follow_meta_refresh is enabled; returns nil when it is disabled or there
# is nothing to follow.
#
# A meta refresh element on +page+ takes precedence over a Refresh response
# header. In both cases the agent sleeps for the requested delay, records
# +page+ in the history and fetches the target with GET, returning the
# result of that fetch.
#
# Raises Mechanize::Error for an unparsable Refresh header and
# Mechanize::RedirectLimitReachedError once following would exceed
# redirection_limit. The limit applies to both kinds of refresh, so a page
# that keeps refreshing through meta elements cannot loop forever.

def response_follow_meta_refresh response, uri, page, redirects
  return unless @follow_meta_refresh

  redirect_uri = nil
  referer = page

  if page.respond_to?(:meta_refresh) and (redirect = page.meta_refresh.first)
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
    sleep redirect.node['delay'].to_f
    referer = Mechanize::Page.new(nil, {'content-type'=>'text/html'})
  elsif refresh = response['refresh']
    delay, redirect_uri = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    sleep delay.to_f
  end

  if redirect_uri
    @history.push(page, page.uri)
    fetch redirect_uri, :get, {}, [], referer, redirects + 1
  end
end
546
+
547
+ def response_log response
548
+ return unless log
549
+
550
+ log.info("status: #{response.class} #{response.http_version} " \
551
+ "#{response.code} #{response.message}")
552
+
553
+ response.each_header do |k, v|
554
+ log.debug("response-header: #{k} => #{v}")
555
+ end
556
+ end
557
+
558
+ def response_parse response, body, uri
559
+ @context.parse uri, response, body
560
+ end
561
+
562
+ def response_read response, request
563
+ body_io = StringIO.new
564
+ body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
565
+ total = 0
566
+
567
+ begin
568
+ response.read_body { |part|
569
+ total += part.length
570
+ body_io.write(part)
571
+ log.debug("Read #{part.length} bytes (#{total} total)") if log
572
+ }
573
+ rescue Net::HTTP::Persistent::Error => e
574
+ body_io.rewind
575
+ raise Mechanize::ResponseReadError.new(e, response, body_io)
576
+ end
577
+
578
+ body_io.rewind
579
+
580
+ raise Mechanize::ResponseCodeError, response if
581
+ Net::HTTPUnknownResponse === response
582
+
583
+ content_length = response.content_length
584
+
585
+ unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
586
+ raise EOFError, "Content-Length (#{content_length}) does not match " \
587
+ "response body length (#{body_io.length})" if
588
+ content_length and content_length != body_io.length
589
+ end
590
+
591
+ body_io
592
+ end
593
+
594
##
# Handles a redirect +response+ according to redirect_ok.
#
# true, :all::  the redirect is followed
# false, nil::  +page+ is returned without following
# :permanent::  only a Net::HTTPMovedPermanently response is followed; for
#               any other redirect +page+ is returned
#
# Following fetches the Location header with +page+ as referer, using HEAD
# when the original +method+ was :head and GET otherwise. The fetched page
# is pushed onto the history under the URI of the redirecting page and
# returned.
#
# Raises Mechanize::RedirectLimitReachedError once following would exceed
# redirection_limit.

def response_redirect response, method, page, redirects
  case @redirect_ok
  when true, :all
    # shortcut
  when false, nil
    return page
  when :permanent
    # test the response itself; there is no response_class in scope
    return page unless Net::HTTPMovedPermanently === response
  end

  log.info("follow redirect to: #{response['Location']}") if log

  from_uri = page.uri

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    redirects + 1 > @redirection_limit

  redirect_method = method == :head ? :head : :get

  page = fetch(response['Location'].to_s, redirect_method, {}, [], page,
               redirects + 1)

  @history.push(page, from_uri)

  page
end
620
+
621
+ def response_authenticate(response, page, uri, request, headers, params,
622
+ referer)
623
+ raise Mechanize::ResponseCodeError, page unless @user || @password
624
+ raise Mechanize::ResponseCodeError, page if @auth_hash.has_key?(uri.host)
625
+
626
+ if response['www-authenticate'] =~ /Digest/i
627
+ @auth_hash[uri.host] = :digest
628
+ if response['server'] =~ /Microsoft-IIS/
629
+ @auth_hash[uri.host] = :iis_digest
630
+ end
631
+ @digest = response['www-authenticate']
632
+ else
633
+ @auth_hash[uri.host] = :basic
634
+ end
635
+
636
+ fetch uri, request.method.downcase.to_sym, headers, params, referer
637
+ end
638
+
639
+ def robots= value
640
+ require 'webrobots' if value
641
+ @webrobots = nil if value != @robots
642
+ @robots = value
643
+ end
644
+
645
+ ##
646
+ # Tests if this agent is allowed to access +uri+, consulting the site's
647
+ # robots.txt.
648
+
649
+ def robots_allowed? uri
650
+ return true if uri.request_uri == '/robots.txt'
651
+
652
+ webrobots.allowed? uri
653
+ end
654
+
655
+ # Opposite of robots_allowed?
656
+
657
+ def robots_disallowed? url
658
+ !robots_allowed? url
659
+ end
660
+
661
+ # Returns an error object if there is an error in fetching or parsing
662
+ # robots.txt of the site +url+.
663
+ def robots_error(url)
664
+ webrobots.error(url)
665
+ end
666
+
667
+ # Raises the error if there is an error in fetching or parsing robots.txt of
668
+ # the site +url+.
669
+ def robots_error!(url)
670
+ webrobots.error!(url)
671
+ end
672
+
673
+ # Removes robots.txt cache for the site +url+.
674
+ def robots_reset(url)
675
+ webrobots.reset(url)
676
+ end
677
+
678
+ def set_http
679
+ @http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri
680
+
681
+ @http.keep_alive = @keep_alive_time
682
+
683
+ @http.ca_file = @ca_file
684
+ @http.verify_callback = @verify_callback
685
+
686
+ if @cert and @key then
687
+ cert = if OpenSSL::X509::Certificate === @cert then
688
+ @cert
689
+ else
690
+ OpenSSL::X509::Certificate.new ::File.read @cert
691
+ end
692
+
693
+ key = if OpenSSL::PKey::PKey === @key then
694
+ @key
695
+ else
696
+ OpenSSL::PKey::RSA.new ::File.read(@key), @pass
697
+ end
698
+
699
+ @http.certificate = cert
700
+ @http.private_key = key
701
+ end
702
+ end
703
+
704
+ # Sets the proxy address, port, user, and password. +addr+ should be a host,
705
+ # with no "http://"
706
+ def set_proxy(addr, port, user = nil, pass = nil)
707
+ return unless addr and port
708
+ @proxy_uri = URI "http://#{addr}"
709
+ @proxy_uri.port = port
710
+ @proxy_uri.user = user if user
711
+ @proxy_uri.password = pass if pass
712
+
713
+ @proxy_uri
714
+ end
715
+
716
+ def user_agent= user_agent
717
+ @webrobots = nil if user_agent != @user_agent
718
+ @user_agent = user_agent
719
+ end
720
+
721
+ # Returns a visited page for the url passed in, otherwise nil
722
+ def visited_page url
723
+ @history.visited_page resolve url
724
+ end
725
+
726
+ def get_robots(uri) # :nodoc:
727
+ fetch(uri).body
728
+ rescue Mechanize::ResponseCodeError => e
729
+ return '' if e.response_code == '404'
730
+ raise e
731
+ end
732
+
733
+ def webrobots
734
+ @webrobots ||= WebRobots.new(@user_agent, :http_get => method(:get_robots))
735
+ end
736
+
737
+ end
738
+