neocoin-mechanize 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (174) hide show
  1. data/.autotest +6 -0
  2. data/.gemtest +0 -0
  3. data/CHANGELOG.rdoc +638 -0
  4. data/EXAMPLES.rdoc +187 -0
  5. data/FAQ.rdoc +11 -0
  6. data/GUIDE.rdoc +163 -0
  7. data/LICENSE.rdoc +20 -0
  8. data/Manifest.txt +172 -0
  9. data/README.rdoc +63 -0
  10. data/Rakefile +36 -0
  11. data/examples/flickr_upload.rb +22 -0
  12. data/examples/mech-dump.rb +5 -0
  13. data/examples/proxy_req.rb +7 -0
  14. data/examples/rubyforge.rb +20 -0
  15. data/examples/spider.rb +21 -0
  16. data/lib/mechanize.rb +662 -0
  17. data/lib/mechanize/content_type_error.rb +14 -0
  18. data/lib/mechanize/cookie.rb +85 -0
  19. data/lib/mechanize/cookie_jar.rb +241 -0
  20. data/lib/mechanize/element_matcher.rb +35 -0
  21. data/lib/mechanize/file.rb +80 -0
  22. data/lib/mechanize/file_connection.rb +17 -0
  23. data/lib/mechanize/file_request.rb +26 -0
  24. data/lib/mechanize/file_response.rb +74 -0
  25. data/lib/mechanize/file_saver.rb +37 -0
  26. data/lib/mechanize/form.rb +478 -0
  27. data/lib/mechanize/form/button.rb +9 -0
  28. data/lib/mechanize/form/check_box.rb +11 -0
  29. data/lib/mechanize/form/field.rb +44 -0
  30. data/lib/mechanize/form/file_upload.rb +23 -0
  31. data/lib/mechanize/form/image_button.rb +20 -0
  32. data/lib/mechanize/form/multi_select_list.rb +83 -0
  33. data/lib/mechanize/form/option.rb +49 -0
  34. data/lib/mechanize/form/radio_button.rb +48 -0
  35. data/lib/mechanize/form/select_list.rb +40 -0
  36. data/lib/mechanize/headers.rb +25 -0
  37. data/lib/mechanize/history.rb +83 -0
  38. data/lib/mechanize/http.rb +3 -0
  39. data/lib/mechanize/http/agent.rb +738 -0
  40. data/lib/mechanize/inspect.rb +88 -0
  41. data/lib/mechanize/monkey_patch.rb +37 -0
  42. data/lib/mechanize/page.rb +408 -0
  43. data/lib/mechanize/page/base.rb +8 -0
  44. data/lib/mechanize/page/frame.rb +27 -0
  45. data/lib/mechanize/page/image.rb +30 -0
  46. data/lib/mechanize/page/label.rb +20 -0
  47. data/lib/mechanize/page/link.rb +82 -0
  48. data/lib/mechanize/page/meta_refresh.rb +56 -0
  49. data/lib/mechanize/pluggable_parsers.rb +101 -0
  50. data/lib/mechanize/redirect_limit_reached_error.rb +16 -0
  51. data/lib/mechanize/redirect_not_get_or_head_error.rb +19 -0
  52. data/lib/mechanize/response_code_error.rb +22 -0
  53. data/lib/mechanize/response_read_error.rb +27 -0
  54. data/lib/mechanize/robots_disallowed_error.rb +29 -0
  55. data/lib/mechanize/unsupported_scheme_error.rb +8 -0
  56. data/lib/mechanize/util.rb +113 -0
  57. data/test/data/htpasswd +1 -0
  58. data/test/data/server.crt +16 -0
  59. data/test/data/server.csr +12 -0
  60. data/test/data/server.key +15 -0
  61. data/test/data/server.pem +15 -0
  62. data/test/helper.rb +175 -0
  63. data/test/htdocs/alt_text.html +10 -0
  64. data/test/htdocs/bad_form_test.html +9 -0
  65. data/test/htdocs/button.jpg +0 -0
  66. data/test/htdocs/canonical_uri.html +9 -0
  67. data/test/htdocs/dir with spaces/foo.html +1 -0
  68. data/test/htdocs/empty_form.html +6 -0
  69. data/test/htdocs/file_upload.html +26 -0
  70. data/test/htdocs/find_link.html +41 -0
  71. data/test/htdocs/form_multi_select.html +16 -0
  72. data/test/htdocs/form_multival.html +37 -0
  73. data/test/htdocs/form_no_action.html +18 -0
  74. data/test/htdocs/form_no_input_name.html +16 -0
  75. data/test/htdocs/form_select.html +16 -0
  76. data/test/htdocs/form_select_all.html +16 -0
  77. data/test/htdocs/form_select_none.html +17 -0
  78. data/test/htdocs/form_select_noopts.html +10 -0
  79. data/test/htdocs/form_set_fields.html +14 -0
  80. data/test/htdocs/form_test.html +188 -0
  81. data/test/htdocs/frame_referer_test.html +10 -0
  82. data/test/htdocs/frame_test.html +30 -0
  83. data/test/htdocs/google.html +13 -0
  84. data/test/htdocs/iframe_test.html +16 -0
  85. data/test/htdocs/index.html +6 -0
  86. data/test/htdocs/link with space.html +5 -0
  87. data/test/htdocs/meta_cookie.html +11 -0
  88. data/test/htdocs/no_title_test.html +6 -0
  89. data/test/htdocs/nofollow.html +9 -0
  90. data/test/htdocs/noindex.html +9 -0
  91. data/test/htdocs/norobots.html +8 -0
  92. data/test/htdocs/rails_3_encoding_hack_form_test.html +27 -0
  93. data/test/htdocs/rel_nofollow.html +8 -0
  94. data/test/htdocs/relative/tc_relative_links.html +21 -0
  95. data/test/htdocs/robots.html +8 -0
  96. data/test/htdocs/robots.txt +2 -0
  97. data/test/htdocs/tc_bad_charset.html +9 -0
  98. data/test/htdocs/tc_bad_links.html +5 -0
  99. data/test/htdocs/tc_base_images.html +10 -0
  100. data/test/htdocs/tc_base_link.html +8 -0
  101. data/test/htdocs/tc_blank_form.html +11 -0
  102. data/test/htdocs/tc_charset.html +6 -0
  103. data/test/htdocs/tc_checkboxes.html +19 -0
  104. data/test/htdocs/tc_encoded_links.html +5 -0
  105. data/test/htdocs/tc_field_precedence.html +11 -0
  106. data/test/htdocs/tc_follow_meta.html +8 -0
  107. data/test/htdocs/tc_form_action.html +48 -0
  108. data/test/htdocs/tc_images.html +8 -0
  109. data/test/htdocs/tc_links.html +18 -0
  110. data/test/htdocs/tc_meta_in_body.html +9 -0
  111. data/test/htdocs/tc_no_attributes.html +16 -0
  112. data/test/htdocs/tc_pretty_print.html +17 -0
  113. data/test/htdocs/tc_radiobuttons.html +17 -0
  114. data/test/htdocs/tc_referer.html +16 -0
  115. data/test/htdocs/tc_relative_links.html +19 -0
  116. data/test/htdocs/tc_textarea.html +23 -0
  117. data/test/htdocs/test_bad_encoding.html +52 -0
  118. data/test/htdocs/test_click.html +11 -0
  119. data/test/htdocs/unusual______.html +5 -0
  120. data/test/servlets.rb +402 -0
  121. data/test/ssl_server.rb +48 -0
  122. data/test/test_cookies.rb +129 -0
  123. data/test/test_form_action.rb +52 -0
  124. data/test/test_form_as_hash.rb +59 -0
  125. data/test/test_form_button.rb +46 -0
  126. data/test/test_frames.rb +34 -0
  127. data/test/test_headers.rb +33 -0
  128. data/test/test_history.rb +118 -0
  129. data/test/test_history_added.rb +16 -0
  130. data/test/test_html_unscape_forms.rb +46 -0
  131. data/test/test_if_modified_since.rb +20 -0
  132. data/test/test_images.rb +19 -0
  133. data/test/test_mechanize.rb +842 -0
  134. data/test/test_mechanize_cookie.rb +345 -0
  135. data/test/test_mechanize_cookie_jar.rb +401 -0
  136. data/test/test_mechanize_file.rb +53 -0
  137. data/test/test_mechanize_file_request.rb +19 -0
  138. data/test/test_mechanize_file_response.rb +21 -0
  139. data/test/test_mechanize_form.rb +576 -0
  140. data/test/test_mechanize_form_check_box.rb +37 -0
  141. data/test/test_mechanize_form_encoding.rb +120 -0
  142. data/test/test_mechanize_form_field.rb +21 -0
  143. data/test/test_mechanize_form_image_button.rb +12 -0
  144. data/test/test_mechanize_form_textarea.rb +51 -0
  145. data/test/test_mechanize_http_agent.rb +697 -0
  146. data/test/test_mechanize_link.rb +84 -0
  147. data/test/test_mechanize_page_encoding.rb +147 -0
  148. data/test/test_mechanize_page_link.rb +382 -0
  149. data/test/test_mechanize_page_meta_refresh.rb +115 -0
  150. data/test/test_mechanize_redirect_not_get_or_head_error.rb +18 -0
  151. data/test/test_mechanize_subclass.rb +22 -0
  152. data/test/test_mechanize_util.rb +92 -0
  153. data/test/test_multi_select.rb +118 -0
  154. data/test/test_no_attributes.rb +13 -0
  155. data/test/test_option.rb +18 -0
  156. data/test/test_pluggable_parser.rb +136 -0
  157. data/test/test_post_form.rb +37 -0
  158. data/test/test_pretty_print.rb +22 -0
  159. data/test/test_radiobutton.rb +75 -0
  160. data/test/test_redirect_limit_reached.rb +39 -0
  161. data/test/test_referer.rb +81 -0
  162. data/test/test_relative_links.rb +40 -0
  163. data/test/test_request.rb +13 -0
  164. data/test/test_response_code.rb +53 -0
  165. data/test/test_robots.rb +72 -0
  166. data/test/test_save_file.rb +48 -0
  167. data/test/test_scheme.rb +48 -0
  168. data/test/test_select.rb +119 -0
  169. data/test/test_select_all.rb +15 -0
  170. data/test/test_select_none.rb +15 -0
  171. data/test/test_select_noopts.rb +18 -0
  172. data/test/test_set_fields.rb +44 -0
  173. data/test/test_ssl_server.rb +20 -0
  174. metadata +354 -0
@@ -0,0 +1,3 @@
1
##
# Namespace class for Mechanize's HTTP layer (Mechanize::HTTP::Agent is
# defined beneath it).

class Mechanize::HTTP; end
3
+
@@ -0,0 +1,738 @@
1
+ ##
2
+ # An HTTP (and local disk access) user agent
3
+
4
+ class Mechanize::HTTP::Agent
5
+
6
+ attr_reader :cookie_jar
7
+
8
+ # Controls If-Modified-Since conditional requests (enabled by default; set to false to disable)
9
+ attr_accessor :conditional_requests
10
+ attr_accessor :context
11
+
12
+ # Follow HTML meta refresh. If set to +:anywhere+ meta refresh tags outside
13
+ # of the head element will be followed.
14
+ attr_accessor :follow_meta_refresh
15
+ attr_accessor :gzip_enabled
16
+ attr_accessor :history
17
+
18
+ # Length of time to wait until a connection is opened in seconds
19
+ attr_accessor :open_timeout
20
+
21
+ attr_accessor :password
22
+ attr_reader :proxy_uri
23
+
24
+ # A list of hooks to call after retrieving a response. Hooks are called with
25
+ # the agent and the response returned.
26
+
27
+ attr_reader :post_connect_hooks
28
+
29
+ # A list of hooks to call before making a request. Hooks are called with
30
+ # the agent and the request to be performed.
31
+
32
+ attr_reader :pre_connect_hooks
33
+
34
+ # Length of time to attempt to read data from the server
35
+ attr_accessor :read_timeout
36
+
37
+ # Controls how this agent deals with redirects. The following values are
38
+ # allowed:
39
+ #
40
+ # :all, true:: All 3xx redirects are followed (default)
41
+ # :permanent:: Only 301 Moved Permanently redirects are followed
42
+ # false:: No redirects are followed
43
+
44
+ attr_accessor :redirect_ok
45
+ attr_accessor :redirection_limit
46
+
47
+ # A hash of request headers to be used
48
+
49
+ attr_accessor :request_headers
50
+
51
+ # When true, this agent will consult the site's robots.txt for each access.
52
+
53
+ attr_reader :robots
54
+
55
+ attr_accessor :scheme_handlers
56
+
57
+ attr_accessor :user
58
+ attr_reader :user_agent
59
+
60
+ # Path to an OpenSSL server certificate file
61
+ attr_accessor :ca_file
62
+
63
+ # An OpenSSL private key or the path to a private key
64
+ attr_accessor :key
65
+
66
+ # An OpenSSL client certificate or the path to a certificate file.
67
+ attr_accessor :cert
68
+
69
+ # OpenSSL key password
70
+ attr_accessor :pass
71
+
72
+ # A callback for additional certificate verification. See
73
+ # OpenSSL::SSL::SSLContext#verify_callback
74
+ #
75
+ # The callback can be used for debugging or to ignore errors by always
76
+ # returning +true+. Specifying nil uses the default method that was valid
77
+ # when the SSLContext was created
78
+ attr_accessor :verify_callback
79
+
80
+ attr_reader :http # :nodoc:
81
+
82
##
# Creates an agent with its default configuration: conditional requests and
# gzip enabled, meta refresh and robots.txt handling disabled, all redirects
# followed up to a limit of 20, and no authentication, proxy, timeouts or SSL
# client settings.

def initialize
  # Request/response bookkeeping
  @auth_hash            = {}  # host => auth scheme to send with requests
  @conditional_requests = true
  @context              = nil
  @cookie_jar           = Mechanize::CookieJar.new
  @digest               = nil # last digest WWW-Authenticate challenge
  @digest_auth          = Net::HTTP::DigestAuth.new
  @follow_meta_refresh  = false
  @gzip_enabled         = true
  @history              = Mechanize::History.new
  @keep_alive_time      = 300
  @open_timeout         = nil
  @password             = nil # HTTP auth password
  @post_connect_hooks   = []
  @pre_connect_hooks    = []
  @proxy_uri            = nil
  @read_timeout         = nil
  @redirect_ok          = true
  @redirection_limit    = 20
  @request_headers      = {}
  @robots               = false
  @user                 = nil # HTTP auth user
  @user_agent           = nil
  @webrobots            = nil

  # OpenSSL settings
  @ca_file         = nil # server certificate file
  @cert            = nil # client certificate
  @key             = nil # private key
  @pass            = nil # private key password
  @verify_callback = nil

  # Any scheme without an explicit handler gets (and caches) a handler that
  # raises when it is invoked.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda do |link, page|
      raise Mechanize::UnsupportedSchemeError, scheme
    end
  end

  # The supported schemes all share one handler that returns the link as is.
  pass_through = lambda { |link, page| link }

  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end
end
124
+
125
##
# Equivalent to the browser back button.  Removes the most recent entry from
# the history and returns it.

def back
  @history.pop
end
130
+
131
##
# Returns the certificate held by the underlying HTTP connection object.

def certificate
  @http.certificate
end
134
+
135
##
# Returns the connection used to retrieve +uri+: the shared HTTP connection
# for http and https, a new Mechanize::FileConnection for file, and nil for
# any other scheme.

def connection_for(uri)
  scheme = uri.scheme.downcase

  if scheme == 'http' || scheme == 'https'
    @http
  elsif scheme == 'file'
    Mechanize::FileConnection.new
  end
end
143
+
144
##
# Returns the most recent entry in the history, i.e. the latest page loaded
# by the agent.

def current_page
  @history.last
end
150
+
151
##
# Sets the accept-encoding header of +request+.  Compressed encodings are
# only offered when gzip support is enabled on this agent.

def enable_gzip(request)
  encodings = @gzip_enabled ? 'gzip,deflate,identity' : 'identity'

  request['accept-encoding'] = encodings
end
158
+
159
##
# Performs one request for +uri+ with the given HTTP +method+ and returns
# the result.
#
# +uri+ is expected to be absolute but is still passed through #resolve with
# +referer+.  +headers+ are added to the request, +params+ become the query
# string or the request body depending on +method+, and +redirects+ counts
# the redirects already followed.
#
# Depending on the response this returns the parsed page, a previously
# visited page (304), or the result of following a meta refresh, a redirect
# or an authentication retry.  Raises Mechanize::RobotsDisallowedError when
# robots handling forbids the page and Mechanize::ResponseCodeError for any
# other response.

def fetch(uri, method = :get, headers = {}, params = [],
          referer = current_page, redirects = 0)
  referer_uri = referer ? referer.uri : nil

  uri         = resolve uri, referer
  uri, params = resolve_parameters uri, method, params

  request    = http_request uri, method, params
  connection = connection_for uri

  # Decorate the request.  Caller-supplied headers are applied last.
  request_auth request, uri
  enable_gzip request
  request_language_charset request
  request_cookies request, uri
  request_host request, uri
  request_referer request, uri, referer_uri
  request_user_agent request
  request_add_headers request, headers

  pre_connect request

  # Consult robots.txt
  if robots && uri.is_a?(URI::HTTP) && !robots_allowed?(uri)
    raise Mechanize::RobotsDisallowedError.new(uri)
  end

  # Add If-Modified-Since if the page is in the history
  if @conditional_requests
    cached = visited_page(uri)

    if cached && cached.response['Last-Modified']
      request['If-Modified-Since'] = cached.response['Last-Modified']
    end
  end

  # Specify timeouts if given
  connection.open_timeout = @open_timeout if @open_timeout
  connection.read_timeout = @read_timeout if @read_timeout

  request_log request

  response_body_io = nil

  # Send the request; the body is read inside the block
  response = connection.request(uri, request) do |res|
    response_log res

    response_body_io = response_read res, request

    res
  end

  response_body = response_content_encoding response, response_body_io

  post_connect uri, response, response_body

  page = response_parse response, response_body, uri

  response_cookies response, uri, page

  meta = response_follow_meta_refresh response, uri, page, redirects
  return meta if meta

  # Net::HTTPNotModified must be tested before Net::HTTPRedirection
  case response
  when Net::HTTPSuccess
    if robots && page.is_a?(Mechanize::Page) && page.parser.noindex?
      raise Mechanize::RobotsDisallowedError.new(uri)
    end

    page
  when Mechanize::FileResponse
    page
  when Net::HTTPNotModified
    log.debug("Got cached page") if log
    visited_page(uri) || page
  when Net::HTTPRedirection
    response_redirect response, method, page, redirects
  when Net::HTTPUnauthorized
    response_authenticate(response, page, uri, request, headers, params,
                          referer)
  else
    raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
  end
end
244
+
245
##
# Returns the maximum size of the history.

def max_history
  @history.max_size
end
248
+
249
##
# Sets the maximum size of the history to +length+.

def max_history=(length)
  @history.max_size = length
end
252
+
253
##
# Builds the request object for +uri+.
#
# For http and https the Net::HTTP request class is looked up from +method+
# (:get gives Net::HTTP::Get) and, when +params+ is given, its first element
# becomes the request body.  For file a Mechanize::FileRequest is returned.
# Any other scheme yields nil.

def http_request(uri, method, params = nil)
  scheme = uri.scheme.downcase

  return Mechanize::FileRequest.new(uri) if scheme == 'file'
  return unless scheme == 'http' || scheme == 'https'

  request_class = Net::HTTP.const_get(method.to_s.capitalize)

  request = request_class.new(uri.request_uri)
  request.body = params.first if params

  request
end
266
+
267
##
# Returns the logger configured on Mechanize (whatever Mechanize.log
# returns).

def log
  Mechanize.log
end
270
+
271
##
# Invokes every hook in post_connect_hooks after a +response+ has been
# returned and its +body+ handled.
#
# Each hook is called with this agent, the +uri+ of the request, the
# +response+ and the response +body+.

def post_connect(uri, response, body) # :yields: agent, uri, response, body
  @post_connect_hooks.each { |hook| hook.call(self, uri, response, body) }
end
283
+
284
##
# Invokes every hook in pre_connect_hooks before a +request+ is made.  Each
# hook is called with this agent and the +request+ about to be performed.

def pre_connect(request) # :yields: agent, request
  @pre_connect_hooks.each { |hook| hook.call(self, request) }
end
293
+
294
##
# Adds credentials to +request+ when an authentication scheme has already
# been recorded for the host of +uri+.
#
# Basic auth is set directly on the request.  For digest auth the user and
# password are written onto +uri+ (which is modified) and the Authorization
# header is generated from the stored digest challenge.

def request_auth(request, uri)
  scheme = @auth_hash[uri.host]

  if scheme == :basic
    request.basic_auth @user, @password
  elsif scheme == :digest || scheme == :iis_digest
    uri.user     = @user
    uri.password = @password

    iis    = scheme == :iis_digest
    header = @digest_auth.auth_header uri, @digest, request.method, iis

    request['Authorization'] = header
  end
end
313
+
314
##
# Adds a Cookie header to +request+ containing the cookie jar's cookies for
# +uri+, joined with "; ".  Nothing is added when there are none.

def request_cookies(request, uri)
  return if @cookie_jar.empty?(uri)

  jar_cookies = @cookie_jar.cookies(uri)

  request.add_field('Cookie', jar_cookies.join('; ')) unless jar_cookies.empty?
end
323
+
324
##
# Sets the Host header of +request+ from +uri+.
#
# The port is appended only when it differs from the default port of the
# URI's own scheme, so "http://example.com:443/" keeps ":443" while
# "https://example.com/" does not get a port.

def request_host(request, uri)
  # to_i makes URIs without any port (nil port and nil default) compare
  # equal, so no port is appended for them.
  port = uri.port.to_i == uri.default_port.to_i ? nil : uri.port
  host = uri.host

  request['Host'] = [host, port].compact.join ':'
end
330
+
331
##
# Sets fixed accept-charset and accept-language headers on +request+.

def request_language_charset(request)
  request['accept-charset']  = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
  request['accept-language'] = 'en-us,en;q=0.5'
end
335
+
336
##
# Logs the class and path of +request+ at info level and each of its headers
# at debug level.  Does nothing when no logger is configured.

def request_log(request)
  logger = log
  return unless logger

  logger.info("#{request.class}: #{request.path}")

  request.each_header do |name, value|
    logger.debug("request-header: #{name} => #{value}")
  end
end
346
+
347
##
# Copies the agent-wide request headers and then the per-request +headers+
# onto +request+.
#
# In +headers+ the symbols :etag and :if_modified_since map to the ETag and
# If-Modified-Since fields; any other symbol raises ArgumentError.  Other
# keys are used as the field name unchanged.

def request_add_headers(request, headers = {})
  @request_headers.each { |name, value| request[name] = value }

  headers.each do |field, value|
    if field == :etag
      request["ETag"] = value
    elsif field == :if_modified_since
      request["If-Modified-Since"] = value
    elsif field.is_a?(Symbol)
      raise ArgumentError, "unknown header symbol #{field}"
    else
      request[field] = value
    end
  end
end
363
+
364
##
# Sets the Referer header of +request+ to +referer+.  The header is omitted
# when there is no referer, or when the referer is https and +uri+ is not.

def request_referer(request, uri, referer)
  return unless referer

  leaving_https = referer.scheme.downcase == 'https' &&
                  uri.scheme.downcase != 'https'
  return if leaving_https

  request['Referer'] = referer
end
371
+
372
##
# Sets the User-Agent header of +request+ when a user agent is configured.

def request_user_agent(request)
  return unless @user_agent

  request['User-Agent'] = @user_agent
end
375
+
376
##
# Turns +uri+ (a URI or anything responding to to_s) into an absolute URI,
# using +referer+ to resolve relative references.
#
# A URI argument is duplicated, so the caller's object is not modified.
# Other input is stripped, escaped and parsed.  The handler registered for
# the URI's scheme ('relative' for relative references) is then called with
# the URI and +referer+ and its result is used from there on.
#
# Relative references are merged onto the last base of +referer+ when it has
# one with an absolute URI, otherwise onto the referer's own URI.
#
# Raises ArgumentError when a relative reference cannot be resolved because
# there is no referer URI, or when the resulting scheme is not http, https
# or file.

def resolve(uri, referer = current_page)
  uri = uri.dup if uri.is_a?(URI)

  unless uri.is_a?(URI)
    # Percent-escape everything outside the 0-126 character range
    uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
      if RUBY_VERSION >= "1.9.0"
        Mechanize::Util.uri_escape(match)
      else
        sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
      end
    }

    # Split into the parts that still need escaping and the separators
    # (existing percent-escapes and '#') that must be kept as they are.
    unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
    escaped   = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)

    escaped_uri = Mechanize::Util.html_unescape(
      unescaped.zip(escaped).map { |x,y|
        "#{WEBrick::HTTPUtils.escape(x)}#{y}"
      }.join('')
    )

    begin
      uri = URI.parse(escaped_uri)
    rescue
      # Retry with the whole string escaped once more
      uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
    end
  end

  scheme = uri.relative? ? 'relative' : uri.scheme.downcase
  uri = @scheme_handlers[scheme].call(uri, referer)

  # A reference without a path (such as "?query") points at the referer
  if referer && referer.uri
    if uri.path.length == 0 && uri.relative?
      uri.path = referer.uri.path
    end
  end

  uri.path = '/' if uri.path.length == 0

  if uri.relative?
    raise ArgumentError, "absolute URL needed (not #{uri})" unless
      referer && referer.uri

    base = nil
    if referer.respond_to?(:bases) && referer.parser
      base = referer.bases.last
    end

    # One merge is enough: the result is absolute, so merging it onto the
    # referer URI a second time would return it unchanged.
    uri = ((base && base.uri && base.uri.absolute?) ?
            base.uri :
            referer.uri) + uri

    # Strip initial "/.." bits from the path
    uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
    raise ArgumentError, "unsupported scheme: #{uri.scheme}"
  end

  uri
end
438
+
439
##
# Decides where +parameters+ travel for the given +method+.
#
# For :head, :get, :delete and :trace non-empty parameters are appended to
# the query string of +uri+ (which is modified in place) and nil is returned
# in their place.  For every other method +uri+ and +parameters+ are
# returned untouched.

def resolve_parameters(uri, method, parameters)
  query_methods = [:head, :get, :delete, :trace]

  return uri, parameters unless query_methods.include?(method)

  if parameters and parameters.length > 0
    uri.query ||= ''
    uri.query << '&' if uri.query.length > 0
    uri.query << Mechanize::Util.build_query_string(parameters)
  end

  return uri, nil
end
453
+
454
##
# Decodes the body in +body_io+ according to the Content-Encoding header of
# +response+ and returns it as a String.
#
# Unencoded bodies are returned as they are.  For deflate and gzip a failed
# decode is retried as raw deflate; when that fails too the error is logged
# and an empty String is returned.  A compressed body of length zero yields
# nil.  Raises Mechanize::Error for any other Content-Encoding.

def response_content_encoding response, body_io
  length = response.content_length || body_io.length

  case response['Content-Encoding']
  when nil, 'none', '7bit' then
    body_io.string
  when 'deflate' then
    log.debug('deflate body') if log

    return if length.zero?

    begin
      Zlib::Inflate.inflate body_io.string
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      begin
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.string)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to inflate page: #{$!}") if log
        ''
      end
    end
  when 'gzip', 'x-gzip', 'agzip' then
    log.debug('gzip body') if log

    return if length.zero?

    begin
      zio = Zlib::GzipReader.new body_io
      zio.read
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log

      # The fallback needs its own rescue: an exception raised inside a
      # rescue clause is not handled by the sibling clauses below.
      begin
        body_io.rewind
        body_io.read 10 # skip the first 10 bytes before inflating
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.read)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to gunzip page: #{$!}") if log
        ''
      end
    rescue Zlib::DataError
      log.error("unable to gunzip page: #{$!}") if log
      ''
    ensure
      zio.close if zio and not zio.closed?
    end
  else
    raise Mechanize::Error,
          "Unsupported Content-Encoding: #{response['Content-Encoding']}"
  end
end
500
+
501
##
# Stores the cookies carried by a response in the cookie jar.
#
# Cookies are taken from Set-Cookie meta elements in the head of +page+
# (only when it is a Mechanize::Page whose body mentions Set-Cookie) and
# from the Set-Cookie headers of +response+.  +uri+ is the URI the cookies
# are parsed and stored against.

def response_cookies(response, uri, page)
  if Mechanize::Page === page and page.body =~ /Set-Cookie/n
    meta_cookies = page.search('//head/meta[@http-equiv="Set-Cookie"]')

    meta_cookies.each do |meta|
      Mechanize::Cookie.parse(uri, meta['content']) do |c|
        log.debug("saved cookie: #{c}") if log
        @cookie_jar.add(uri, c)
      end
    end
  end

  header_cookies = response.get_fields 'Set-Cookie'
  return unless header_cookies

  header_cookies.each do |cookie|
    Mechanize::Cookie.parse(uri, cookie) do |c|
      log.debug("saved cookie: #{c}") if log
      @cookie_jar.add(uri, c)
    end
  end
end
522
+
523
##
# Follows a refresh found either in +page+ (its first meta refresh) or in
# the refresh header of +response+, when following meta refreshes is
# enabled.
#
# Returns nil when there is nothing to follow, otherwise the result of
# fetching the refresh target after sleeping for the requested delay.
#
# Raises Mechanize::RedirectLimitReachedError when following the refresh
# would exceed the redirection limit (+redirects+ is the number followed so
# far) and Mechanize::Error for an unparsable refresh header.

def response_follow_meta_refresh response, uri, page, redirects
  return unless @follow_meta_refresh

  redirect_uri = nil
  referer      = page

  if page.respond_to?(:meta_refresh) and (redirect = page.meta_refresh.first)
    # Enforce the limit here as well, otherwise a page that refreshes to
    # itself would be fetched without bound.
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit

    redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
    sleep redirect.node['delay'].to_f
    # The refresh target is fetched with a blank page as referer
    referer = Mechanize::Page.new(nil, {'content-type'=>'text/html'})
  elsif refresh = response['refresh']
    delay, redirect_uri = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    sleep delay.to_f
  end

  if redirect_uri
    @history.push(page, page.uri)
    fetch redirect_uri, :get, {}, [], referer, redirects + 1
  end
end
546
+
547
##
# Logs the status line of +response+ at info level and each of its headers
# at debug level.  Does nothing when no logger is configured.

def response_log(response)
  logger = log
  return unless logger

  logger.info("status: #{response.class} #{response.http_version} " \
              "#{response.code} #{response.message}")

  response.each_header do |name, value|
    logger.debug("response-header: #{name} => #{value}")
  end
end
557
+
558
##
# Hands the +uri+, +response+ and decoded +body+ to the context's parse
# method and returns whatever it returns.

def response_parse(response, body, uri)
  @context.parse(uri, response, body)
end
561
+
562
##
# Reads the body of +response+ into a rewound StringIO and returns it.
#
# Raises Mechanize::ResponseReadError (carrying what was read so far) when
# the connection fails while reading, Mechanize::ResponseCodeError for an
# unknown response class, and EOFError when the Content-Length header does
# not match the number of bytes read.  The length check is skipped for HEAD
# requests and for redirect responses.

def response_read(response, request)
  body_io = StringIO.new
  body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
  bytes_read = 0

  begin
    response.read_body do |chunk|
      bytes_read += chunk.length
      body_io.write(chunk)
      log.debug("Read #{chunk.length} bytes (#{bytes_read} total)") if log
    end
  rescue Net::HTTP::Persistent::Error => e
    body_io.rewind
    raise Mechanize::ResponseReadError.new(e, response, body_io)
  end

  body_io.rewind

  if Net::HTTPUnknownResponse === response
    raise Mechanize::ResponseCodeError, response
  end

  content_length = response.content_length

  length_unchecked = Net::HTTP::Head === request ||
                     Net::HTTPRedirection === response

  if !length_unchecked && content_length && content_length != body_io.length
    raise EOFError, "Content-Length (#{content_length}) does not match " \
                    "response body length (#{body_io.length})"
  end

  body_io
end
593
+
594
##
# Handles a redirect +response+ according to redirect_ok.
#
# Returns +page+ unchanged when redirects are disabled, or when only
# permanent redirects are allowed and +response+ is not a 301.  Otherwise
# the Location header is fetched (with HEAD if the original +method+ was
# :head, GET otherwise), the new page is pushed onto the history and
# returned.
#
# Raises Mechanize::RedirectLimitReachedError when following the redirect
# would exceed the redirection limit; +redirects+ is the number of redirects
# followed so far.

def response_redirect response, method, page, redirects
  case @redirect_ok
  when true, :all
    # shortcut
  when false, nil
    return page
  when :permanent
    return page unless response.is_a?(Net::HTTPMovedPermanently)
  end

  log.info("follow redirect to: #{response['Location']}") if log

  from_uri = page.uri

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    redirects + 1 > @redirection_limit

  redirect_method = method == :head ? :head : :get

  page = fetch(response['Location'].to_s, redirect_method, {}, [], page,
               redirects + 1)

  @history.push(page, from_uri)

  return page
end
620
+
621
##
# Responds to an unauthorized +response+ by recording which authentication
# scheme the host of +uri+ wants and repeating the request.
#
# The scheme is :digest when the WWW-Authenticate header mentions Digest
# (:iis_digest when the server header also mentions Microsoft-IIS), in which
# case the challenge is stored, and :basic otherwise.
#
# Raises Mechanize::ResponseCodeError when neither user nor password is set,
# or when a scheme was already recorded for this host.

def response_authenticate(response, page, uri, request, headers, params,
                          referer)
  raise Mechanize::ResponseCodeError, page unless @user || @password
  raise Mechanize::ResponseCodeError, page if @auth_hash.has_key?(uri.host)

  challenge = response['www-authenticate']

  if challenge =~ /Digest/i
    iis = response['server'] =~ /Microsoft-IIS/

    @auth_hash[uri.host] = iis ? :iis_digest : :digest
    @digest = challenge
  else
    @auth_hash[uri.host] = :basic
  end

  fetch uri, request.method.downcase.to_sym, headers, params, referer
end
638
+
639
##
# Turns robots.txt handling on or off.  The webrobots library is loaded when
# a true +value+ is given, and the cached WebRobots instance is dropped
# whenever the setting changes.

def robots=(value)
  require 'webrobots' if value

  @webrobots = nil unless value == @robots
  @robots = value
end
644
+
645
##
# Tests if this agent is allowed to access +uri+, consulting the site's
# robots.txt.  The robots.txt file itself is always allowed.

def robots_allowed?(uri)
  uri.request_uri == '/robots.txt' || webrobots.allowed?(uri)
end
654
+
655
##
# Opposite of robots_allowed?

def robots_disallowed?(url)
  allowed = robots_allowed?(url)

  !allowed
end
660
+
661
##
# Returns an error object if there is an error in fetching or parsing
# robots.txt of the site +url+.

def robots_error(url)
  webrobots.error url
end
666
+
667
##
# Raises the error if there is an error in fetching or parsing robots.txt of
# the site +url+.

def robots_error!(url)
  webrobots.error! url
end
672
+
673
##
# Removes robots.txt cache for the site +url+.

def robots_reset(url)
  webrobots.reset url
end
677
+
678
##
# Creates the persistent HTTP connection used by this agent and applies the
# proxy, keep-alive and SSL settings to it.
#
# A client certificate and private key are only installed when both are
# set.  Each may be given as an OpenSSL object or as the path of a file to
# read; a key read from a file is loaded as an RSA key using the configured
# password.

def set_http
  @http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri

  @http.keep_alive      = @keep_alive_time
  @http.ca_file         = @ca_file
  @http.verify_callback = @verify_callback

  return unless @cert and @key

  client_cert = @cert
  unless OpenSSL::X509::Certificate === client_cert
    client_cert = OpenSSL::X509::Certificate.new(::File.read(client_cert))
  end

  private_key = @key
  unless OpenSSL::PKey::PKey === private_key
    private_key = OpenSSL::PKey::RSA.new(::File.read(private_key), @pass)
  end

  @http.certificate = client_cert
  @http.private_key = private_key
end
703
+
704
##
# Sets the proxy address, port, user, and password.  +addr+ should be a
# host, with no "http://".
#
# Returns the proxy URI, or nil (leaving the current proxy untouched) when
# +addr+ or +port+ is missing.

def set_proxy(addr, port, user = nil, pass = nil)
  return unless addr and port

  @proxy_uri = URI("http://#{addr}")

  @proxy_uri.port     = port
  @proxy_uri.user     = user if user
  @proxy_uri.password = pass if pass

  @proxy_uri
end
715
+
716
##
# Sets the User-Agent string.  The cached WebRobots instance is dropped
# whenever the value changes.

def user_agent=(user_agent)
  @webrobots = nil unless user_agent == @user_agent
  @user_agent = user_agent
end
720
+
721
##
# Returns a visited page for the +url+ passed in, otherwise nil.  The url is
# resolved before the history is consulted.

def visited_page(url)
  resolved = resolve(url)

  @history.visited_page(resolved)
end
725
+
726
##
# Fetches +uri+ and returns the body.  A 404 response yields an empty
# String; any other response code error is raised again.

def get_robots(uri) # :nodoc:
  begin
    fetch(uri).body
  rescue Mechanize::ResponseCodeError => e
    raise e unless e.response_code == '404'

    ''
  end
end
732
+
733
##
# Returns the WebRobots instance for this agent, creating it on first use
# with the current user agent and #get_robots as its fetcher.

def webrobots
  return @webrobots if @webrobots

  @webrobots = WebRobots.new(@user_agent, :http_get => method(:get_robots))
end
736
+
737
+ end
738
+