diamond-mechanize 2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (154) hide show
  1. data/CHANGELOG.rdoc +718 -0
  2. data/EXAMPLES.rdoc +187 -0
  3. data/FAQ.rdoc +11 -0
  4. data/GUIDE.rdoc +163 -0
  5. data/LICENSE.rdoc +20 -0
  6. data/Manifest.txt +159 -0
  7. data/README.rdoc +64 -0
  8. data/Rakefile +49 -0
  9. data/lib/mechanize.rb +1079 -0
  10. data/lib/mechanize/content_type_error.rb +13 -0
  11. data/lib/mechanize/cookie.rb +232 -0
  12. data/lib/mechanize/cookie_jar.rb +194 -0
  13. data/lib/mechanize/download.rb +59 -0
  14. data/lib/mechanize/element_matcher.rb +36 -0
  15. data/lib/mechanize/file.rb +65 -0
  16. data/lib/mechanize/file_connection.rb +17 -0
  17. data/lib/mechanize/file_request.rb +26 -0
  18. data/lib/mechanize/file_response.rb +74 -0
  19. data/lib/mechanize/file_saver.rb +39 -0
  20. data/lib/mechanize/form.rb +543 -0
  21. data/lib/mechanize/form/button.rb +6 -0
  22. data/lib/mechanize/form/check_box.rb +12 -0
  23. data/lib/mechanize/form/field.rb +54 -0
  24. data/lib/mechanize/form/file_upload.rb +21 -0
  25. data/lib/mechanize/form/hidden.rb +3 -0
  26. data/lib/mechanize/form/image_button.rb +19 -0
  27. data/lib/mechanize/form/keygen.rb +34 -0
  28. data/lib/mechanize/form/multi_select_list.rb +94 -0
  29. data/lib/mechanize/form/option.rb +50 -0
  30. data/lib/mechanize/form/radio_button.rb +55 -0
  31. data/lib/mechanize/form/reset.rb +3 -0
  32. data/lib/mechanize/form/select_list.rb +44 -0
  33. data/lib/mechanize/form/submit.rb +3 -0
  34. data/lib/mechanize/form/text.rb +3 -0
  35. data/lib/mechanize/form/textarea.rb +3 -0
  36. data/lib/mechanize/headers.rb +23 -0
  37. data/lib/mechanize/history.rb +82 -0
  38. data/lib/mechanize/http.rb +8 -0
  39. data/lib/mechanize/http/agent.rb +1004 -0
  40. data/lib/mechanize/http/auth_challenge.rb +59 -0
  41. data/lib/mechanize/http/auth_realm.rb +31 -0
  42. data/lib/mechanize/http/content_disposition_parser.rb +188 -0
  43. data/lib/mechanize/http/www_authenticate_parser.rb +155 -0
  44. data/lib/mechanize/monkey_patch.rb +16 -0
  45. data/lib/mechanize/page.rb +440 -0
  46. data/lib/mechanize/page/base.rb +7 -0
  47. data/lib/mechanize/page/frame.rb +27 -0
  48. data/lib/mechanize/page/image.rb +30 -0
  49. data/lib/mechanize/page/label.rb +20 -0
  50. data/lib/mechanize/page/link.rb +98 -0
  51. data/lib/mechanize/page/meta_refresh.rb +68 -0
  52. data/lib/mechanize/parser.rb +173 -0
  53. data/lib/mechanize/pluggable_parsers.rb +144 -0
  54. data/lib/mechanize/redirect_limit_reached_error.rb +19 -0
  55. data/lib/mechanize/redirect_not_get_or_head_error.rb +21 -0
  56. data/lib/mechanize/response_code_error.rb +21 -0
  57. data/lib/mechanize/response_read_error.rb +27 -0
  58. data/lib/mechanize/robots_disallowed_error.rb +28 -0
  59. data/lib/mechanize/test_case.rb +663 -0
  60. data/lib/mechanize/unauthorized_error.rb +3 -0
  61. data/lib/mechanize/unsupported_scheme_error.rb +6 -0
  62. data/lib/mechanize/util.rb +101 -0
  63. data/test/data/htpasswd +1 -0
  64. data/test/data/server.crt +16 -0
  65. data/test/data/server.csr +12 -0
  66. data/test/data/server.key +15 -0
  67. data/test/data/server.pem +15 -0
  68. data/test/htdocs/alt_text.html +10 -0
  69. data/test/htdocs/bad_form_test.html +9 -0
  70. data/test/htdocs/button.jpg +0 -0
  71. data/test/htdocs/canonical_uri.html +9 -0
  72. data/test/htdocs/dir with spaces/foo.html +1 -0
  73. data/test/htdocs/empty_form.html +6 -0
  74. data/test/htdocs/file_upload.html +26 -0
  75. data/test/htdocs/find_link.html +41 -0
  76. data/test/htdocs/form_multi_select.html +16 -0
  77. data/test/htdocs/form_multival.html +37 -0
  78. data/test/htdocs/form_no_action.html +18 -0
  79. data/test/htdocs/form_no_input_name.html +16 -0
  80. data/test/htdocs/form_order_test.html +11 -0
  81. data/test/htdocs/form_select.html +16 -0
  82. data/test/htdocs/form_set_fields.html +14 -0
  83. data/test/htdocs/form_test.html +188 -0
  84. data/test/htdocs/frame_referer_test.html +10 -0
  85. data/test/htdocs/frame_test.html +30 -0
  86. data/test/htdocs/google.html +13 -0
  87. data/test/htdocs/index.html +6 -0
  88. data/test/htdocs/link with space.html +5 -0
  89. data/test/htdocs/meta_cookie.html +11 -0
  90. data/test/htdocs/no_title_test.html +6 -0
  91. data/test/htdocs/noindex.html +9 -0
  92. data/test/htdocs/rails_3_encoding_hack_form_test.html +27 -0
  93. data/test/htdocs/relative/tc_relative_links.html +21 -0
  94. data/test/htdocs/robots.html +8 -0
  95. data/test/htdocs/robots.txt +2 -0
  96. data/test/htdocs/tc_bad_charset.html +9 -0
  97. data/test/htdocs/tc_bad_links.html +5 -0
  98. data/test/htdocs/tc_base_link.html +8 -0
  99. data/test/htdocs/tc_blank_form.html +11 -0
  100. data/test/htdocs/tc_charset.html +6 -0
  101. data/test/htdocs/tc_checkboxes.html +19 -0
  102. data/test/htdocs/tc_encoded_links.html +5 -0
  103. data/test/htdocs/tc_field_precedence.html +11 -0
  104. data/test/htdocs/tc_follow_meta.html +8 -0
  105. data/test/htdocs/tc_form_action.html +48 -0
  106. data/test/htdocs/tc_links.html +19 -0
  107. data/test/htdocs/tc_meta_in_body.html +9 -0
  108. data/test/htdocs/tc_pretty_print.html +17 -0
  109. data/test/htdocs/tc_referer.html +16 -0
  110. data/test/htdocs/tc_relative_links.html +19 -0
  111. data/test/htdocs/tc_textarea.html +23 -0
  112. data/test/htdocs/test_click.html +11 -0
  113. data/test/htdocs/unusual______.html +5 -0
  114. data/test/test_mechanize.rb +1164 -0
  115. data/test/test_mechanize_cookie.rb +451 -0
  116. data/test/test_mechanize_cookie_jar.rb +483 -0
  117. data/test/test_mechanize_download.rb +43 -0
  118. data/test/test_mechanize_file.rb +61 -0
  119. data/test/test_mechanize_file_connection.rb +21 -0
  120. data/test/test_mechanize_file_request.rb +19 -0
  121. data/test/test_mechanize_file_saver.rb +21 -0
  122. data/test/test_mechanize_form.rb +875 -0
  123. data/test/test_mechanize_form_check_box.rb +38 -0
  124. data/test/test_mechanize_form_encoding.rb +114 -0
  125. data/test/test_mechanize_form_field.rb +63 -0
  126. data/test/test_mechanize_form_file_upload.rb +20 -0
  127. data/test/test_mechanize_form_image_button.rb +12 -0
  128. data/test/test_mechanize_form_keygen.rb +32 -0
  129. data/test/test_mechanize_form_multi_select_list.rb +84 -0
  130. data/test/test_mechanize_form_option.rb +55 -0
  131. data/test/test_mechanize_form_radio_button.rb +78 -0
  132. data/test/test_mechanize_form_select_list.rb +76 -0
  133. data/test/test_mechanize_form_textarea.rb +52 -0
  134. data/test/test_mechanize_headers.rb +35 -0
  135. data/test/test_mechanize_history.rb +103 -0
  136. data/test/test_mechanize_http_agent.rb +1225 -0
  137. data/test/test_mechanize_http_auth_challenge.rb +39 -0
  138. data/test/test_mechanize_http_auth_realm.rb +49 -0
  139. data/test/test_mechanize_http_content_disposition_parser.rb +118 -0
  140. data/test/test_mechanize_http_www_authenticate_parser.rb +146 -0
  141. data/test/test_mechanize_link.rb +80 -0
  142. data/test/test_mechanize_page.rb +118 -0
  143. data/test/test_mechanize_page_encoding.rb +182 -0
  144. data/test/test_mechanize_page_frame.rb +16 -0
  145. data/test/test_mechanize_page_link.rb +390 -0
  146. data/test/test_mechanize_page_meta_refresh.rb +127 -0
  147. data/test/test_mechanize_parser.rb +289 -0
  148. data/test/test_mechanize_pluggable_parser.rb +52 -0
  149. data/test/test_mechanize_redirect_limit_reached_error.rb +24 -0
  150. data/test/test_mechanize_redirect_not_get_or_head_error.rb +14 -0
  151. data/test/test_mechanize_subclass.rb +22 -0
  152. data/test/test_mechanize_util.rb +103 -0
  153. data/test/test_multi_select.rb +119 -0
  154. metadata +216 -0
@@ -0,0 +1,82 @@
1
+ ##
2
+ # This class manages history for your mechanize object.
3
+
4
class Mechanize::History < Array

  # Maximum number of pages kept in the history. When nil, #push never
  # discards old entries.
  attr_accessor :max_size

  # Creates an empty history holding at most +max_size+ pages.
  def initialize(max_size = nil)
    @max_size      = max_size
    @history_index = {} # URI string => page, used by #visited?
  end

  # Gives a duplicated history its own copy of the URI index so the two
  # objects do not share the lookup hash.
  def initialize_copy(orig)
    super
    @history_index = orig.instance_variable_get(:@history_index).dup
  end

  def inspect # :nodoc:
    uris = map { |page| page.uri }.join ', '

    "[#{uris}]"
  end

  # Appends +page+ and indexes it under +uri+ (or under page.uri when no
  # +uri+ is given). Oldest pages are shifted out while the history is longer
  # than max_size. Returns self.
  def push(page, uri = nil)
    super page

    index = uri ? uri : page.uri
    @history_index[index.to_s] = page

    shift while length > @max_size if @max_size

    self
  end

  alias :<< :push

  # Returns the page indexed under +uri+, or nil. When the first lookup
  # misses, the lookup is retried on a copy of +uri+ whose empty path has
  # been replaced with '/'.
  def visited? uri
    page = @history_index[uri.to_s]

    return page if page # HACK

    uri = uri.dup
    uri.path = '/' if uri.path.empty?

    @history_index[uri.to_s]
  end

  alias visited_page visited?

  # Empties both the URI index and the history itself.
  def clear
    @history_index.clear
    super
  end

  # Removes and returns the oldest page, dropping it from the URI index.
  # Returns nil when the history is empty.
  def shift
    return nil if length == 0
    page = self[0]
    self[0] = nil

    super

    remove_from_index(page)
    page
  end

  # Removes and returns the newest page, dropping it from the URI index.
  # Returns nil when the history is empty.
  def pop
    return nil if length == 0
    page = super
    remove_from_index(page)
    page
  end

  private

  # Drops every index entry whose value equals +page+. delete_if is used so
  # the hash is never modified from inside a plain #each iteration.
  def remove_from_index(page)
    @history_index.delete_if { |_, indexed| indexed == page }
  end

end
82
+
@@ -0,0 +1,8 @@
1
+ ##
2
+ # Mechanize::HTTP contains classes for communicating with HTTP servers. All
3
+ # API under this namespace is considered private and is subject to change at
4
+ # any time.
5
+
6
+ class Mechanize::HTTP
7
+ end
8
+
@@ -0,0 +1,1004 @@
1
+ require 'tempfile'
2
+ require 'net/ntlm'
3
+ require 'kconv'
4
+ require 'webrobots'
5
+
6
+ ##
7
+ # An HTTP (and local disk access) user agent. This class is an implementation
8
+ # detail and is subject to change at any time.
9
+
10
+ class Mechanize::HTTP::Agent
11
+
12
+ # :section: Headers
13
+
14
+ # Disables If-Modified-Since conditional requests (enabled by default)
15
+ attr_accessor :conditional_requests
16
+
17
+ # Is gzip compression of requests enabled?
18
+ attr_accessor :gzip_enabled
19
+
20
+ # A hash of request headers to be used for every request
21
+ attr_accessor :request_headers
22
+
23
+ # The User-Agent header to send
24
+ attr_reader :user_agent
25
+
26
+ # :section: History
27
+
28
+ # history of requests made
29
+ attr_accessor :history
30
+
31
+ # :section: Hooks
32
+
33
+ # A list of hooks to call after retrieving a response. Hooks are called with
34
+ # the agent and the response returned.
35
+ attr_reader :post_connect_hooks
36
+
37
+ # A list of hooks to call before making a request. Hooks are called with
38
+ # the agent and the request to be performed.
39
+ attr_reader :pre_connect_hooks
40
+
41
+ # A list of hooks to call to handle the content-encoding of a request.
42
+ attr_reader :content_encoding_hooks
43
+
44
+ # :section: HTTP Authentication
45
+
46
+ attr_reader :authenticate_methods # :nodoc:
47
+ attr_reader :digest_challenges # :nodoc:
48
+ attr_accessor :user
49
+ attr_accessor :password
50
+
51
+ # :section: Redirection
52
+
53
+ # Follow HTML meta refresh and HTTP Refresh. If set to +:anywhere+ meta
54
+ # refresh tags outside of the head element will be followed.
55
+ attr_accessor :follow_meta_refresh
56
+
57
+ # Follow an HTML meta refresh that has no "url=" in the content attribute.
58
+ #
59
+ # Defaults to false to prevent infinite refresh loops.
60
+ attr_accessor :follow_meta_refresh_self
61
+
62
+ # Controls how this agent deals with redirects. The following values are
63
+ # allowed:
64
+ #
65
+ # :all, true:: All 3xx redirects are followed (default)
66
+ # :permanent:: Only 301 Moved Permanently redirects are followed
67
+ # false:: No redirects are followed
68
+ attr_accessor :redirect_ok
69
+
70
+ # Maximum number of redirects to follow
71
+ attr_accessor :redirection_limit
72
+
73
+ # :section: Robots
74
+
75
+ # When true, this agent will consult the site's robots.txt for each access.
76
+ attr_reader :robots
77
+
78
+ # :section: SSL
79
+
80
+ # Path to an OpenSSL server certificate file
81
+ attr_accessor :ca_file
82
+
83
+ # An OpenSSL private key or the path to a private key
84
+ attr_accessor :key
85
+
86
+ # An OpenSSL client certificate or the path to a certificate file.
87
+ attr_accessor :cert
88
+
89
+ # An SSL certificate store
90
+ attr_accessor :cert_store
91
+
92
+ # OpenSSL key password
93
+ attr_accessor :pass
94
+
95
+ # A callback for additional certificate verification. See
96
+ # OpenSSL::SSL::SSLContext#verify_callback
97
+ #
98
+ # The callback can be used for debugging or to ignore errors by always
99
+ # returning +true+. Specifying nil uses the default method that was valid
100
+ # when the SSLContext was created
101
+ attr_accessor :verify_callback
102
+
103
+ # How to verify SSL connections. Defaults to VERIFY_PEER
104
+ attr_accessor :verify_mode
105
+
106
+ # :section: Timeouts
107
+
108
+ # Reset connections that have not been used in this many seconds
109
+ attr_reader :idle_timeout
110
+
111
+ # Set to false to disable HTTP/1.1 keep-alive requests
112
+ attr_accessor :keep_alive
113
+
114
+ # Length of time to wait until a connection is opened in seconds
115
+ attr_accessor :open_timeout
116
+
117
+ # Length of time to attempt to read data from the server
118
+ attr_accessor :read_timeout
119
+
120
+ # :section:
121
+
122
+ # The cookies for this agent
123
+ attr_accessor :cookie_jar
124
+
125
+ # URI for a proxy connection
126
+ attr_reader :proxy_uri
127
+
128
+ # Retry non-idempotent requests?
129
+ attr_reader :retry_change_requests
130
+
131
+ # Responses larger than this will be written to a Tempfile instead of stored
132
+ # in memory.
133
+ attr_accessor :max_file_buffer
134
+
135
+ # :section: Utility
136
+
137
+ # The context parses responses into pages
138
+ attr_accessor :context
139
+
140
+ attr_reader :http # :nodoc:
141
+
142
+ # Handlers for various URI schemes
143
+ attr_accessor :scheme_handlers
144
+
145
+ # :section:
146
+
147
+ # Creates a new Mechanize HTTP user agent. The user agent is an
148
+ # implementation detail of mechanize and its API may change at any time.
149
+
150
def initialize
  # Request behaviour and headers
  @conditional_requests  = true
  @gzip_enabled          = true
  @keep_alive            = true
  @keep_alive_time       = 300
  @request_headers       = {}
  @retry_change_requests = false
  @user_agent            = nil

  # Collaborators and state
  @context    = nil
  @cookie_jar = Mechanize::CookieJar.new
  @history    = Mechanize::History.new
  @proxy_uri  = nil
  @robots     = false
  @webrobots  = nil

  # Hooks
  @content_encoding_hooks = []
  @post_connect_hooks     = []
  @pre_connect_hooks      = []

  # Redirection
  @follow_meta_refresh      = false
  @follow_meta_refresh_self = false
  @redirect_ok              = true
  @redirection_limit        = 20

  # Timeouts and buffering
  @idle_timeout    = nil
  @open_timeout    = nil
  @read_timeout    = nil
  @max_file_buffer = 10240

  # HTTP Authentication
  @authenticate_parser = Mechanize::HTTP::WWWAuthenticateParser.new
  # uri => { auth scheme => [realms] }; both levels are created on first
  # access.
  @authenticate_methods = Hash.new { |methods, uri|
    methods[uri] = Hash.new { |realms, auth_scheme|
      realms[auth_scheme] = []
    }
  }
  @digest_auth       = Net::HTTP::DigestAuth.new
  @digest_challenges = {}
  @password          = nil # HTTP auth password
  @user              = nil # HTTP auth user

  # SSL
  @ca_file         = nil
  @cert            = nil
  @cert_store      = nil
  @key             = nil
  @pass            = nil
  @verify_callback = nil
  @verify_mode     = nil

  # Any scheme without an explicit handler gets one that raises
  # UnsupportedSchemeError when called.
  @scheme_handlers = Hash.new do |h, scheme|
    h[scheme] = lambda do |link, page|
      raise Mechanize::UnsupportedSchemeError, scheme
    end
  end

  # The supported schemes share a single handler that returns the link as-is.
  passthrough = lambda { |link, page| link }
  ['http', 'https', 'relative', 'file'].each do |scheme|
    @scheme_handlers[scheme] = passthrough
  end
end
208
+
209
+ # Retrieves +uri+ and parses it into a page or other object according to
210
+ # PluggableParser. If the URI is an HTTP or HTTPS scheme URI the given HTTP
211
+ # +method+ is used to retrieve it, along with the HTTP +headers+, request
212
+ # +params+ and HTTP +referer+.
213
+ #
214
+ # +redirects+ tracks the number of redirects experienced when retrieving the
215
+ # page. If it is over the redirection_limit an error will be raised.
216
+
217
def fetch uri, method = :get, headers = {}, params = [],
          referer = current_page, redirects = 0
  referer_uri = referer ? referer.uri : nil

  uri = resolve uri, referer

  uri, params = resolve_parameters uri, method, params

  request = http_request uri, method, params

  connection = connection_for uri

  request_auth request, uri

  disable_keep_alive request
  enable_gzip request

  request_language_charset request
  request_cookies request, uri
  request_host request, uri
  request_referer request, uri, referer_uri
  request_user_agent request
  request_add_headers request, headers

  pre_connect request

  # Consult robots.txt
  if robots && uri.is_a?(URI::HTTP)
    robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
  end

  # Add If-Modified-Since if page is in history. The history is consulted
  # once, and only when conditional requests are enabled.
  if @conditional_requests
    cached = visited_page(uri)
    last_modified = cached && cached.response['Last-Modified']

    request['If-Modified-Since'] = last_modified if last_modified
  end

  # Specify timeouts if given
  connection.open_timeout = @open_timeout if @open_timeout
  connection.read_timeout = @read_timeout if @read_timeout

  request_log request

  response_body_io = nil

  # Send the request; the body is read inside the block and kept in
  # response_body_io.
  response = connection.request(uri, request) { |res|
    response_log res

    response_body_io = response_read res, request

    res
  }

  hook_content_encoding response, uri, response_body_io

  response_body_io = response_content_encoding response, response_body_io

  post_connect uri, response, response_body_io

  page = response_parse response, response_body_io, uri

  response_cookies response, uri, page

  meta = response_follow_meta_refresh response, uri, page, redirects
  return meta if meta

  case response
  when Net::HTTPSuccess
    if robots && page.is_a?(Mechanize::Page)
      page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
    end

    page
  when Mechanize::FileResponse
    page
  when Net::HTTPNotModified
    log.debug("Got cached page") if log
    visited_page(uri) || page
  when Net::HTTPRedirection
    response_redirect response, method, page, redirects, referer
  when Net::HTTPUnauthorized
    response_authenticate(response, page, uri, request, headers, params,
                          referer)
  else
    raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
  end
end
306
+
307
+ # Retry non-idempotent requests
308
+
309
def retry_change_requests=(retri)
  @retry_change_requests = retri

  # The setting is forwarded only when an HTTP connection object exists.
  return unless @http

  @http.retry_change_requests = retri
end
313
+
314
+ # :section: Headers
315
+
316
# Sets the User-Agent header value. @webrobots is reset to nil when the
# value actually changes.
def user_agent=(user_agent)
  changed = user_agent != @user_agent
  @webrobots = nil if changed
  @user_agent = user_agent
end
320
+
321
+ # :section: History
322
+
323
+ # Equivalent to the browser back button. Returns the most recent page
324
+ # visited.
325
def back()
  # Removes the newest history entry and hands it back to the caller.
  @history.pop
end
328
+
329
+ ##
330
+ # Returns the latest page loaded by the agent
331
+
332
def current_page()
  # The newest history entry, or nil when the history is empty.
  @history.last
end
335
+
336
# Returns the history's max_size.
def max_history()
  @history.max_size
end
339
+
340
# Sets the history's max_size to +length+.
def max_history= length
  @history.max_size = length
end
343
+
344
+ # Returns a visited page for the url passed in, otherwise nil
345
def visited_page(url)
  # The URL is resolved first so the history is queried with a resolved URI.
  resolved = resolve(url)
  @history.visited_page(resolved)
end
348
+
349
+ # :section: Hooks
350
+
351
# Calls every hook in content_encoding_hooks with this agent, the +uri+, the
# +response+ and the +response_body_io+, in registration order.
def hook_content_encoding(response, uri, response_body_io)
  @content_encoding_hooks.each { |hook|
    hook.call(self, uri, response, response_body_io)
  }
end
356
+
357
+ ##
358
+ # Invokes hooks added to post_connect_hooks after a +response+ is returned
359
+ # and the response +body+ is handled.
360
+ #
361
+ # Yields the +context+, the +uri+ for the request, the +response+ and the
362
+ # response +body+.
363
+
364
def post_connect(uri, response, body_io) # :yields: agent, uri, response, body
  @post_connect_hooks.each { |hook|
    begin
      # Each hook receives the complete body as a String.
      body = body_io.read
      hook.call(self, uri, response, body)
    ensure
      # Rewind even when a hook raises, so the next reader starts at the
      # beginning.
      body_io.rewind
    end
  }
end
373
+
374
+ ##
375
+ # Invokes hooks added to pre_connect_hooks before a +request+ is made.
376
+ # Yields the +agent+ and the +request+ that will be performed to each hook.
377
+
378
def pre_connect(request) # :yields: agent, request
  @pre_connect_hooks.each { |hook| hook.call(self, request) }
end
383
+
384
+ # :section: Request
385
+
386
# Returns the connection object used to retrieve +uri+: the shared @http
# connection for http and https, a new Mechanize::FileConnection for file,
# and nil for any other scheme.
def connection_for(uri)
  scheme = uri.scheme.downcase

  if scheme == 'http' || scheme == 'https'
    @http
  elsif scheme == 'file'
    Mechanize::FileConnection.new
  end
end
394
+
395
# Adds "connection: close" to +request+ when keep-alive is turned off.
def disable_keep_alive(request)
  return if @keep_alive

  request['connection'] = 'close'
end
398
+
399
# Sets the accept-encoding header on +request+ according to gzip_enabled.
def enable_gzip(request)
  encodings = @gzip_enabled ? 'gzip,deflate,identity' : 'identity'
  request['accept-encoding'] = encodings
end
406
+
407
# Builds the request object for +uri+.
#
# For http and https URIs the Net::HTTP request class named after +method+
# (:get => Net::HTTP::Get, ...) is instantiated with the URI's request_uri,
# and the first element of +params+ becomes the request body when +params+
# is given. For file URIs a Mechanize::FileRequest is returned. Any other
# scheme yields nil.
def http_request uri, method, params = nil
  case uri.scheme.downcase
  when 'http', 'https' then
    klass = Net::HTTP.const_get(method.to_s.capitalize)

    request = klass.new(uri.request_uri)
    request.body = params.first if params

    request
  when 'file' then
    Mechanize::FileRequest.new uri
  end
end
420
+
421
# Copies the agent-wide request_headers onto +request+, then applies the
# per-request +headers+. The symbols :etag and :if_modified_since map to
# their header names; any other Symbol raises ArgumentError.
def request_add_headers(request, headers = {})
  @request_headers.each { |name, value| request[name] = value }

  headers.each do |field, value|
    name = case field
           when :etag              then "ETag"
           when :if_modified_since then "If-Modified-Since"
           when Symbol             then
             raise ArgumentError, "unknown header symbol #{field}"
           else
             field
           end

    request[name] = value
  end
end
437
+
438
# Adds credentials to +request+ when a realm has already been recorded for
# the root of +uri+. Digest realms are tried first, then IIS digest realms,
# then basic realms.
def request_auth(request, uri)
  base_uri = uri + '/'
  schemes  = @authenticate_methods[base_uri]
  at_base  = lambda { |r| r.uri == base_uri }

  if realm = schemes[:digest].find(&at_base) then
    request_auth_digest request, uri, realm, base_uri, false
  elsif realm = schemes[:iis_digest].find(&at_base) then
    request_auth_digest request, uri, realm, base_uri, true
  elsif schemes[:basic].find(&at_base) then
    request.basic_auth @user, @password
  end
end
450
+
451
# Sets the Authorization header of +request+ for HTTP digest authentication.
#
# The header is computed by @digest_auth from the challenge stored for
# +realm+, the request method and the agent's user and password. +iis+ is
# passed through to the header generator. +base_uri+ is accepted but not
# used here.
#
# The credentials are attached to a copy of +uri+ so the caller's URI object
# does not end up carrying the user name and password.
def request_auth_digest request, uri, realm, base_uri, iis
  challenge = @digest_challenges[realm]

  auth_uri = uri.dup
  auth_uri.user = @user
  auth_uri.password = @password

  auth = @digest_auth.auth_header auth_uri, challenge.to_s, request.method, iis
  request['Authorization'] = auth
end
460
+
461
# Adds a Cookie header to +request+ listing the cookie jar's cookies for
# +uri+, joined with "; ". Nothing is added when the jar has none.
def request_cookies(request, uri)
  return if @cookie_jar.empty?(uri)

  cookies = @cookie_jar.cookies(uri)
  return if cookies.empty?

  header_value = cookies.join('; ')
  request.add_field('Cookie', header_value)
end
470
+
471
# Sets the Host header of +request+ from +uri+.
#
# The port is left out only when it is the default port of the URI's own
# scheme, so "http://host:443/" yields "host:443" while "https://host/"
# yields just "host". URIs without a port produce a bare host.
def request_host request, uri
  port = uri.port == uri.default_port ? nil : uri.port
  host = uri.host

  request['Host'] = [host, port].compact.join ':'
end
477
+
478
# Sets fixed accept-charset and accept-language headers on +request+.
def request_language_charset(request)
  {
    'accept-charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'accept-language' => 'en-us,en;q=0.5',
  }.each { |name, value| request[name] = value }

  request['accept-language']
end
482
+
483
+ # Log specified headers for the request
484
def request_log(request)
  logger = log
  return unless logger

  # One info line naming the request class and path, then one debug line
  # per header.
  logger.info("#{request.class}: #{request.path}")

  request.each_header { |k, v| logger.debug("request-header: #{k} => #{v}") }
end
493
+
494
# Sets the Referer header of +request+ to the +referer+ URI.
#
# Nothing is sent when there is no referer, or when the referer is an https
# URI and the request +uri+ is not. The fragment, user and password are
# removed from a copy of the referer before it is sent, so credentials and
# fragments are not disclosed to the server. The caller's URI object is left
# untouched.
def request_referer request, uri, referer
  return unless referer
  return if 'https' == referer.scheme.downcase and
            'https' != uri.scheme.downcase

  if referer.fragment || referer.user || referer.password
    referer = referer.dup
    referer.fragment = nil
    # The password is cleared before the user.
    referer.password = nil
    referer.user = nil
  end

  request['Referer'] = referer
end
501
+
502
# Sets the User-Agent header on +request+ when a user agent is configured.
def request_user_agent(request)
  return unless @user_agent

  request['User-Agent'] = @user_agent
end
505
+
506
# Turns +uri+ (a URI object or anything responding to to_s) into an absolute
# http, https or file URI, using +referer+ to resolve relative references.
#
# Raises ArgumentError when a relative URI is given without a referer that
# has a URI, or when the resulting scheme is not http, https or file. The
# scheme handler registered for the URI's scheme is called with the URI and
# the referer and may raise or substitute its own URI.
def resolve(uri, referer = current_page)
  # Work on a copy so the caller's URI object is never modified.
  uri = uri.dup if uri.is_a?(URI)

  unless uri.is_a?(URI)
    # Percent-escape every character outside the 0..126 range.
    uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
      if RUBY_VERSION >= "1.9.0"
        Mechanize::Util.uri_escape(match)
      else
        sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
      end
    }

    # Split the string into runs that are already percent-escaped (or '#')
    # and runs that are not, so only the latter are escaped.
    unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
    escaped   = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)

    escaped_uri = Mechanize::Util.html_unescape(
      unescaped.zip(escaped).map { |x,y|
        "#{WEBrick::HTTPUtils.escape(x)}#{y}"
      }.join('')
    )

    begin
      uri = URI.parse(escaped_uri)
    rescue
      uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
    end
  end

  scheme = uri.relative? ? 'relative' : uri.scheme.downcase
  uri = @scheme_handlers[scheme].call(uri, referer)

  # A relative URI with an empty path (for example "?a=1") refers to the
  # referer's own path.
  if referer && referer.uri
    if uri.path.length == 0 && uri.relative?
      uri.path = referer.uri.path
    end
  end

  uri.path = '/' if uri.path.length == 0

  if uri.relative?
    raise ArgumentError, "absolute URL needed (not #{uri})" unless
      referer && referer.uri

    base = nil
    if referer.respond_to?(:bases) && referer.parser
      base = referer.bases.last
    end

    # Prefer the referer's last base when it has an absolute URI, otherwise
    # fall back to the referer's own URI. The merge result is absolute, so no
    # further merge against the referer is needed.
    uri = ((base && base.uri && base.uri.absolute?) ?
           base.uri :
           referer.uri) + uri
    # Strip initial "/.." bits from the path
    uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
    raise ArgumentError, "unsupported scheme: #{uri.scheme}"
  end

  uri
end
568
+
569
# For :head, :get, :delete and :trace the +parameters+ are appended to the
# query string of +uri+ and [uri, nil] is returned. For every other method
# the pair [uri, parameters] is returned unchanged.
def resolve_parameters(uri, method, parameters)
  body_less = [:head, :get, :delete, :trace].include?(method)
  return uri, parameters unless body_less

  if parameters and parameters.length > 0
    uri.query ||= ''
    # The query string is extended in place.
    uri.query << '&' if uri.query.length > 0
    uri.query << Mechanize::Util.build_query_string(parameters)
  end

  return uri, nil
end
583
+
584
+ # :section: Response
585
+
586
# Returns [delay, href] for a refresh requested by +page+ or by the
# "refresh" header of +response+, or nil when there is none to follow.
#
# A refresh that links back to the same page is only returned when
# follow_meta_refresh_self is set. A page-level meta refresh takes
# precedence; the header is only consulted when the page has none.
def get_meta_refresh(response, uri, page)
  return nil unless @follow_meta_refresh

  redirect = page.meta_refresh.first if page.respond_to?(:meta_refresh)

  if redirect then
    if @follow_meta_refresh_self or not redirect.link_self
      [redirect.delay, redirect.href]
    end
  elsif refresh = response['refresh']
    delay, href, link_self = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay

    if @follow_meta_refresh_self or not link_self
      [delay.to_f, href]
    end
  end
end
600
+
601
# Handles a response that demands authentication: records the challenged
# realm for the scheme offered (Digest, then NTLM, then Basic) and fetches
# +uri+ again with the same method, +headers+, +params+ and +referer+.
#
# Raises Mechanize::UnauthorizedError when neither a user nor a password is
# set, when the same realm has already been recorded (taken to mean a
# previous attempt was rejected), or when no supported scheme is offered.
def response_authenticate(response, page, uri, request, headers, params,
                          referer)
  raise Mechanize::UnauthorizedError, page unless @user || @password

  challenges = @authenticate_parser.parse response['www-authenticate']

  if challenge = challenges.find { |c| c.scheme =~ /^Digest$/i } then
    realm = challenge.realm uri

    auth_scheme = :digest
    auth_scheme = :iis_digest if response['server'] =~ /Microsoft-IIS/

    known_realms = @authenticate_methods[realm.uri][auth_scheme]

    if known_realms.include? realm then
      raise Mechanize::UnauthorizedError, page
    end

    known_realms << realm
    @digest_challenges[realm] = challenge
  elsif challenge = challenges.find { |c| c.scheme == 'NTLM' } then
    # No realm is computed for NTLM; nil is recorded as the marker that a
    # handshake has started for this site root.
    realm = nil
    known_realms = @authenticate_methods[uri + '/'][:ntlm]

    if known_realms.include?(realm) and not challenge.params then
      raise Mechanize::UnauthorizedError, page
    end

    known_realms << realm

    # The headers hash passed in by the caller is updated in place.
    if challenge.params then
      # The server sent a message: decode it and answer with the credentials.
      type_2 = Net::NTLM::Message.decode64 challenge.params

      type_3 = type_2.response({ :user => @user, :password => @password, },
                               { :ntlmv2 => true }).encode64

      headers['Authorization'] = "NTLM #{type_3}"
    else
      # No message yet: open the handshake.
      type_1 = Net::NTLM::Message::Type1.new.encode64
      headers['Authorization'] = "NTLM #{type_1}"
    end
  elsif challenge = challenges.find { |c| c.scheme == 'Basic' } then
    realm = challenge.realm uri

    known_realms = @authenticate_methods[realm.uri][:basic]

    if known_realms.include? realm then
      raise Mechanize::UnauthorizedError, page
    end

    known_realms << realm
  else
    raise Mechanize::UnauthorizedError, page
  end

  fetch uri, request.method.downcase.to_sym, headers, params, referer
end
657
+
658
# Decodes +body_io+ according to the Content-Encoding header of +response+
# and returns a rewound IO holding the decoded body.
#
# * No encoding, 'none', '7bit' and 'identity' return +body_io+ itself.
# * 'deflate' is inflated, retrying as a raw deflate stream on failure.
# * 'gzip' and 'x-gzip' are gunzipped into a Tempfile, falling back to a raw
#   inflate of the data after the first 10 bytes when the gzip reader
#   rejects the stream.
#
# An empty compressed body is returned as-is instead of being decoded.
#
# Raises Mechanize::Error for an unsupported Content-Encoding, or when the
# body could not be decoded at all.
def response_content_encoding response, body_io
  length = response.content_length

  # Without a Content-Length header, fall back to the size of what was read.
  length = case body_io
           when IO, Tempfile then
             body_io.stat.size
           else
             body_io.length
           end unless length

  out_io = nil

  case response['Content-Encoding']
  when nil, 'none', '7bit', 'identity' then
    out_io = body_io
  when 'deflate' then
    log.debug('deflate body') if log

    return body_io if length.zero?

    begin
      out_io = inflate body_io
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      body_io.rewind
      begin
        out_io = inflate body_io, -Zlib::MAX_WBITS
      rescue Zlib::BufError, Zlib::DataError
        # out_io stays nil; the check after the case statement reports it.
        log.error("unable to inflate page: #{$!}") if log
      end
    end
  when 'gzip', 'x-gzip' then
    log.debug('gzip body') if log

    return body_io if length.zero?

    begin
      zio = Zlib::GzipReader.new body_io
      out_io = Tempfile.new 'mechanize-decode', :encoding => 'ascii-8bit'
      out_io.binmode

      until zio.eof? do
        out_io.write zio.read 16384
      end
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log
      body_io.rewind
      body_io.read 10 # skip the first 10 bytes before the raw inflate

      out_io = inflate body_io, -Zlib::MAX_WBITS
    rescue Zlib::DataError
      # Whatever was decoded before the error is still returned.
      log.error("unable to gunzip page: #{$!}") if log
    ensure
      zio.close if zio and not zio.closed?
    end
  else
    raise Mechanize::Error,
          "Unsupported Content-Encoding: #{response['Content-Encoding']}"
  end

  unless out_io
    raise Mechanize::Error,
          "Unable to decode #{response['Content-Encoding']} body"
  end

  out_io.flush
  out_io.rewind

  out_io
end
725
+
726
# Saves cookies delivered with a response: first those in
# <meta http-equiv="Set-Cookie"> elements in the head of an HTML +page+,
# then those in the Set-Cookie headers of +response+.
def response_cookies(response, uri, page)
  # The regexp test skips the XPath search for pages that cannot contain a
  # cookie meta element.
  if Mechanize::Page === page and page.body =~ /Set-Cookie/n
    metas = page.search('//head/meta[@http-equiv="Set-Cookie"]')
    metas.each { |meta| save_cookies(uri, meta['content']) }
  end

  header_cookies = response.get_fields 'Set-Cookie'

  if header_cookies
    header_cookies.each { |set_cookie| save_cookies(uri, set_cookie) }
  end
end
741
+
742
# Parses the +set_cookie+ string for +uri+ and offers each resulting cookie
# to the cookie jar, logging whether the jar accepted it.
def save_cookies(uri, set_cookie)
  logger = log() # fetched once and reused inside the block

  Mechanize::Cookie.parse(uri, set_cookie, logger) do |cookie|
    accepted = @cookie_jar.add(uri, cookie)

    if accepted
      logger.debug("saved cookie: #{cookie}") if logger
    else
      logger.debug("rejected cookie: #{cookie}") if logger
    end
  end
end
752
+
753
# Follows a meta refresh (or Refresh header) found for +page+.
#
# Returns nil when there is nothing to follow. Otherwise sleeps for the
# requested delay, records +page+ in the history and returns the result of
# fetching the refresh target with a GET request.
#
# Each followed refresh counts as one redirect: the incremented count is
# handed to the next fetch so that a refresh loop eventually raises
# Mechanize::RedirectLimitReachedError instead of running forever.
def response_follow_meta_refresh response, uri, page, redirects
  delay, new_url = get_meta_refresh(response, uri, page)
  return nil unless new_url

  followed = redirects + 1

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    followed > @redirection_limit

  sleep delay
  @history.push(page, page.uri)
  fetch new_url, :get, {}, [],
        Mechanize::Page.new(nil, {'content-type'=>'text/html'}), followed
end
765
+
766
##
# Logs the status line of +response+ at info level and each of its header
# fields at debug level.  Does nothing when no logger is configured.

def response_log response
  logger = log
  return unless logger

  status_line = "status: #{response.class} #{response.http_version} " \
                "#{response.code} #{response.message}"
  logger.info(status_line)

  response.each_header { |name, value|
    logger.debug("response-header: #{name} => #{value}")
  }
end
776
+
777
##
# Hands the response over to @context for parsing and returns whatever the
# context's parse method returns.  Note that the context receives +uri+
# first, unlike the parameter order of this method.

def response_parse response, body_io, uri
  @context.parse(uri, response, body_io)
end
780
+
781
##
# Reads the body of +response+ into an IO and returns it rewound to the
# start.  A body whose size exceeds @max_file_buffer is stored in a
# binary-mode Tempfile; anything smaller stays in a StringIO.
#
# Raises Mechanize::ResponseReadError when the connection fails while
# reading (the partially read body is attached to the error),
# Mechanize::ResponseCodeError for a Net::HTTPUnknownResponse, and EOFError
# when the body length differs from Content-Length.  The length check is
# skipped for HEAD requests and redirect responses.

def response_read response, request
  content_length = response.content_length

  if content_length and content_length > @max_file_buffer then
    body_io = Tempfile.new 'mechanize-raw'
    body_io.binmode
  else
    body_io = StringIO.new
  end

  body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
  total = 0

  begin
    response.read_body { |part|
      total += part.length

      # Content-Length was missing or understated: move the in-memory buffer
      # to disk as soon as it outgrows the limit.
      if StringIO === body_io and total > @max_file_buffer then
        new_io = Tempfile.new 'mechanize-raw'
        # The body is raw bytes, so the replacement file must be in binary
        # mode just like the Tempfile created up front.
        new_io.binmode
        if new_io.respond_to? :set_encoding then
          new_io.set_encoding(body_io.external_encoding)
        end
        new_io.write body_io.string

        body_io = new_io
      end

      body_io.write(part)
      log.debug("Read #{part.length} bytes (#{total} total)") if log
    }
  rescue Net::HTTP::Persistent::Error => e
    body_io.rewind
    raise Mechanize::ResponseReadError.new(e, response, body_io)
  end

  body_io.flush
  body_io.rewind

  raise Mechanize::ResponseCodeError, response if
    Net::HTTPUnknownResponse === response

  unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
    raise EOFError, "Content-Length (#{content_length}) does not match " \
                    "response body length (#{body_io.length})" if
      content_length and content_length != body_io.length
  end

  body_io
end
831
+
832
##
# Follows the redirect announced by the Location header of +response+.
#
# @redirect_ok decides whether to follow: false or nil never follows,
# :permanent follows only a Net::HTTPMovedPermanently response, and any other
# value (true and :all included) always follows.  When the redirect is not
# followed, +page+ is returned unchanged.
#
# HEAD requests are redirected as HEAD; every other method becomes GET.
# Raises Mechanize::RedirectLimitReachedError when following would exceed
# @redirection_limit.

def response_redirect response, method, page, redirects, referer = current_page
  follow = case @redirect_ok
           when false, nil then false
           when :permanent then Net::HTTPMovedPermanently === response
           else                 true
           end

  return page unless follow

  logger = log
  logger.info("follow redirect to: #{response['Location']}") if logger

  if redirects + 1 > @redirection_limit then
    raise Mechanize::RedirectLimitReachedError.new(page, redirects)
  end

  verb = method == :head ? :head : :get

  origin = page.uri
  @history.push(page, origin)

  # Location may be relative, so it is resolved against the current page
  target = origin + response['Location'].to_s

  fetch(target, verb, {}, [], referer, redirects + 1)
end
855
+
856
+ # :section: Robots
857
+
858
# Fetches +uri+ and returns the body of the result.  A 404 response is
# reported as an empty string; any other Mechanize::ResponseCodeError is
# re-raised.  Handed to WebRobots as its :http_get callback.
def get_robots(uri) # :nodoc:
  begin
    fetch(uri).body
  rescue Mechanize::ResponseCodeError => error
    raise error unless error.response_code == '404'

    ''
  end
end
864
+
865
##
# Stores +value+ in @robots.  The webrobots library is loaded the first time
# a true value is given.

def robots=(value)
  require 'webrobots' if value

  # a different setting invalidates the memoized WebRobots instance
  @webrobots = nil unless value == @robots

  @robots = value
end
870
+
871
+ ##
872
+ # Tests if this agent is allowed to access +uri+, consulting the site's
873
+ # robots.txt.
874
+
875
def robots_allowed? uri
  # robots.txt itself is always allowed, without consulting webrobots
  if uri.request_uri == '/robots.txt' then
    true
  else
    webrobots.allowed?(uri)
  end
end
880
+
881
+ # Opposite of robots_allowed?
882
+
883
def robots_disallowed? url
  # plain negation of robots_allowed?
  not robots_allowed?(url)
end
886
+
887
+ # Returns an error object if there is an error in fetching or parsing
888
+ # robots.txt of the site +url+.
889
def robots_error(url)
  # delegates to the memoized WebRobots instance
  robots = webrobots
  robots.error url
end
892
+
893
+ # Raises the error if there is an error in fetching or parsing robots.txt of
894
+ # the site +url+.
895
def robots_error!(url)
  # delegates to the memoized WebRobots instance
  robots = webrobots
  robots.error! url
end
898
+
899
+ # Removes robots.txt cache for the site +url+.
900
def robots_reset(url)
  # delegates to the memoized WebRobots instance
  robots = webrobots
  robots.reset url
end
903
+
904
##
# The WebRobots instance used for robots.txt checks.  It is created on first
# use with @user_agent and with get_robots as its :http_get callback, then
# memoized in @webrobots.

def webrobots
  return @webrobots if @webrobots

  @webrobots = WebRobots.new(@user_agent, :http_get => method(:get_robots))
end
907
+
908
+ # :section: SSL
909
+
910
##
# The certificate currently held by the underlying @http connection.

def certificate
  @http.certificate()
end
913
+
914
+ # :section: Timeouts
915
+
916
+ # Sets the connection idle timeout for persistent connections
917
def idle_timeout=(timeout)
  @idle_timeout = timeout

  # the connection may not exist yet; set_http applies @idle_timeout later
  return unless @http

  @http.idle_timeout = timeout
end
921
+
922
+ # :section: Utility
923
+
924
##
# Inflates the deflate stream read from the IO +compressed+ into a
# binary-mode Tempfile and returns that Tempfile.  +window_bits+ is passed
# to Zlib::Inflate.new; callers in this file pass -Zlib::MAX_WBITS to
# attempt a raw inflate.  The returned IO is neither flushed nor rewound.
#
# Zlib errors propagate to the caller.  The zlib stream is always closed,
# and the Tempfile is closed and unlinked when inflating fails.

def inflate compressed, window_bits = nil
  zstream = Zlib::Inflate.new window_bits
  out_io = Tempfile.new 'mechanize-decode'
  out_io.binmode # inflated data is arbitrary bytes
  completed = false

  until compressed.eof? do
    out_io.write zstream.inflate compressed.read 1024
  end

  out_io.write zstream.finish
  completed = true

  out_io
ensure
  zstream.close if zstream
  out_io.close! if out_io and not completed
end
936
+
937
##
# The logger reported by @context.  Callers in this file test the result
# before using it, so it may be nil.

def log
  @context.log()
end
940
+
941
##
# Builds the Net::HTTP::Persistent connection stored in @http and applies the
# agent's keep-alive, idle timeout, retry and SSL verification settings to
# it.  When both @cert and @key are set, a client certificate and private key
# are installed as well; each may be an OpenSSL object or something
# ::File.read can load, and a key read from a file is loaded as RSA using
# @pass.

def set_http
  @http = Net::HTTP::Persistent.new('mechanize', @proxy_uri)

  @http.keep_alive            = @keep_alive_time
  @http.idle_timeout          = @idle_timeout if @idle_timeout
  @http.retry_change_requests = @retry_change_requests

  @http.ca_file         = @ca_file
  @http.cert_store      = @cert_store if @cert_store
  @http.verify_callback = @verify_callback
  @http.verify_mode     = @verify_mode if @verify_mode

  # remember the values the connection actually ended up with
  @verify_mode = @http.verify_mode
  @cert_store  = @http.cert_store

  return unless @cert && @key

  client_cert = @cert
  unless OpenSSL::X509::Certificate === client_cert then
    client_cert = OpenSSL::X509::Certificate.new(::File.read(client_cert))
  end

  client_key = @key
  unless OpenSSL::PKey::PKey === client_key then
    client_key = OpenSSL::PKey::RSA.new(::File.read(client_key), @pass)
  end

  @http.certificate = client_cert
  @http.private_key = client_key
end
974
+
975
+ ##
976
+ # Sets the proxy address, port, user, and password.  +addr+ should be a host,
977
+ # with no "http://", +port+ may be a port number, service name or port
978
+ # number string.
979
+
980
def set_proxy(addr, port, user = nil, pass = nil)
  return unless addr && port

  # Resolution order for a non-Integer port: service name first, then a
  # numeric string; anything else is rejected.
  port_number =
    if Integer === port then
      port
    else
      begin
        Socket.getservbyname port
      rescue SocketError
        begin
          Integer port
        rescue ArgumentError
          raise ArgumentError, "invalid value for port: #{port.inspect}"
        end
      end
    end

  @proxy_uri = URI "http://#{addr}"
  @proxy_uri.port = port_number
  @proxy_uri.user = user if user
  @proxy_uri.password = pass if pass

  @proxy_uri
end
1002
+
1003
+ end
1004
+