diamond-mechanize 2.1 → 2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. metadata +222 -167
  2. data/Rakefile +0 -49
  3. data/lib/mechanize/content_type_error.rb +0 -13
  4. data/lib/mechanize/cookie.rb +0 -232
  5. data/lib/mechanize/cookie_jar.rb +0 -194
  6. data/lib/mechanize/download.rb +0 -59
  7. data/lib/mechanize/element_matcher.rb +0 -36
  8. data/lib/mechanize/file.rb +0 -65
  9. data/lib/mechanize/file_connection.rb +0 -17
  10. data/lib/mechanize/file_request.rb +0 -26
  11. data/lib/mechanize/file_response.rb +0 -74
  12. data/lib/mechanize/file_saver.rb +0 -39
  13. data/lib/mechanize/form/button.rb +0 -6
  14. data/lib/mechanize/form/check_box.rb +0 -12
  15. data/lib/mechanize/form/field.rb +0 -54
  16. data/lib/mechanize/form/file_upload.rb +0 -21
  17. data/lib/mechanize/form/hidden.rb +0 -3
  18. data/lib/mechanize/form/image_button.rb +0 -19
  19. data/lib/mechanize/form/keygen.rb +0 -34
  20. data/lib/mechanize/form/multi_select_list.rb +0 -94
  21. data/lib/mechanize/form/option.rb +0 -50
  22. data/lib/mechanize/form/radio_button.rb +0 -55
  23. data/lib/mechanize/form/reset.rb +0 -3
  24. data/lib/mechanize/form/select_list.rb +0 -44
  25. data/lib/mechanize/form/submit.rb +0 -3
  26. data/lib/mechanize/form/text.rb +0 -3
  27. data/lib/mechanize/form/textarea.rb +0 -3
  28. data/lib/mechanize/form.rb +0 -543
  29. data/lib/mechanize/headers.rb +0 -23
  30. data/lib/mechanize/history.rb +0 -82
  31. data/lib/mechanize/http/agent.rb +0 -1004
  32. data/lib/mechanize/http/auth_challenge.rb +0 -59
  33. data/lib/mechanize/http/auth_realm.rb +0 -31
  34. data/lib/mechanize/http/content_disposition_parser.rb +0 -188
  35. data/lib/mechanize/http/www_authenticate_parser.rb +0 -155
  36. data/lib/mechanize/http.rb +0 -8
  37. data/lib/mechanize/monkey_patch.rb +0 -16
  38. data/lib/mechanize/page/base.rb +0 -7
  39. data/lib/mechanize/page/frame.rb +0 -27
  40. data/lib/mechanize/page/image.rb +0 -30
  41. data/lib/mechanize/page/label.rb +0 -20
  42. data/lib/mechanize/page/link.rb +0 -98
  43. data/lib/mechanize/page/meta_refresh.rb +0 -68
  44. data/lib/mechanize/page.rb +0 -440
  45. data/lib/mechanize/parser.rb +0 -173
  46. data/lib/mechanize/pluggable_parsers.rb +0 -144
  47. data/lib/mechanize/redirect_limit_reached_error.rb +0 -19
  48. data/lib/mechanize/redirect_not_get_or_head_error.rb +0 -21
  49. data/lib/mechanize/response_code_error.rb +0 -21
  50. data/lib/mechanize/response_read_error.rb +0 -27
  51. data/lib/mechanize/robots_disallowed_error.rb +0 -28
  52. data/lib/mechanize/test_case.rb +0 -663
  53. data/lib/mechanize/unauthorized_error.rb +0 -3
  54. data/lib/mechanize/unsupported_scheme_error.rb +0 -6
  55. data/lib/mechanize/util.rb +0 -101
  56. data/lib/mechanize.rb +0 -1079
  57. data/test/data/htpasswd +0 -1
  58. data/test/data/server.crt +0 -16
  59. data/test/data/server.csr +0 -12
  60. data/test/data/server.key +0 -15
  61. data/test/data/server.pem +0 -15
  62. data/test/htdocs/alt_text.html +0 -10
  63. data/test/htdocs/bad_form_test.html +0 -9
  64. data/test/htdocs/button.jpg +0 -0
  65. data/test/htdocs/canonical_uri.html +0 -9
  66. data/test/htdocs/dir with spaces/foo.html +0 -1
  67. data/test/htdocs/empty_form.html +0 -6
  68. data/test/htdocs/file_upload.html +0 -26
  69. data/test/htdocs/find_link.html +0 -41
  70. data/test/htdocs/form_multi_select.html +0 -16
  71. data/test/htdocs/form_multival.html +0 -37
  72. data/test/htdocs/form_no_action.html +0 -18
  73. data/test/htdocs/form_no_input_name.html +0 -16
  74. data/test/htdocs/form_order_test.html +0 -11
  75. data/test/htdocs/form_select.html +0 -16
  76. data/test/htdocs/form_set_fields.html +0 -14
  77. data/test/htdocs/form_test.html +0 -188
  78. data/test/htdocs/frame_referer_test.html +0 -10
  79. data/test/htdocs/frame_test.html +0 -30
  80. data/test/htdocs/google.html +0 -13
  81. data/test/htdocs/index.html +0 -6
  82. data/test/htdocs/link with space.html +0 -5
  83. data/test/htdocs/meta_cookie.html +0 -11
  84. data/test/htdocs/no_title_test.html +0 -6
  85. data/test/htdocs/noindex.html +0 -9
  86. data/test/htdocs/rails_3_encoding_hack_form_test.html +0 -27
  87. data/test/htdocs/relative/tc_relative_links.html +0 -21
  88. data/test/htdocs/robots.html +0 -8
  89. data/test/htdocs/robots.txt +0 -2
  90. data/test/htdocs/tc_bad_charset.html +0 -9
  91. data/test/htdocs/tc_bad_links.html +0 -5
  92. data/test/htdocs/tc_base_link.html +0 -8
  93. data/test/htdocs/tc_blank_form.html +0 -11
  94. data/test/htdocs/tc_charset.html +0 -6
  95. data/test/htdocs/tc_checkboxes.html +0 -19
  96. data/test/htdocs/tc_encoded_links.html +0 -5
  97. data/test/htdocs/tc_field_precedence.html +0 -11
  98. data/test/htdocs/tc_follow_meta.html +0 -8
  99. data/test/htdocs/tc_form_action.html +0 -48
  100. data/test/htdocs/tc_links.html +0 -19
  101. data/test/htdocs/tc_meta_in_body.html +0 -9
  102. data/test/htdocs/tc_pretty_print.html +0 -17
  103. data/test/htdocs/tc_referer.html +0 -16
  104. data/test/htdocs/tc_relative_links.html +0 -19
  105. data/test/htdocs/tc_textarea.html +0 -23
  106. data/test/htdocs/test_click.html +0 -11
  107. data/test/htdocs/unusual______.html +0 -5
  108. data/test/test_mechanize.rb +0 -1164
  109. data/test/test_mechanize_cookie.rb +0 -451
  110. data/test/test_mechanize_cookie_jar.rb +0 -483
  111. data/test/test_mechanize_download.rb +0 -43
  112. data/test/test_mechanize_file.rb +0 -61
  113. data/test/test_mechanize_file_connection.rb +0 -21
  114. data/test/test_mechanize_file_request.rb +0 -19
  115. data/test/test_mechanize_file_saver.rb +0 -21
  116. data/test/test_mechanize_form.rb +0 -875
  117. data/test/test_mechanize_form_check_box.rb +0 -38
  118. data/test/test_mechanize_form_encoding.rb +0 -114
  119. data/test/test_mechanize_form_field.rb +0 -63
  120. data/test/test_mechanize_form_file_upload.rb +0 -20
  121. data/test/test_mechanize_form_image_button.rb +0 -12
  122. data/test/test_mechanize_form_keygen.rb +0 -32
  123. data/test/test_mechanize_form_multi_select_list.rb +0 -84
  124. data/test/test_mechanize_form_option.rb +0 -55
  125. data/test/test_mechanize_form_radio_button.rb +0 -78
  126. data/test/test_mechanize_form_select_list.rb +0 -76
  127. data/test/test_mechanize_form_textarea.rb +0 -52
  128. data/test/test_mechanize_headers.rb +0 -35
  129. data/test/test_mechanize_history.rb +0 -103
  130. data/test/test_mechanize_http_agent.rb +0 -1225
  131. data/test/test_mechanize_http_auth_challenge.rb +0 -39
  132. data/test/test_mechanize_http_auth_realm.rb +0 -49
  133. data/test/test_mechanize_http_content_disposition_parser.rb +0 -118
  134. data/test/test_mechanize_http_www_authenticate_parser.rb +0 -146
  135. data/test/test_mechanize_link.rb +0 -80
  136. data/test/test_mechanize_page.rb +0 -118
  137. data/test/test_mechanize_page_encoding.rb +0 -182
  138. data/test/test_mechanize_page_frame.rb +0 -16
  139. data/test/test_mechanize_page_link.rb +0 -390
  140. data/test/test_mechanize_page_meta_refresh.rb +0 -127
  141. data/test/test_mechanize_parser.rb +0 -289
  142. data/test/test_mechanize_pluggable_parser.rb +0 -52
  143. data/test/test_mechanize_redirect_limit_reached_error.rb +0 -24
  144. data/test/test_mechanize_redirect_not_get_or_head_error.rb +0 -14
  145. data/test/test_mechanize_subclass.rb +0 -22
  146. data/test/test_mechanize_util.rb +0 -103
  147. data/test/test_multi_select.rb +0 -119
@@ -1,23 +0,0 @@
1
- class Mechanize::Headers < Hash
2
- def [](key)
3
- super(key.downcase)
4
- end
5
-
6
- def []=(key, value)
7
- super(key.downcase, value)
8
- end
9
-
10
- def key?(key)
11
- super(key.downcase)
12
- end
13
-
14
- def canonical_each
15
- block_given? or return enum_for(__method__)
16
- each { |key, value|
17
- key = key.capitalize
18
- key.gsub!(/-([a-z])/) { "-#{$1.upcase}" }
19
- yield [key, value]
20
- }
21
- end
22
- end
23
-
@@ -1,82 +0,0 @@
1
- ##
2
- # This class manages history for your mechanize object.
3
-
4
- class Mechanize::History < Array
5
-
6
- attr_accessor :max_size
7
-
8
- def initialize(max_size = nil)
9
- @max_size = max_size
10
- @history_index = {}
11
- end
12
-
13
- def initialize_copy(orig)
14
- super
15
- @history_index = orig.instance_variable_get(:@history_index).dup
16
- end
17
-
18
- def inspect # :nodoc:
19
- uris = map { |page| page.uri }.join ', '
20
-
21
- "[#{uris}]"
22
- end
23
-
24
- def push(page, uri = nil)
25
- super page
26
-
27
- index = uri ? uri : page.uri
28
- @history_index[index.to_s] = page
29
-
30
- shift while length > @max_size if @max_size
31
-
32
- self
33
- end
34
-
35
- alias :<< :push
36
-
37
- def visited? uri
38
- page = @history_index[uri.to_s]
39
-
40
- return page if page # HACK
41
-
42
- uri = uri.dup
43
- uri.path = '/' if uri.path.empty?
44
-
45
- @history_index[uri.to_s]
46
- end
47
-
48
- alias visited_page visited?
49
-
50
- def clear
51
- @history_index.clear
52
- super
53
- end
54
-
55
- def shift
56
- return nil if length == 0
57
- page = self[0]
58
- self[0] = nil
59
-
60
- super
61
-
62
- remove_from_index(page)
63
- page
64
- end
65
-
66
- def pop
67
- return nil if length == 0
68
- page = super
69
- remove_from_index(page)
70
- page
71
- end
72
-
73
- private
74
-
75
- def remove_from_index(page)
76
- @history_index.each do |k,v|
77
- @history_index.delete(k) if v == page
78
- end
79
- end
80
-
81
- end
82
-
@@ -1,1004 +0,0 @@
1
- require 'tempfile'
2
- require 'net/ntlm'
3
- require 'kconv'
4
- require 'webrobots'
5
-
6
- ##
7
- # An HTTP (and local disk access) user agent. This class is an implementation
8
- # detail and is subject to change at any time.
9
-
10
- class Mechanize::HTTP::Agent
11
-
12
- # :section: Headers
13
-
14
- # Enables If-Modified-Since conditional requests (enabled by default); set to false to disable them
15
- attr_accessor :conditional_requests
16
-
17
- # Is gzip compression of responses requested (via the Accept-Encoding header)?
18
- attr_accessor :gzip_enabled
19
-
20
- # A hash of request headers to be used for every request
21
- attr_accessor :request_headers
22
-
23
- # The User-Agent header to send
24
- attr_reader :user_agent
25
-
26
- # :section: History
27
-
28
- # history of requests made
29
- attr_accessor :history
30
-
31
- # :section: Hooks
32
-
33
- # A list of hooks to call after retrieving a response. Hooks are called with
34
- # the agent and the response returned.
35
- attr_reader :post_connect_hooks
36
-
37
- # A list of hooks to call before making a request. Hooks are called with
38
- # the agent and the request to be performed.
39
- attr_reader :pre_connect_hooks
40
-
41
- # A list of hooks to call to handle the content-encoding of a response.
42
- attr_reader :content_encoding_hooks
43
-
44
- # :section: HTTP Authentication
45
-
46
- attr_reader :authenticate_methods # :nodoc:
47
- attr_reader :digest_challenges # :nodoc:
48
- attr_accessor :user
49
- attr_accessor :password
50
-
51
- # :section: Redirection
52
-
53
- # Follow HTML meta refresh and HTTP Refresh. If set to +:anywhere+ meta
54
- # refresh tags outside of the head element will be followed.
55
- attr_accessor :follow_meta_refresh
56
-
57
- # Follow an HTML meta refresh that has no "url=" in the content attribute.
58
- #
59
- # Defaults to false to prevent infinite refresh loops.
60
- attr_accessor :follow_meta_refresh_self
61
-
62
- # Controls how this agent deals with redirects. The following values are
63
- # allowed:
64
- #
65
- # :all, true:: All 3xx redirects are followed (default)
66
- # :permanent:: Only 301 Moved Permanently redirects are followed
67
- # false:: No redirects are followed
68
- attr_accessor :redirect_ok
69
-
70
- # Maximum number of redirects to follow
71
- attr_accessor :redirection_limit
72
-
73
- # :section: Robots
74
-
75
- # When true, this agent will consult the site's robots.txt for each access.
76
- attr_reader :robots
77
-
78
- # :section: SSL
79
-
80
- # Path to an OpenSSL server certificate file
81
- attr_accessor :ca_file
82
-
83
- # An OpenSSL private key or the path to a private key
84
- attr_accessor :key
85
-
86
- # An OpenSSL client certificate or the path to a certificate file.
87
- attr_accessor :cert
88
-
89
- # An SSL certificate store
90
- attr_accessor :cert_store
91
-
92
- # OpenSSL key password
93
- attr_accessor :pass
94
-
95
- # A callback for additional certificate verification. See
96
- # OpenSSL::SSL::SSLContext#verify_callback
97
- #
98
- # The callback can be used for debugging or to ignore errors by always
99
- # returning +true+. Specifying nil uses the default method that was valid
100
- # when the SSLContext was created
101
- attr_accessor :verify_callback
102
-
103
- # How to verify SSL connections. Defaults to VERIFY_PEER
104
- attr_accessor :verify_mode
105
-
106
- # :section: Timeouts
107
-
108
- # Reset connections that have not been used in this many seconds
109
- attr_reader :idle_timeout
110
-
111
- # Set to false to disable HTTP/1.1 keep-alive requests
112
- attr_accessor :keep_alive
113
-
114
- # Length of time to wait until a connection is opened in seconds
115
- attr_accessor :open_timeout
116
-
117
- # Length of time to attempt to read data from the server
118
- attr_accessor :read_timeout
119
-
120
- # :section:
121
-
122
- # The cookies for this agent
123
- attr_accessor :cookie_jar
124
-
125
- # URI for a proxy connection
126
- attr_reader :proxy_uri
127
-
128
- # Retry non-idempotent requests?
129
- attr_reader :retry_change_requests
130
-
131
- # Responses larger than this will be written to a Tempfile instead of stored
132
- # in memory.
133
- attr_accessor :max_file_buffer
134
-
135
- # :section: Utility
136
-
137
- # The context parses responses into pages
138
- attr_accessor :context
139
-
140
- attr_reader :http # :nodoc:
141
-
142
- # Handlers for various URI schemes
143
- attr_accessor :scheme_handlers
144
-
145
- # :section:
146
-
147
- # Creates a new Mechanize HTTP user agent. The user agent is an
148
- # implementation detail of mechanize and its API may change at any time.
149
-
150
- def initialize
151
- @conditional_requests = true
152
- @context = nil
153
- @content_encoding_hooks = []
154
- @cookie_jar = Mechanize::CookieJar.new
155
- @follow_meta_refresh = false
156
- @follow_meta_refresh_self = false
157
- @gzip_enabled = true
158
- @history = Mechanize::History.new
159
- @idle_timeout = nil
160
- @keep_alive = true
161
- @keep_alive_time = 300
162
- @max_file_buffer = 10240
163
- @open_timeout = nil
164
- @post_connect_hooks = []
165
- @pre_connect_hooks = []
166
- @proxy_uri = nil
167
- @read_timeout = nil
168
- @redirect_ok = true
169
- @redirection_limit = 20
170
- @request_headers = {}
171
- @retry_change_requests = false
172
- @robots = false
173
- @user_agent = nil
174
- @webrobots = nil
175
-
176
- # HTTP Authentication
177
- @authenticate_parser = Mechanize::HTTP::WWWAuthenticateParser.new
178
- @authenticate_methods = Hash.new do |methods, uri|
179
- methods[uri] = Hash.new do |realms, auth_scheme|
180
- realms[auth_scheme] = []
181
- end
182
- end
183
- @digest_auth = Net::HTTP::DigestAuth.new
184
- @digest_challenges = {}
185
- @password = nil # HTTP auth password
186
- @user = nil # HTTP auth user
187
-
188
- # SSL
189
- @ca_file = nil
190
- @cert = nil
191
- @cert_store = nil
192
- @key = nil
193
- @pass = nil
194
- @verify_callback = nil
195
- @verify_mode = nil
196
-
197
- @scheme_handlers = Hash.new { |h, scheme|
198
- h[scheme] = lambda { |link, page|
199
- raise Mechanize::UnsupportedSchemeError, scheme
200
- }
201
- }
202
-
203
- @scheme_handlers['http'] = lambda { |link, page| link }
204
- @scheme_handlers['https'] = @scheme_handlers['http']
205
- @scheme_handlers['relative'] = @scheme_handlers['http']
206
- @scheme_handlers['file'] = @scheme_handlers['http']
207
- end
208
-
209
- # Retrieves +uri+ and parses it into a page or other object according to
210
- # PluggableParser. If the URI is an HTTP or HTTPS scheme URI the given HTTP
211
- # +method+ is used to retrieve it, along with the HTTP +headers+, request
212
- # +params+ and HTTP +referer+.
213
- #
214
- # +redirects+ tracks the number of redirects experienced when retrieving the
215
- # page. If it is over the redirection_limit an error will be raised.
216
-
217
- def fetch uri, method = :get, headers = {}, params = [],
218
- referer = current_page, redirects = 0
219
- referer_uri = referer ? referer.uri : nil
220
-
221
- uri = resolve uri, referer
222
-
223
- uri, params = resolve_parameters uri, method, params
224
-
225
- request = http_request uri, method, params
226
-
227
- connection = connection_for uri
228
-
229
- request_auth request, uri
230
-
231
- disable_keep_alive request
232
- enable_gzip request
233
-
234
- request_language_charset request
235
- request_cookies request, uri
236
- request_host request, uri
237
- request_referer request, uri, referer_uri
238
- request_user_agent request
239
- request_add_headers request, headers
240
-
241
- pre_connect request
242
-
243
- # Consult robots.txt
244
- if robots && uri.is_a?(URI::HTTP)
245
- robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
246
- end
247
-
248
- # Add If-Modified-Since if page is in history
249
- page = visited_page(uri)
250
-
251
- if (page = visited_page(uri)) and page.response['Last-Modified']
252
- request['If-Modified-Since'] = page.response['Last-Modified']
253
- end if(@conditional_requests)
254
-
255
- # Specify timeouts if given
256
- connection.open_timeout = @open_timeout if @open_timeout
257
- connection.read_timeout = @read_timeout if @read_timeout
258
-
259
- request_log request
260
-
261
- response_body_io = nil
262
-
263
- # Send the request
264
- response = connection.request(uri, request) { |res|
265
- response_log res
266
-
267
- response_body_io = response_read res, request
268
-
269
- res
270
- }
271
-
272
- hook_content_encoding response, uri, response_body_io
273
-
274
- response_body_io = response_content_encoding response, response_body_io
275
-
276
- post_connect uri, response, response_body_io
277
-
278
- page = response_parse response, response_body_io, uri
279
-
280
- response_cookies response, uri, page
281
-
282
- meta = response_follow_meta_refresh response, uri, page, redirects
283
- return meta if meta
284
-
285
- case response
286
- when Net::HTTPSuccess
287
- if robots && page.is_a?(Mechanize::Page)
288
- page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
289
- end
290
-
291
- page
292
- when Mechanize::FileResponse
293
- page
294
- when Net::HTTPNotModified
295
- log.debug("Got cached page") if log
296
- visited_page(uri) || page
297
- when Net::HTTPRedirection
298
- response_redirect response, method, page, redirects, referer
299
- when Net::HTTPUnauthorized
300
- response_authenticate(response, page, uri, request, headers, params,
301
- referer)
302
- else
303
- raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
304
- end
305
- end
306
-
307
- # Retry non-idempotent requests
308
-
309
- def retry_change_requests= retri
310
- @retry_change_requests = retri
311
- @http.retry_change_requests = retri if @http
312
- end
313
-
314
- # :section: Headers
315
-
316
- def user_agent= user_agent
317
- @webrobots = nil if user_agent != @user_agent
318
- @user_agent = user_agent
319
- end
320
-
321
- # :section: History
322
-
323
- # Equivalent to the browser back button. Returns the most recent page
324
- # visited.
325
- def back
326
- @history.pop
327
- end
328
-
329
- ##
330
- # Returns the latest page loaded by the agent
331
-
332
- def current_page
333
- @history.last
334
- end
335
-
336
- def max_history
337
- @history.max_size
338
- end
339
-
340
- def max_history=(length)
341
- @history.max_size = length
342
- end
343
-
344
- # Returns a visited page for the url passed in, otherwise nil
345
- def visited_page url
346
- @history.visited_page resolve url
347
- end
348
-
349
- # :section: Hooks
350
-
351
- def hook_content_encoding response, uri, response_body_io
352
- @content_encoding_hooks.each do |hook|
353
- hook.call self, uri, response, response_body_io
354
- end
355
- end
356
-
357
- ##
358
- # Invokes hooks added to post_connect_hooks after a +response+ is returned
359
- # and the response +body+ is handled.
360
- #
361
- # Yields the +agent+, the +uri+ for the request, the +response+ and the
362
- # response +body+.
363
-
364
- def post_connect uri, response, body_io # :yields: agent, uri, response, body
365
- @post_connect_hooks.each do |hook|
366
- begin
367
- hook.call self, uri, response, body_io.read
368
- ensure
369
- body_io.rewind
370
- end
371
- end
372
- end
373
-
374
- ##
375
- # Invokes hooks added to pre_connect_hooks before a +request+ is made.
376
- # Yields the +agent+ and the +request+ that will be performed to each hook.
377
-
378
- def pre_connect request # :yields: agent, request
379
- @pre_connect_hooks.each do |hook|
380
- hook.call self, request
381
- end
382
- end
383
-
384
- # :section: Request
385
-
386
- def connection_for uri
387
- case uri.scheme.downcase
388
- when 'http', 'https' then
389
- return @http
390
- when 'file' then
391
- return Mechanize::FileConnection.new
392
- end
393
- end
394
-
395
- def disable_keep_alive request
396
- request['connection'] = 'close' unless @keep_alive
397
- end
398
-
399
- def enable_gzip request
400
- request['accept-encoding'] = if @gzip_enabled
401
- 'gzip,deflate,identity'
402
- else
403
- 'identity'
404
- end
405
- end
406
-
407
- def http_request uri, method, params = nil
408
- case uri.scheme.downcase
409
- when 'http', 'https' then
410
- klass = Net::HTTP.const_get(method.to_s.capitalize)
411
-
412
- request ||= klass.new(uri.request_uri)
413
- request.body = params.first if params
414
-
415
- request
416
- when 'file' then
417
- Mechanize::FileRequest.new uri
418
- end
419
- end
420
-
421
- def request_add_headers request, headers = {}
422
- @request_headers.each do |k,v|
423
- request[k] = v
424
- end
425
-
426
- headers.each do |field, value|
427
- case field
428
- when :etag then request["ETag"] = value
429
- when :if_modified_since then request["If-Modified-Since"] = value
430
- when Symbol then
431
- raise ArgumentError, "unknown header symbol #{field}"
432
- else
433
- request[field] = value
434
- end
435
- end
436
- end
437
-
438
- def request_auth request, uri
439
- base_uri = uri + '/'
440
- schemes = @authenticate_methods[base_uri]
441
-
442
- if realm = schemes[:digest].find { |r| r.uri == base_uri } then
443
- request_auth_digest request, uri, realm, base_uri, false
444
- elsif realm = schemes[:iis_digest].find { |r| r.uri == base_uri } then
445
- request_auth_digest request, uri, realm, base_uri, true
446
- elsif schemes[:basic].find { |r| r.uri == base_uri } then
447
- request.basic_auth @user, @password
448
- end
449
- end
450
-
451
- def request_auth_digest request, uri, realm, base_uri, iis
452
- challenge = @digest_challenges[realm]
453
-
454
- uri.user = @user
455
- uri.password = @password
456
-
457
- auth = @digest_auth.auth_header uri, challenge.to_s, request.method, iis
458
- request['Authorization'] = auth
459
- end
460
-
461
- def request_cookies request, uri
462
- return if @cookie_jar.empty? uri
463
-
464
- cookies = @cookie_jar.cookies uri
465
-
466
- return if cookies.empty?
467
-
468
- request.add_field 'Cookie', cookies.join('; ')
469
- end
470
-
471
- def request_host request, uri
472
- port = [80, 443].include?(uri.port.to_i) ? nil : uri.port
473
- host = uri.host
474
-
475
- request['Host'] = [host, port].compact.join ':'
476
- end
477
-
478
- def request_language_charset request
479
- request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
480
- request['accept-language'] = 'en-us,en;q=0.5'
481
- end
482
-
483
- # Log specified headers for the request
484
- def request_log request
485
- return unless log
486
-
487
- log.info("#{request.class}: #{request.path}")
488
-
489
- request.each_header do |k, v|
490
- log.debug("request-header: #{k} => #{v}")
491
- end
492
- end
493
-
494
- def request_referer request, uri, referer
495
- return unless referer
496
- return if 'https' == referer.scheme.downcase and
497
- 'https' != uri.scheme.downcase
498
-
499
- request['Referer'] = referer
500
- end
501
-
502
- def request_user_agent request
503
- request['User-Agent'] = @user_agent if @user_agent
504
- end
505
-
506
- def resolve(uri, referer = current_page)
507
- uri = uri.dup if uri.is_a?(URI)
508
-
509
- unless uri.is_a?(URI)
510
- uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
511
- if RUBY_VERSION >= "1.9.0"
512
- Mechanize::Util.uri_escape(match)
513
- else
514
- sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
515
- end
516
- }
517
-
518
- unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
519
- escaped = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
520
-
521
- escaped_uri = Mechanize::Util.html_unescape(
522
- unescaped.zip(escaped).map { |x,y|
523
- "#{WEBrick::HTTPUtils.escape(x)}#{y}"
524
- }.join('')
525
- )
526
-
527
- begin
528
- uri = URI.parse(escaped_uri)
529
- rescue
530
- uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
531
- end
532
- end
533
-
534
- scheme = uri.relative? ? 'relative' : uri.scheme.downcase
535
- uri = @scheme_handlers[scheme].call(uri, referer)
536
-
537
- if referer && referer.uri
538
- if uri.path.length == 0 && uri.relative?
539
- uri.path = referer.uri.path
540
- end
541
- end
542
-
543
- uri.path = '/' if uri.path.length == 0
544
-
545
- if uri.relative?
546
- raise ArgumentError, "absolute URL needed (not #{uri})" unless
547
- referer && referer.uri
548
-
549
- base = nil
550
- if referer.respond_to?(:bases) && referer.parser
551
- base = referer.bases.last
552
- end
553
-
554
- uri = ((base && base.uri && base.uri.absolute?) ?
555
- base.uri :
556
- referer.uri) + uri
557
- uri = referer.uri + uri
558
- # Strip initial "/.." bits from the path
559
- uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
560
- end
561
-
562
- unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
563
- raise ArgumentError, "unsupported scheme: #{uri.scheme}"
564
- end
565
-
566
- uri
567
- end
568
-
569
- def resolve_parameters uri, method, parameters
570
- case method
571
- when :head, :get, :delete, :trace then
572
- if parameters and parameters.length > 0
573
- uri.query ||= ''
574
- uri.query << '&' if uri.query.length > 0
575
- uri.query << Mechanize::Util.build_query_string(parameters)
576
- end
577
-
578
- return uri, nil
579
- end
580
-
581
- return uri, parameters
582
- end
583
-
584
- # :section: Response
585
-
586
- def get_meta_refresh response, uri, page
587
- return nil unless @follow_meta_refresh
588
-
589
- if page.respond_to?(:meta_refresh) and
590
- (redirect = page.meta_refresh.first) then
591
- [redirect.delay, redirect.href] unless
592
- not @follow_meta_refresh_self and redirect.link_self
593
- elsif refresh = response['refresh']
594
- delay, href, link_self = Mechanize::Page::MetaRefresh.parse refresh, uri
595
- raise Mechanize::Error, 'Invalid refresh http header' unless delay
596
- [delay.to_f, href] unless
597
- not @follow_meta_refresh_self and link_self
598
- end
599
- end
600
-
601
- def response_authenticate(response, page, uri, request, headers, params,
602
- referer)
603
- raise Mechanize::UnauthorizedError, page unless @user || @password
604
-
605
- challenges = @authenticate_parser.parse response['www-authenticate']
606
-
607
- if challenge = challenges.find { |c| c.scheme =~ /^Digest$/i } then
608
- realm = challenge.realm uri
609
-
610
- auth_scheme = if response['server'] =~ /Microsoft-IIS/ then
611
- :iis_digest
612
- else
613
- :digest
614
- end
615
-
616
- existing_realms = @authenticate_methods[realm.uri][auth_scheme]
617
-
618
- raise Mechanize::UnauthorizedError, page if
619
- existing_realms.include? realm
620
-
621
- existing_realms << realm
622
- @digest_challenges[realm] = challenge
623
- elsif challenge = challenges.find { |c| c.scheme == 'NTLM' } then
624
- existing_realms = @authenticate_methods[uri + '/'][:ntlm]
625
-
626
- raise Mechanize::UnauthorizedError, page if
627
- existing_realms.include?(realm) and not challenge.params
628
-
629
- existing_realms << realm
630
-
631
- if challenge.params then
632
- type_2 = Net::NTLM::Message.decode64 challenge.params
633
-
634
- type_3 = type_2.response({ :user => @user, :password => @password, },
635
- { :ntlmv2 => true }).encode64
636
-
637
- headers['Authorization'] = "NTLM #{type_3}"
638
- else
639
- type_1 = Net::NTLM::Message::Type1.new.encode64
640
- headers['Authorization'] = "NTLM #{type_1}"
641
- end
642
- elsif challenge = challenges.find { |c| c.scheme == 'Basic' } then
643
- realm = challenge.realm uri
644
-
645
- existing_realms = @authenticate_methods[realm.uri][:basic]
646
-
647
- raise Mechanize::UnauthorizedError, page if
648
- existing_realms.include? realm
649
-
650
- existing_realms << realm
651
- else
652
- raise Mechanize::UnauthorizedError, page
653
- end
654
-
655
- fetch uri, request.method.downcase.to_sym, headers, params, referer
656
- end
657
-
658
- def response_content_encoding response, body_io
659
- length = response.content_length
660
-
661
- length = case body_io
662
- when IO, Tempfile then
663
- body_io.stat.size
664
- else
665
- body_io.length
666
- end unless length
667
-
668
- out_io = nil
669
-
670
- case response['Content-Encoding']
671
- when nil, 'none', '7bit' then
672
- out_io = body_io
673
- when 'deflate' then
674
- log.debug('deflate body') if log
675
-
676
- return if length.zero?
677
-
678
- begin
679
- out_io = inflate body_io
680
- rescue Zlib::BufError, Zlib::DataError
681
- log.error('Unable to inflate page, retrying with raw deflate') if log
682
- body_io.rewind
683
- begin
684
- out_io = inflate body_io, -Zlib::MAX_WBITS
685
- rescue Zlib::BufError, Zlib::DataError
686
- log.error("unable to inflate page: #{$!}") if log
687
- nil
688
- end
689
- end
690
- when 'gzip', 'x-gzip' then
691
- log.debug('gzip body') if log
692
-
693
- return if length.zero?
694
-
695
- begin
696
- zio = Zlib::GzipReader.new body_io
697
- out_io = Tempfile.new 'mechanize-decode', :encoding => 'ascii-8bit'
698
- out_io.binmode
699
-
700
- until zio.eof? do
701
- out_io.write zio.read 16384
702
- end
703
- rescue Zlib::BufError, Zlib::GzipFile::Error
704
- log.error('Unable to gunzip body, trying raw inflate') if log
705
- body_io.rewind
706
- body_io.read 10
707
-
708
- out_io = inflate body_io, -Zlib::MAX_WBITS
709
- rescue Zlib::DataError
710
- log.error("unable to gunzip page: #{$!}") if log
711
- ''
712
- ensure
713
- zio.close if zio and not zio.closed?
714
- end
715
- else
716
- raise Mechanize::Error,
717
- "Unsupported Content-Encoding: #{response['Content-Encoding']}"
718
- end
719
-
720
- out_io.flush
721
- out_io.rewind
722
-
723
- out_io
724
- end
725
-
726
- def response_cookies response, uri, page
727
- if Mechanize::Page === page and page.body =~ /Set-Cookie/n
728
- page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
729
- save_cookies(uri, meta['content'])
730
- end
731
- end
732
-
733
- header_cookies = response.get_fields 'Set-Cookie'
734
-
735
- return unless header_cookies
736
-
737
- header_cookies.each do |set_cookie|
738
- save_cookies(uri, set_cookie)
739
- end
740
- end
741
-
742
- def save_cookies(uri, set_cookie)
743
- log = log() # reduce method calls
744
- Mechanize::Cookie.parse(uri, set_cookie, log) { |c|
745
- if @cookie_jar.add(uri, c)
746
- log.debug("saved cookie: #{c}") if log
747
- else
748
- log.debug("rejected cookie: #{c}") if log
749
- end
750
- }
751
- end
752
-
753
- def response_follow_meta_refresh response, uri, page, redirects
754
- delay, new_url = get_meta_refresh(response, uri, page)
755
- return nil unless new_url
756
-
757
- raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
758
- redirects + 1 > @redirection_limit
759
-
760
- sleep delay
761
- @history.push(page, page.uri)
762
- fetch new_url, :get, {}, [],
763
- Mechanize::Page.new(nil, {'content-type'=>'text/html'}), redirects
764
- end
765
-
766
- def response_log response
767
- return unless log
768
-
769
- log.info("status: #{response.class} #{response.http_version} " \
770
- "#{response.code} #{response.message}")
771
-
772
- response.each_header do |k, v|
773
- log.debug("response-header: #{k} => #{v}")
774
- end
775
- end
776
-
777
- def response_parse response, body_io, uri
778
- @context.parse uri, response, body_io
779
- end
780
-
781
- def response_read response, request
782
- content_length = response.content_length
783
-
784
- if content_length and content_length > @max_file_buffer then
785
- body_io = Tempfile.new 'mechanize-raw'
786
- body_io.binmode if defined? body_io.binmode
787
- else
788
- body_io = StringIO.new
789
- end
790
-
791
- body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
792
- total = 0
793
-
794
- begin
795
- response.read_body { |part|
796
- total += part.length
797
-
798
- if StringIO === body_io and total > @max_file_buffer then
799
- new_io = Tempfile.new 'mechanize-raw'
800
- new_io.binmode if defined? binmode
801
- new_io.set_encoding(body_io.external_encoding)
802
- new_io.write body_io.string
803
-
804
- body_io = new_io
805
- end
806
-
807
- body_io.write(part)
808
- log.debug("Read #{part.length} bytes (#{total} total)") if log
809
- }
810
- rescue Net::HTTP::Persistent::Error => e
811
- body_io.rewind
812
- raise Mechanize::ResponseReadError.new(e, response, body_io)
813
- end
814
-
815
- body_io.flush
816
- body_io.rewind
817
-
818
- raise Mechanize::ResponseCodeError, response if
819
- Net::HTTPUnknownResponse === response
820
-
821
- content_length = response.content_length
822
-
823
- unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
824
- raise EOFError, "Content-Length (#{content_length}) does not match " \
825
- "response body length (#{body_io.length})" if
826
- content_length and content_length != body_io.length
827
- end
828
-
829
- body_io
830
- end
831
-
832
- def response_redirect response, method, page, redirects, referer = current_page
833
- case @redirect_ok
834
- when true, :all
835
- # shortcut
836
- when false, nil
837
- return page
838
- when :permanent
839
- return page unless Net::HTTPMovedPermanently === response
840
- end
841
-
842
- log.info("follow redirect to: #{response['Location']}") if log
843
-
844
- raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
845
- redirects + 1 > @redirection_limit
846
-
847
- redirect_method = method == :head ? :head : :get
848
-
849
- from_uri = page.uri
850
- @history.push(page, from_uri)
851
- new_uri = from_uri + response['Location'].to_s
852
-
853
- fetch new_uri, redirect_method, {}, [], referer, redirects + 1
854
- end
855
-
856
- # :section: Robots
857
-
858
- def get_robots(uri) # :nodoc:
859
- fetch(uri).body
860
- rescue Mechanize::ResponseCodeError => e
861
- return '' if e.response_code == '404'
862
- raise e
863
- end
864
-
865
- def robots= value
866
- require 'webrobots' if value
867
- @webrobots = nil if value != @robots
868
- @robots = value
869
- end
870
-
871
- ##
872
- # Tests if this agent is allowed to access +uri+, consulting the site's
873
- # robots.txt.
874
-
875
- def robots_allowed? uri
876
- return true if uri.request_uri == '/robots.txt'
877
-
878
- webrobots.allowed? uri
879
- end
880
-
881
- # Opposite of robots_allowed?
882
-
883
- def robots_disallowed? url
884
- !robots_allowed? url
885
- end
886
-
887
- # Returns an error object if there is an error in fetching or parsing
888
- # robots.txt of the site +url+.
889
- def robots_error(url)
890
- webrobots.error(url)
891
- end
892
-
893
- # Raises the error if there is an error in fetching or parsing robots.txt of
894
- # the site +url+.
895
- def robots_error!(url)
896
- webrobots.error!(url)
897
- end
898
-
899
- # Removes robots.txt cache for the site +url+.
900
- def robots_reset(url)
901
- webrobots.reset(url)
902
- end
903
-
904
- def webrobots
905
- @webrobots ||= WebRobots.new(@user_agent, :http_get => method(:get_robots))
906
- end
907
-
908
- # :section: SSL
909
-
910
- def certificate
911
- @http.certificate
912
- end
913
-
914
- # :section: Timeouts
915
-
916
- # Sets the connection idle timeout for persistent connections
917
- def idle_timeout= timeout
918
- @idle_timeout = timeout
919
- @http.idle_timeout = timeout if @http
920
- end
921
-
922
- # :section: Utility
923
-
924
- def inflate compressed, window_bits = nil
925
- inflate = Zlib::Inflate.new window_bits
926
- out_io = Tempfile.new 'mechanize-decode'
927
-
928
- until compressed.eof? do
929
- out_io.write inflate.inflate compressed.read 1024
930
- end
931
-
932
- out_io.write inflate.finish
933
-
934
- out_io
935
- end
936
-
937
- def log
938
- @context.log
939
- end
940
-
941
- def set_http
942
- @http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri
943
-
944
- @http.keep_alive = @keep_alive_time
945
- @http.idle_timeout = @idle_timeout if @idle_timeout
946
- @http.retry_change_requests = @retry_change_requests
947
-
948
- @http.ca_file = @ca_file
949
- @http.cert_store = @cert_store if @cert_store
950
- @http.verify_callback = @verify_callback
951
- @http.verify_mode = @verify_mode if @verify_mode
952
-
953
- # update our cached value
954
- @verify_mode = @http.verify_mode
955
- @cert_store = @http.cert_store
956
-
957
- if @cert and @key then
958
- cert = if OpenSSL::X509::Certificate === @cert then
959
- @cert
960
- else
961
- OpenSSL::X509::Certificate.new ::File.read @cert
962
- end
963
-
964
- key = if OpenSSL::PKey::PKey === @key then
965
- @key
966
- else
967
- OpenSSL::PKey::RSA.new ::File.read(@key), @pass
968
- end
969
-
970
- @http.certificate = cert
971
- @http.private_key = key
972
- end
973
- end
974
-
975
- ##
976
- # Sets the proxy address, port, user, and password. +addr+ should be a host,
977
- # with no "http://", +port+ may be a port number, service name or port
978
- # number string.
979
-
980
- def set_proxy(addr, port, user = nil, pass = nil)
981
- return unless addr and port
982
-
983
- unless Integer === port then
984
- begin
985
- port = Socket.getservbyname port
986
- rescue SocketError
987
- begin
988
- port = Integer port
989
- rescue ArgumentError
990
- raise ArgumentError, "invalid value for port: #{port.inspect}"
991
- end
992
- end
993
- end
994
-
995
- @proxy_uri = URI "http://#{addr}"
996
- @proxy_uri.port = port
997
- @proxy_uri.user = user if user
998
- @proxy_uri.password = pass if pass
999
-
1000
- @proxy_uri
1001
- end
1002
-
1003
- end
1004
-