mechanize 0.6.11 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (91)
  1. data/CHANGELOG.txt +8 -0
  2. data/Manifest.txt +31 -22
  3. data/lib/mechanize.rb +2 -652
  4. data/lib/www/mechanize.rb +635 -0
  5. data/lib/www/mechanize/content_type_error.rb +16 -0
  6. data/lib/www/mechanize/cookie.rb +64 -0
  7. data/lib/{mechanize/cookie.rb → www/mechanize/cookie_jar.rb} +0 -60
  8. data/lib/www/mechanize/file.rb +73 -0
  9. data/lib/www/mechanize/file_saver.rb +39 -0
  10. data/lib/{mechanize → www/mechanize}/form.rb +119 -137
  11. data/lib/www/mechanize/form/button.rb +8 -0
  12. data/lib/www/mechanize/form/check_box.rb +13 -0
  13. data/lib/www/mechanize/form/field.rb +28 -0
  14. data/lib/www/mechanize/form/file_upload.rb +24 -0
  15. data/lib/www/mechanize/form/image_button.rb +23 -0
  16. data/lib/www/mechanize/form/multi_select_list.rb +69 -0
  17. data/lib/www/mechanize/form/option.rb +51 -0
  18. data/lib/www/mechanize/form/radio_button.rb +38 -0
  19. data/lib/www/mechanize/form/select_list.rb +41 -0
  20. data/lib/www/mechanize/headers.rb +12 -0
  21. data/lib/{mechanize → www/mechanize}/history.rb +0 -0
  22. data/lib/{mechanize → www/mechanize}/inspect.rb +21 -28
  23. data/lib/{mechanize → www/mechanize}/list.rb +0 -0
  24. data/lib/{mechanize → www/mechanize}/monkey_patch.rb +19 -0
  25. data/lib/www/mechanize/page.rb +121 -0
  26. data/lib/www/mechanize/page/base.rb +10 -0
  27. data/lib/www/mechanize/page/frame.rb +22 -0
  28. data/lib/www/mechanize/page/link.rb +50 -0
  29. data/lib/www/mechanize/page/meta.rb +10 -0
  30. data/lib/www/mechanize/pluggable_parsers.rb +93 -0
  31. data/lib/{mechanize/errors.rb → www/mechanize/response_code_error.rb} +1 -13
  32. data/test/{test_includes.rb → helper.rb} +4 -18
  33. data/test/{test_servlets.rb → servlets.rb} +0 -0
  34. data/test/tc_authenticate.rb +1 -8
  35. data/test/tc_bad_links.rb +3 -10
  36. data/test/tc_blank_form.rb +1 -8
  37. data/test/tc_checkboxes.rb +1 -8
  38. data/test/tc_cookie_class.rb +1 -6
  39. data/test/tc_cookie_jar.rb +1 -7
  40. data/test/tc_cookies.rb +10 -17
  41. data/test/tc_encoded_links.rb +5 -12
  42. data/test/tc_errors.rb +4 -11
  43. data/test/tc_follow_meta.rb +1 -8
  44. data/test/tc_form_action.rb +6 -14
  45. data/test/tc_form_as_hash.rb +1 -9
  46. data/test/tc_form_button.rb +5 -8
  47. data/test/tc_form_no_inputname.rb +1 -8
  48. data/test/tc_forms.rb +16 -24
  49. data/test/tc_frames.rb +3 -10
  50. data/test/tc_gzipping.rb +2 -9
  51. data/test/tc_history.rb +5 -12
  52. data/test/tc_html_unscape_forms.rb +8 -15
  53. data/test/tc_if_modified_since.rb +1 -6
  54. data/test/tc_keep_alive.rb +1 -8
  55. data/test/tc_links.rb +12 -19
  56. data/test/tc_mech.rb +26 -34
  57. data/test/{test_mechanize_file.rb → tc_mechanize_file.rb} +1 -6
  58. data/test/tc_multi_select.rb +10 -17
  59. data/test/tc_no_attributes.rb +1 -8
  60. data/test/tc_page.rb +3 -10
  61. data/test/tc_pluggable_parser.rb +8 -15
  62. data/test/tc_post_form.rb +3 -10
  63. data/test/tc_pretty_print.rb +3 -10
  64. data/test/tc_radiobutton.rb +2 -9
  65. data/test/tc_referer.rb +13 -20
  66. data/test/tc_relative_links.rb +1 -8
  67. data/test/tc_response_code.rb +14 -21
  68. data/test/tc_save_file.rb +1 -9
  69. data/test/tc_select.rb +3 -10
  70. data/test/tc_select_all.rb +2 -10
  71. data/test/tc_select_none.rb +2 -10
  72. data/test/tc_select_noopts.rb +2 -9
  73. data/test/tc_set_fields.rb +2 -9
  74. data/test/tc_ssl_server.rb +5 -12
  75. data/test/tc_subclass.rb +2 -9
  76. data/test/tc_textarea.rb +2 -9
  77. data/test/tc_upload.rb +2 -9
  78. data/test/test_all.rb +4 -43
  79. metadata +96 -80
  80. data/lib/mechanize/form_elements.rb +0 -254
  81. data/lib/mechanize/net-overrides/net/http.rb +0 -2107
  82. data/lib/mechanize/net-overrides/net/https.rb +0 -172
  83. data/lib/mechanize/net-overrides/net/protocol.rb +0 -380
  84. data/lib/mechanize/page.rb +0 -138
  85. data/lib/mechanize/page_elements.rb +0 -77
  86. data/lib/mechanize/parsers/rexml_page.rb +0 -35
  87. data/lib/mechanize/pluggable_parsers.rb +0 -204
  88. data/lib/mechanize/rexml.rb +0 -236
  89. data/setup.rb +0 -1585
  90. data/test/tc_proxy.rb +0 -25
  91. data/test/tc_watches.rb +0 -32
data/CHANGELOG.txt CHANGED
@@ -1,5 +1,13 @@
1
1
  = Mechanize CHANGELOG
2
2
 
3
+ == 0.7.0
4
+
5
+ * Removed Ruby 1.8.2 support
6
+ * Changed parser to lazily parse links
7
+ * Lazily parsing document
8
+ * Adding verify_callback for SSL requests. Thanks Mike Dalessio!
9
+ * Fixed a bug with Accept-Language header. Thanks Bill Siggelkow.
10
+
3
11
  == 0.6.11
4
12
 
5
13
  * Detecting single quotes in meta redirects.
data/Manifest.txt CHANGED
@@ -13,28 +13,40 @@ eg/proxy_req.rb
13
13
  eg/rubyforge.rb
14
14
  eg/spider.rb
15
15
  lib/mechanize.rb
16
- lib/mechanize/cookie.rb
17
- lib/mechanize/errors.rb
18
- lib/mechanize/form.rb
19
- lib/mechanize/form_elements.rb
20
- lib/mechanize/history.rb
21
- lib/mechanize/inspect.rb
22
- lib/mechanize/list.rb
23
- lib/mechanize/monkey_patch.rb
24
- lib/mechanize/net-overrides/net/http.rb
25
- lib/mechanize/net-overrides/net/https.rb
26
- lib/mechanize/net-overrides/net/protocol.rb
27
- lib/mechanize/page.rb
28
- lib/mechanize/page_elements.rb
29
- lib/mechanize/parsers/rexml_page.rb
30
- lib/mechanize/pluggable_parsers.rb
31
- lib/mechanize/rexml.rb
32
- setup.rb
16
+ lib/www/mechanize.rb
17
+ lib/www/mechanize/content_type_error.rb
18
+ lib/www/mechanize/cookie.rb
19
+ lib/www/mechanize/cookie_jar.rb
20
+ lib/www/mechanize/file.rb
21
+ lib/www/mechanize/file_saver.rb
22
+ lib/www/mechanize/form.rb
23
+ lib/www/mechanize/form/button.rb
24
+ lib/www/mechanize/form/check_box.rb
25
+ lib/www/mechanize/form/field.rb
26
+ lib/www/mechanize/form/file_upload.rb
27
+ lib/www/mechanize/form/image_button.rb
28
+ lib/www/mechanize/form/multi_select_list.rb
29
+ lib/www/mechanize/form/option.rb
30
+ lib/www/mechanize/form/radio_button.rb
31
+ lib/www/mechanize/form/select_list.rb
32
+ lib/www/mechanize/headers.rb
33
+ lib/www/mechanize/history.rb
34
+ lib/www/mechanize/inspect.rb
35
+ lib/www/mechanize/list.rb
36
+ lib/www/mechanize/monkey_patch.rb
37
+ lib/www/mechanize/page.rb
38
+ lib/www/mechanize/page/base.rb
39
+ lib/www/mechanize/page/frame.rb
40
+ lib/www/mechanize/page/link.rb
41
+ lib/www/mechanize/page/meta.rb
42
+ lib/www/mechanize/pluggable_parsers.rb
43
+ lib/www/mechanize/response_code_error.rb
33
44
  test/data/htpasswd
34
45
  test/data/server.crt
35
46
  test/data/server.csr
36
47
  test/data/server.key
37
48
  test/data/server.pem
49
+ test/helper.rb
38
50
  test/htdocs/alt_text.html
39
51
  test/htdocs/bad_form_test.html
40
52
  test/htdocs/button.jpg
@@ -73,6 +85,7 @@ test/htdocs/tc_referer.html
73
85
  test/htdocs/tc_relative_links.html
74
86
  test/htdocs/tc_textarea.html
75
87
  test/htdocs/unusual______.html
88
+ test/servlets.rb
76
89
  test/ssl_server.rb
77
90
  test/tc_authenticate.rb
78
91
  test/tc_bad_links.rb
@@ -97,13 +110,13 @@ test/tc_if_modified_since.rb
97
110
  test/tc_keep_alive.rb
98
111
  test/tc_links.rb
99
112
  test/tc_mech.rb
113
+ test/tc_mechanize_file.rb
100
114
  test/tc_multi_select.rb
101
115
  test/tc_no_attributes.rb
102
116
  test/tc_page.rb
103
117
  test/tc_pluggable_parser.rb
104
118
  test/tc_post_form.rb
105
119
  test/tc_pretty_print.rb
106
- test/tc_proxy.rb
107
120
  test/tc_radiobutton.rb
108
121
  test/tc_referer.rb
109
122
  test/tc_relative_links.rb
@@ -118,8 +131,4 @@ test/tc_ssl_server.rb
118
131
  test/tc_subclass.rb
119
132
  test/tc_textarea.rb
120
133
  test/tc_upload.rb
121
- test/tc_watches.rb
122
134
  test/test_all.rb
123
- test/test_includes.rb
124
- test/test_mechanize_file.rb
125
- test/test_servlets.rb
data/lib/mechanize.rb CHANGED
@@ -1,657 +1,7 @@
1
- # Original Code:
2
1
  # Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
3
- #
4
- # New Code:
5
- # Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org)
2
+ # Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org)
6
3
  #
7
4
  # Please see the LICENSE file for licensing.
8
- #
9
-
10
- # required due to the missing get_fields method in Ruby 1.8.2
11
- unless RUBY_VERSION > "1.8.2"
12
- $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
13
- end
14
-
15
- require 'net/http'
16
- require 'net/https'
17
-
18
- # Monkey patch for ruby 1.8.4
19
- unless RUBY_VERSION > "1.8.4"
20
- module Net # :nodoc:
21
- class HTTPResponse # :nodoc:
22
- CODE_TO_OBJ['500'] = HTTPInternalServerError
23
- end
24
- end
25
- end
26
-
27
- require 'uri'
28
- require 'webrick/httputils'
29
- require 'zlib'
30
- require 'stringio'
31
- require 'digest/md5'
32
- require 'mechanize/monkey_patch'
33
- require 'mechanize/cookie'
34
- require 'mechanize/errors'
35
- require 'mechanize/pluggable_parsers'
36
- require 'mechanize/form'
37
- require 'mechanize/form_elements'
38
- require 'mechanize/history'
39
- require 'mechanize/list'
40
- require 'mechanize/page'
41
- require 'mechanize/page_elements'
42
- require 'mechanize/inspect'
43
-
44
- module WWW
45
-
46
- # = Synopsis
47
- # The Mechanize library is used for automating interaction with a website. It
48
- # can follow links, and submit forms. Form fields can be populated and
49
- # submitted. A history of URL's is maintained and can be queried.
50
- #
51
- # == Example
52
- # require 'rubygems'
53
- # require 'mechanize'
54
- # require 'logger'
55
- #
56
- # agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
57
- # agent.user_agent_alias = 'Mac Safari'
58
- # page = agent.get("http://www.google.com/")
59
- # search_form = page.forms.name("f").first
60
- # search_form.fields.name("q").value = "Hello"
61
- # search_results = agent.submit(search_form)
62
- # puts search_results.body
63
- class Mechanize
64
- ##
65
- # The version of Mechanize you are using.
66
-
67
- VERSION = '0.6.11'
68
-
69
- ##
70
- # User Agent aliases
71
- AGENT_ALIASES = {
72
- 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
73
- 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
74
- 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
75
- 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
76
- 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
77
- 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
78
- 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
79
- 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
80
- 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
81
- }
82
-
83
- attr_accessor :cookie_jar
84
- attr_accessor :log
85
- attr_accessor :open_timeout, :read_timeout
86
- attr_accessor :user_agent
87
- attr_accessor :watch_for_set
88
- attr_accessor :ca_file
89
- attr_accessor :key
90
- attr_accessor :cert
91
- attr_accessor :pass
92
- attr_accessor :redirect_ok
93
- attr_accessor :keep_alive_time
94
- attr_accessor :keep_alive
95
- attr_accessor :conditional_requests
96
- attr_accessor :follow_meta_refresh
97
-
98
- attr_reader :history
99
- attr_reader :pluggable_parser
100
-
101
- alias :follow_redirect? :redirect_ok
102
-
103
- @@nonce_count = -1
104
- CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
105
-
106
- def initialize
107
- # attr_accessors
108
- @cookie_jar = CookieJar.new
109
- @log = nil
110
- @open_timeout = nil
111
- @read_timeout = nil
112
- @user_agent = AGENT_ALIASES['Mechanize']
113
- @watch_for_set = nil
114
- @ca_file = nil
115
- @cert = nil # OpenSSL Certificate
116
- @key = nil # OpenSSL Private Key
117
- @pass = nil # OpenSSL Password
118
- @redirect_ok = true # Should we follow redirects?
119
-
120
- # attr_readers
121
- @history = WWW::Mechanize::History.new
122
- @pluggable_parser = PluggableParser.new
123
-
124
- # Auth variables
125
- @user = nil # Auth User
126
- @password = nil # Auth Password
127
- @digest = nil # DigestAuth Digest
128
- @auth_hash = {} # Keep track of urls for sending auth
129
-
130
- # Proxy settings
131
- @proxy_addr = nil
132
- @proxy_pass = nil
133
- @proxy_port = nil
134
- @proxy_user = nil
135
-
136
- @conditional_requests = true
137
-
138
- @follow_meta_refresh = false
139
-
140
- # Connection Cache & Keep alive
141
- @connection_cache = {}
142
- @keep_alive_time = 300
143
- @keep_alive = true
144
-
145
- yield self if block_given?
146
- end
147
-
148
- def max_history=(length); @history.max_size = length; end
149
- def max_history; @history.max_size; end
150
-
151
- # Sets the proxy address, port, user, and password
152
- def set_proxy(addr, port, user = nil, pass = nil)
153
- @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
154
- end
155
-
156
- # Set the user agent for the Mechanize object.
157
- # See AGENT_ALIASES
158
- def user_agent_alias=(al)
159
- self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
160
- end
161
-
162
- # Returns a list of cookies stored in the cookie jar.
163
- def cookies
164
- @cookie_jar.to_a
165
- end
166
-
167
- # Sets the user and password to be used for basic authentication.
168
- def basic_auth(user, password)
169
- auth(user, password)
170
- end
171
-
172
- def auth(user, password)
173
- @user = user
174
- @password = password
175
- end
176
-
177
- # Fetches the URL passed in and returns a page.
178
- def get(url, referer=nil, &block)
179
- cur_page = referer || current_page ||
180
- Page.new( nil, {'content-type'=>'text/html'})
181
-
182
- # fetch the page
183
- abs_uri = to_absolute_uri(url, cur_page)
184
- request = fetch_request(abs_uri)
185
- page = fetch_page(abs_uri, request, cur_page, &block)
186
- add_to_history(page)
187
- page
188
- end
189
-
190
- # Fetch a file and return the contents of the file.
191
- def get_file(url)
192
- get(url).body
193
- end
194
-
195
-
196
- # Clicks the WWW::Mechanize::Link object passed in and returns the
197
- # page fetched.
198
- def click(link)
199
- referer =
200
- begin
201
- link.page
202
- rescue
203
- nil
204
- end
205
- uri = to_absolute_uri(
206
- link.attributes['href'] || link.attributes['src'] || link.href,
207
- referer || current_page()
208
- )
209
- get(uri, referer)
210
- end
211
-
212
- # Equivalent to the browser back button. Returns the most recent page
213
- # visited.
214
- def back
215
- @history.pop
216
- end
217
-
218
- # Posts to the given URL wht the query parameters passed in. Query
219
- # parameters can be passed as a hash, or as an array of arrays.
220
- # Example:
221
- # agent.post('http://example.com/', "foo" => "bar")
222
- # or
223
- # agent.post('http://example.com/', [ ["foo", "bar"] ])
224
- def post(url, query={})
225
- node = Hpricot::Elem.new(Hpricot::STag.new('form'))
226
- node['method'] = 'POST'
227
- node['enctype'] = 'application/x-www-form-urlencoded'
228
-
229
- form = Form.new(node)
230
- query.each { |k,v|
231
- form.fields << Field.new(k,v)
232
- }
233
- post_form(url, form)
234
- end
235
-
236
- # Submit a form with an optional button.
237
- # Without a button:
238
- # page = agent.get('http://example.com')
239
- # agent.submit(page.forms.first)
240
- # With a button
241
- # agent.submit(page.forms.first, page.forms.first.buttons.first)
242
- def submit(form, button=nil)
243
- form.add_button_to_query(button) if button
244
- uri = to_absolute_uri(form.action, form.page)
245
- case form.method.upcase
246
- when 'POST'
247
- post_form(uri, form)
248
- when 'GET'
249
- uri.query = WWW::Mechanize.build_query_string(form.build_query)
250
- get(uri)
251
- else
252
- raise "unsupported method: #{form.method.upcase}"
253
- end
254
- end
255
-
256
- # Returns the current page loaded by Mechanize
257
- def current_page
258
- @history.last
259
- end
260
-
261
- # Returns whether or not a url has been visited
262
- def visited?(url)
263
- ! visited_page(url).nil?
264
- end
265
-
266
- # Returns a visited page for the url passed in, otherwise nil
267
- def visited_page(url)
268
- if url.respond_to? :href
269
- url = url.href
270
- end
271
- @history.visited_page(to_absolute_uri(url))
272
- end
273
-
274
- # Runs given block, then resets the page history as it was before. self is
275
- # given as a parameter to the block. Returns the value of the block.
276
- def transact
277
- history_backup = @history.dup
278
- begin
279
- yield self
280
- ensure
281
- @history = history_backup
282
- end
283
- end
284
-
285
- alias :page :current_page
286
-
287
- protected
288
- def set_headers(uri, request, cur_page)
289
- if @keep_alive
290
- request.add_field('Connection', 'keep-alive')
291
- request.add_field('Keep-Alive', keep_alive_time.to_s)
292
- else
293
- request.add_field('Connection', 'close')
294
- end
295
- request.add_field('Accept-Encoding', 'gzip,identity')
296
- request.add_field('Accept-Language', 'en-us,en;q0.5')
297
- request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
298
-
299
- unless @cookie_jar.empty?(uri)
300
- cookies = @cookie_jar.cookies(uri)
301
- cookie = cookies.length > 0 ? cookies.join("; ") : nil
302
- if log
303
- cookies.each do |c|
304
- log.debug("using cookie: #{c}")
305
- end
306
- end
307
- request.add_field('Cookie', cookie)
308
- end
309
-
310
- # Add Referer header to request
311
- unless cur_page.uri.nil?
312
- request.add_field('Referer', cur_page.uri.to_s)
313
- end
314
-
315
- # Add User-Agent header to request
316
- request.add_field('User-Agent', @user_agent) if @user_agent
317
-
318
- # Add If-Modified-Since if page is in history
319
- if @conditional_requests
320
- if( (page = visited_page(uri)) && page.response['Last-Modified'] )
321
- request.add_field('If-Modified-Since', page.response['Last-Modified'])
322
- end
323
- end
324
-
325
- if( @auth_hash[uri.host] )
326
- case @auth_hash[uri.host]
327
- when :basic
328
- request.basic_auth(@user, @password)
329
- when :digest
330
- @digest_response ||= nil
331
- @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
332
- request.add_field('Authorization', @digest_response) if @digest_response
333
- end
334
- end
335
-
336
- request
337
- end
338
-
339
- def gen_auth_header(uri, request, auth_header, is_IIS = false)
340
- @@nonce_count += 1
341
-
342
- user = @digest_user
343
- password = @digest_password
344
-
345
- auth_header =~ /^(\w+) (.*)/
346
-
347
- params = {}
348
- $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }
349
-
350
- a_1 = "#{@user}:#{params['realm']}:#{@password}"
351
- a_2 = "#{request.method}:#{uri.path}"
352
- request_digest = ''
353
- request_digest << Digest::MD5.hexdigest(a_1)
354
- request_digest << ':' << params['nonce']
355
- request_digest << ':' << ('%08x' % @@nonce_count)
356
- request_digest << ':' << CNONCE
357
- request_digest << ':' << params['qop']
358
- request_digest << ':' << Digest::MD5.hexdigest(a_2)
359
-
360
- header = ''
361
- header << "Digest username=\"#{@user}\", "
362
- header << "realm=\"#{params['realm']}\", "
363
- if is_IIS then
364
- header << "qop=\"#{params['qop']}\", "
365
- else
366
- header << "qop=#{params['qop']}, "
367
- end
368
- header << "uri=\"#{uri.path}\", "
369
- header << "algorithm=MD5, "
370
- header << "nonce=\"#{params['nonce']}\", "
371
- header << "nc=#{'%08x' % @@nonce_count}, "
372
- header << "cnonce=\"#{CNONCE}\", "
373
- header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
374
-
375
- return header
376
- end
377
-
378
- private
379
-
380
- def to_absolute_uri(url, cur_page=current_page())
381
- unless url.is_a? URI
382
- url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
383
- sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
384
- }
385
-
386
- url = URI.parse(
387
- Util.html_unescape(
388
- url.split(/%[0-9A-Fa-f]{2}|#/).zip(
389
- url.scan(/%[0-9A-Fa-f]{2}|#/)
390
- ).map { |x,y|
391
- "#{URI.escape(x)}#{y}"
392
- }.join('')
393
- )
394
- )
395
- end
396
-
397
- url.path = '/' if url.path.length == 0
398
-
399
- # construct an absolute uri
400
- if url.relative?
401
- raise 'no history. please specify an absolute URL' unless cur_page.uri
402
- base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
403
- url = ((base && base.uri && base.uri.absolute?) ?
404
- base.uri :
405
- cur_page.uri) + url
406
- url = cur_page.uri + url
407
- # Strip initial "/.." bits from the path
408
- url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
409
- end
410
-
411
- return url
412
- end
413
-
414
- def post_form(url, form)
415
- cur_page = form.page || current_page ||
416
- Page.new( nil, {'content-type'=>'text/html'})
417
-
418
- request_data = form.request_data
419
-
420
- abs_url = to_absolute_uri(url, cur_page)
421
- request = fetch_request(abs_url, :post)
422
- request.add_field('Content-Type', form.enctype)
423
- request.add_field('Content-Length', request_data.size.to_s)
424
-
425
- log.debug("query: #{ request_data.inspect }") if log
426
-
427
- # fetch the page
428
- page = fetch_page(abs_url, request, cur_page, [request_data])
429
- add_to_history(page)
430
- page
431
- end
432
-
433
- # Creates a new request object based on the scheme and type
434
- def fetch_request(uri, type = :get)
435
- raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
436
- if type == :get
437
- Net::HTTP::Get.new(uri.request_uri)
438
- else
439
- Net::HTTP::Post.new(uri.request_uri)
440
- end
441
- end
442
-
443
- # uri is an absolute URI
444
- def fetch_page(uri, request, cur_page=current_page(), request_data=[])
445
- raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
446
-
447
- log.info("#{ request.class }: #{ request.path }") if log
448
-
449
- page = nil
450
-
451
- cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
452
- :connection => nil,
453
- :keep_alive_options => {},
454
- })
455
- http_obj = cache_obj[:connection]
456
- if http_obj.nil? || ! http_obj.started?
457
- http_obj = cache_obj[:connection] =
458
- Net::HTTP.new( uri.host,
459
- uri.port,
460
- @proxy_addr,
461
- @proxy_port,
462
- @proxy_user,
463
- @proxy_pass
464
- )
465
- cache_obj[:keep_alive_options] = {}
466
-
467
- # Specify timeouts if given
468
- http_obj.open_timeout = @open_timeout if @open_timeout
469
- http_obj.read_timeout = @read_timeout if @read_timeout
470
- end
471
-
472
- if uri.scheme == 'https' && ! http_obj.started?
473
- http_obj.use_ssl = true
474
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
475
- if @ca_file
476
- http_obj.ca_file = @ca_file
477
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
478
- end
479
- if @cert && @key
480
- http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
481
- http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
482
- end
483
- end
484
-
485
- # If we're keeping connections alive and the last request time is too
486
- # long ago, stop the connection. Or, if the max requests left is 1,
487
- # reset the connection.
488
- if @keep_alive && http_obj.started?
489
- opts = cache_obj[:keep_alive_options]
490
- if((opts[:timeout] &&
491
- Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
492
- opts[:max] && opts[:max].to_i == 1)
493
-
494
- log.debug('Finishing stale connection') if log
495
- http_obj.finish
496
-
497
- end
498
- end
499
-
500
- http_obj.start unless http_obj.started?
501
-
502
- request = set_headers(uri, request, cur_page)
503
-
504
- # Log specified headers for the request
505
- if log
506
- request.each_header do |k, v|
507
- log.debug("request-header: #{ k } => #{ v }")
508
- end
509
- end
510
-
511
- cache_obj[:last_request_time] = Time.now.to_i
512
-
513
- # Send the request
514
- response = http_obj.request(request, *request_data) {|response|
515
-
516
- body = StringIO.new
517
- total = 0
518
- response.read_body { |part|
519
- total += part.length
520
- body.write(part)
521
- log.debug("Read #{total} bytes") if log
522
- }
523
- body.rewind
524
-
525
- response.each_header { |k,v|
526
- log.debug("response-header: #{ k } => #{ v }")
527
- } if log
528
-
529
- content_type = nil
530
- unless response['Content-Type'].nil?
531
- data = response['Content-Type'].match(/^([^;]*)/)
532
- content_type = data[1].downcase unless data.nil?
533
- end
534
-
535
- response_body =
536
- if encoding = response['Content-Encoding']
537
- case encoding.downcase
538
- when 'gzip'
539
- log.debug('gunzip body') if log
540
- Zlib::GzipReader.new(body).read
541
- when 'x-gzip'
542
- body.read
543
- else
544
- raise 'Unsupported content encoding'
545
- end
546
- else
547
- body.read
548
- end
549
-
550
- # Find our pluggable parser
551
- page = @pluggable_parser.parser(content_type).new(
552
- uri,
553
- response,
554
- response_body,
555
- response.code
556
- ) { |parser|
557
- parser.mech = self if parser.respond_to? :mech=
558
- if parser.respond_to?(:watch_for_set=) && @watch_for_set
559
- parser.watch_for_set = @watch_for_set
560
- end
561
- }
562
-
563
- }
564
-
565
- # If the server sends back keep alive options, save them
566
- if keep_alive_info = response['keep-alive']
567
- keep_alive_info.split(/,\s*/).each do |option|
568
- k, v = option.split(/=/)
569
- cache_obj[:keep_alive_options] ||= {}
570
- cache_obj[:keep_alive_options][k.intern] = v
571
- end
572
- end
573
-
574
- (response.get_fields('Set-Cookie')||[]).each do |cookie|
575
- Cookie::parse(uri, cookie, log) { |c|
576
- log.debug("saved cookie: #{c}") if log
577
- @cookie_jar.add(uri, c)
578
- }
579
- end
580
-
581
- log.info("status: #{ page.code }") if log
582
-
583
- res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
584
-
585
- if follow_meta_refresh && page.respond_to?(:meta) &&
586
- (redirect = page.meta.first)
587
- return redirect.click
588
- end
589
-
590
- return page if res_klass <= Net::HTTPSuccess
591
-
592
- if res_klass == Net::HTTPNotModified
593
- log.debug("Got cached page") if log
594
- return visited_page(uri)
595
- elsif res_klass <= Net::HTTPRedirection
596
- return page unless follow_redirect?
597
- log.info("follow redirect to: #{ response['Location'] }") if log
598
- from_uri = page.uri
599
- abs_uri = to_absolute_uri(response['Location'].to_s, page)
600
- page = fetch_page(abs_uri, fetch_request(abs_uri), page)
601
- @history.push(page, from_uri)
602
- return page
603
- elsif res_klass <= Net::HTTPUnauthorized
604
- raise ResponseCodeError.new(page) unless @user || @password
605
- raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
606
- if response['www-authenticate'] =~ /Digest/i
607
- @auth_hash[uri.host] = :digest
608
- @digest = response['www-authenticate']
609
- else
610
- @auth_hash[uri.host] = :basic
611
- end
612
- return fetch_page( uri,
613
- fetch_request(uri, request.method.downcase.to_sym),
614
- cur_page,
615
- request_data
616
- )
617
- end
618
-
619
- raise ResponseCodeError.new(page), "Unhandled response", caller
620
- end
621
-
622
- def self.build_query_string(parameters)
623
- vals = []
624
- parameters.each { |k,v|
625
- next if k.nil?
626
- vals <<
627
- [WEBrick::HTTPUtils.escape_form(k),
628
- WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
629
- }
630
-
631
- vals.join("&")
632
- end
633
-
634
- def add_to_history(page)
635
- @history.push(page, to_absolute_uri(page.uri))
636
- end
637
-
638
- # :stopdoc:
639
- class Util
640
- def self.html_unescape(s)
641
- return s unless s
642
- s.gsub(/&(\w+|#[0-9]+);/) { |match|
643
- number = case match
644
- when /&(\w+);/
645
- Hpricot::NamedCharacters[$1]
646
- when /&#([0-9]+);/
647
- $1.to_i
648
- end
649
5
 
650
- number ? ([number].pack('U') rescue match) : match
651
- }
652
- end
653
- end
654
- # :startdoc:
655
- end
656
6
 
657
- end # module WWW
7
+ require 'www/mechanize'