mechanize 0.6.11 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/CHANGELOG.txt +8 -0
- data/Manifest.txt +31 -22
- data/lib/mechanize.rb +2 -652
- data/lib/www/mechanize.rb +635 -0
- data/lib/www/mechanize/content_type_error.rb +16 -0
- data/lib/www/mechanize/cookie.rb +64 -0
- data/lib/{mechanize/cookie.rb → www/mechanize/cookie_jar.rb} +0 -60
- data/lib/www/mechanize/file.rb +73 -0
- data/lib/www/mechanize/file_saver.rb +39 -0
- data/lib/{mechanize → www/mechanize}/form.rb +119 -137
- data/lib/www/mechanize/form/button.rb +8 -0
- data/lib/www/mechanize/form/check_box.rb +13 -0
- data/lib/www/mechanize/form/field.rb +28 -0
- data/lib/www/mechanize/form/file_upload.rb +24 -0
- data/lib/www/mechanize/form/image_button.rb +23 -0
- data/lib/www/mechanize/form/multi_select_list.rb +69 -0
- data/lib/www/mechanize/form/option.rb +51 -0
- data/lib/www/mechanize/form/radio_button.rb +38 -0
- data/lib/www/mechanize/form/select_list.rb +41 -0
- data/lib/www/mechanize/headers.rb +12 -0
- data/lib/{mechanize → www/mechanize}/history.rb +0 -0
- data/lib/{mechanize → www/mechanize}/inspect.rb +21 -28
- data/lib/{mechanize → www/mechanize}/list.rb +0 -0
- data/lib/{mechanize → www/mechanize}/monkey_patch.rb +19 -0
- data/lib/www/mechanize/page.rb +121 -0
- data/lib/www/mechanize/page/base.rb +10 -0
- data/lib/www/mechanize/page/frame.rb +22 -0
- data/lib/www/mechanize/page/link.rb +50 -0
- data/lib/www/mechanize/page/meta.rb +10 -0
- data/lib/www/mechanize/pluggable_parsers.rb +93 -0
- data/lib/{mechanize/errors.rb → www/mechanize/response_code_error.rb} +1 -13
- data/test/{test_includes.rb → helper.rb} +4 -18
- data/test/{test_servlets.rb → servlets.rb} +0 -0
- data/test/tc_authenticate.rb +1 -8
- data/test/tc_bad_links.rb +3 -10
- data/test/tc_blank_form.rb +1 -8
- data/test/tc_checkboxes.rb +1 -8
- data/test/tc_cookie_class.rb +1 -6
- data/test/tc_cookie_jar.rb +1 -7
- data/test/tc_cookies.rb +10 -17
- data/test/tc_encoded_links.rb +5 -12
- data/test/tc_errors.rb +4 -11
- data/test/tc_follow_meta.rb +1 -8
- data/test/tc_form_action.rb +6 -14
- data/test/tc_form_as_hash.rb +1 -9
- data/test/tc_form_button.rb +5 -8
- data/test/tc_form_no_inputname.rb +1 -8
- data/test/tc_forms.rb +16 -24
- data/test/tc_frames.rb +3 -10
- data/test/tc_gzipping.rb +2 -9
- data/test/tc_history.rb +5 -12
- data/test/tc_html_unscape_forms.rb +8 -15
- data/test/tc_if_modified_since.rb +1 -6
- data/test/tc_keep_alive.rb +1 -8
- data/test/tc_links.rb +12 -19
- data/test/tc_mech.rb +26 -34
- data/test/{test_mechanize_file.rb → tc_mechanize_file.rb} +1 -6
- data/test/tc_multi_select.rb +10 -17
- data/test/tc_no_attributes.rb +1 -8
- data/test/tc_page.rb +3 -10
- data/test/tc_pluggable_parser.rb +8 -15
- data/test/tc_post_form.rb +3 -10
- data/test/tc_pretty_print.rb +3 -10
- data/test/tc_radiobutton.rb +2 -9
- data/test/tc_referer.rb +13 -20
- data/test/tc_relative_links.rb +1 -8
- data/test/tc_response_code.rb +14 -21
- data/test/tc_save_file.rb +1 -9
- data/test/tc_select.rb +3 -10
- data/test/tc_select_all.rb +2 -10
- data/test/tc_select_none.rb +2 -10
- data/test/tc_select_noopts.rb +2 -9
- data/test/tc_set_fields.rb +2 -9
- data/test/tc_ssl_server.rb +5 -12
- data/test/tc_subclass.rb +2 -9
- data/test/tc_textarea.rb +2 -9
- data/test/tc_upload.rb +2 -9
- data/test/test_all.rb +4 -43
- metadata +96 -80
- data/lib/mechanize/form_elements.rb +0 -254
- data/lib/mechanize/net-overrides/net/http.rb +0 -2107
- data/lib/mechanize/net-overrides/net/https.rb +0 -172
- data/lib/mechanize/net-overrides/net/protocol.rb +0 -380
- data/lib/mechanize/page.rb +0 -138
- data/lib/mechanize/page_elements.rb +0 -77
- data/lib/mechanize/parsers/rexml_page.rb +0 -35
- data/lib/mechanize/pluggable_parsers.rb +0 -204
- data/lib/mechanize/rexml.rb +0 -236
- data/setup.rb +0 -1585
- data/test/tc_proxy.rb +0 -25
- data/test/tc_watches.rb +0 -32
@@ -0,0 +1,635 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'net/https'
|
3
|
+
require 'uri'
|
4
|
+
require 'webrick/httputils'
|
5
|
+
require 'zlib'
|
6
|
+
require 'stringio'
|
7
|
+
require 'digest/md5'
|
8
|
+
|
9
|
+
require 'www/mechanize/content_type_error'
|
10
|
+
require 'www/mechanize/response_code_error'
|
11
|
+
require 'www/mechanize/cookie'
|
12
|
+
require 'www/mechanize/cookie_jar'
|
13
|
+
require 'www/mechanize/history'
|
14
|
+
require 'www/mechanize/list'
|
15
|
+
require 'www/mechanize/form'
|
16
|
+
require 'www/mechanize/pluggable_parsers'
|
17
|
+
require 'www/mechanize/inspect'
|
18
|
+
require 'www/mechanize/monkey_patch'
|
19
|
+
|
20
|
+
module WWW
|
21
|
+
# = Synopsis
|
22
|
+
# The Mechanize library is used for automating interaction with a website. It
|
23
|
+
# can follow links, and submit forms. Form fields can be populated and
|
24
|
+
# submitted. A history of URLs is maintained and can be queried.
|
25
|
+
#
|
26
|
+
# == Example
|
27
|
+
# require 'rubygems'
|
28
|
+
# require 'mechanize'
|
29
|
+
# require 'logger'
|
30
|
+
#
|
31
|
+
# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
|
32
|
+
# agent.user_agent_alias = 'Mac Safari'
|
33
|
+
# page = agent.get("http://www.google.com/")
|
34
|
+
# search_form = page.forms.name("f").first
|
35
|
+
# search_form.fields.name("q").value = "Hello"
|
36
|
+
# search_results = agent.submit(search_form)
|
37
|
+
# puts search_results.body
|
38
|
+
class Mechanize
|
39
|
+
##
|
40
|
+
# The version of Mechanize you are using.
|
41
|
+
VERSION = '0.7.0'
|
42
|
+
|
43
|
+
##
|
44
|
+
# User Agent aliases
|
45
|
+
AGENT_ALIASES = {
|
46
|
+
'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
47
|
+
'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
48
|
+
'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
|
49
|
+
'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
|
50
|
+
'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
|
51
|
+
'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
|
52
|
+
'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
|
53
|
+
'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
|
54
|
+
'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
|
55
|
+
}
|
56
|
+
|
57
|
+
attr_accessor :cookie_jar
|
58
|
+
attr_accessor :log
|
59
|
+
attr_accessor :open_timeout, :read_timeout
|
60
|
+
attr_accessor :user_agent
|
61
|
+
attr_accessor :watch_for_set
|
62
|
+
attr_accessor :ca_file
|
63
|
+
attr_accessor :key
|
64
|
+
attr_accessor :cert
|
65
|
+
attr_accessor :pass
|
66
|
+
attr_accessor :redirect_ok
|
67
|
+
attr_accessor :keep_alive_time
|
68
|
+
attr_accessor :keep_alive
|
69
|
+
attr_accessor :conditional_requests
|
70
|
+
attr_accessor :follow_meta_refresh
|
71
|
+
attr_accessor :verify_callback
|
72
|
+
|
73
|
+
attr_reader :history
|
74
|
+
attr_reader :pluggable_parser
|
75
|
+
|
76
|
+
alias :follow_redirect? :redirect_ok
|
77
|
+
|
78
|
+
@@nonce_count = -1
|
79
|
+
CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
|
80
|
+
|
81
|
+
# Creates an agent with its default configuration. When a block is given the
# new agent is yielded to it after all defaults have been assigned.
def initialize
  # Settings exposed through attr_accessor
  @cookie_jar = CookieJar.new
  @log = @open_timeout = @read_timeout = nil
  @user_agent = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @redirect_ok = true # follow redirects by default

  # OpenSSL settings
  @ca_file = nil          # server certificate file
  @verify_callback = nil  # hook invoked on certificate-chain verification
                          # errors; handy for debugging, or for ignoring
                          # errors by always returning _true_
  @cert = nil             # client certificate
  @key = nil              # client private key
  @pass = nil             # password for the private key

  # Objects exposed through attr_reader
  @history = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication state
  @user = @password = nil
  @digest = nil           # digest-auth challenge
  @auth_hash = {}         # auth scheme to send, keyed by host
  @digest_response = nil

  # Proxy settings
  @proxy_addr = @proxy_pass = @proxy_port = @proxy_user = nil

  @conditional_requests = true
  @follow_meta_refresh = false

  # Connection cache and keep-alive settings
  @connection_cache = {}
  @keep_alive_time = 300
  @keep_alive = true

  yield self if block_given?
end
|
128
|
+
|
129
|
+
# Sets the maximum size of the page history.
def max_history=(length)
  @history.max_size = length
end
|
130
|
+
# Returns the maximum size of the page history.
def max_history
  @history.max_size
end
|
131
|
+
|
132
|
+
# Sets the proxy address, port, user, and password
|
133
|
+
# Stores the proxy address and port, plus optional credentials, for use by
# connections opened afterwards. Returns the four values as an array.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
|
136
|
+
|
137
|
+
# Set the user agent for the Mechanize object.
|
138
|
+
# See AGENT_ALIASES
|
139
|
+
# Looks +al+ up in AGENT_ALIASES and uses the matching string as the user
# agent. Raises when the alias is not known.
def user_agent_alias=(al)
  agent_string = AGENT_ALIASES[al]
  raise("unknown agent alias") unless agent_string
  self.user_agent = agent_string
end
|
142
|
+
|
143
|
+
# Returns a list of cookies stored in the cookie jar.
|
144
|
+
# Returns the contents of the cookie jar as an array.
def cookies
  @cookie_jar.to_a
end
|
147
|
+
|
148
|
+
# Sets the user and password to be used for basic authentication.
|
149
|
+
# Convenience wrapper that forwards the credentials to #auth.
def basic_auth(user, password)
  auth(user, password)
end
|
152
|
+
|
153
|
+
# Remembers the user name and password to use when a server asks for
# authentication.
def auth(user, password)
  @user = user
  @password = password
end
|
157
|
+
|
158
|
+
# Fetches the URL passed in and returns a page.
|
159
|
+
# Retrieves +url+ with a GET request, records the resulting page in the
# history and returns it. +referer+, when given, is the page the request is
# made from; otherwise the current page is used, or a blank HTML page when
# there is no current page.
def get(url, referer=nil, &block)
  referring_page = referer || current_page
  referring_page ||= Page.new( nil, {'content-type'=>'text/html'})

  # fetch the page
  target = to_absolute_uri(url, referring_page)
  fetched = fetch_page(target, fetch_request(target), referring_page, &block)
  add_to_history(fetched)
  fetched
end
|
170
|
+
|
171
|
+
# Fetch a file and return the contents of the file.
|
172
|
+
# Fetches +url+ with #get and returns the body of the fetched page.
def get_file(url)
  fetched = get(url)
  fetched.body
end
|
175
|
+
|
176
|
+
|
177
|
+
# Clicks the WWW::Mechanize::Link object passed in and returns the
|
178
|
+
# page fetched.
|
179
|
+
# Follows +link+ and returns the fetched page. The page the link belongs to
# is used as the referer when it can be determined; any error raised while
# asking the link for its page is treated as "no referer".
def click(link)
  referer = begin
    link.page
  rescue
    nil
  end

  target = link.attributes['href'] || link.attributes['src'] || link.href
  uri = to_absolute_uri(target, referer || current_page())
  get(uri, referer)
end
|
192
|
+
|
193
|
+
# Equivalent to the browser back button. Returns the most recent page
|
194
|
+
# visited.
|
195
|
+
# Pops the most recent entry off the history and returns it.
def back
  @history.pop
end
|
198
|
+
|
199
|
+
# Posts to the given URL with the query parameters passed in. Query
|
200
|
+
# parameters can be passed as a hash, or as an array of arrays.
|
201
|
+
# Example:
|
202
|
+
# agent.post('http://example.com/', "foo" => "bar")
|
203
|
+
# or
|
204
|
+
# agent.post('http://example.com/', [ ["foo", "bar"] ])
|
205
|
+
# Builds an in-memory url-encoded POST form holding one field per
# key/value pair of +query+ and posts it to +url+.
def post(url, query={})
  form_node = Hpricot::Elem.new(Hpricot::STag.new('form'))
  form_node['method'] = 'POST'
  form_node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(form_node)
  query.each do |name, value|
    form.fields << Form::Field.new(name, value)
  end
  post_form(url, form)
end
|
216
|
+
|
217
|
+
# Submit a form with an optional button.
|
218
|
+
# Without a button:
|
219
|
+
# page = agent.get('http://example.com')
|
220
|
+
# agent.submit(page.forms.first)
|
221
|
+
# With a button
|
222
|
+
# agent.submit(page.forms.first, page.forms.first.buttons.first)
|
223
|
+
# Submits +form+, adding +button+ to the query when one is given. POST forms
# are posted to the form action; GET forms have their query string appended
# to the action and are fetched. Any other form method raises.
def submit(form, button=nil)
  form.add_button_to_query(button) if button
  uri = to_absolute_uri(form.action, form.page)

  http_method = form.method.upcase
  if http_method == 'POST'
    post_form(uri, form)
  elsif http_method == 'GET'
    uri.query = WWW::Mechanize.build_query_string(form.build_query)
    get(uri)
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
|
236
|
+
|
237
|
+
# Returns the current page loaded by Mechanize
|
238
|
+
# Returns the last entry of the history.
def current_page
  @history.last
end
|
241
|
+
|
242
|
+
# Returns whether or not a url has been visited
|
243
|
+
# True when #visited_page finds a page for +url+, false otherwise.
def visited?(url)
  visited_page(url).nil? ? false : true
end
|
246
|
+
|
247
|
+
# Returns a visited page for the url passed in, otherwise nil
|
248
|
+
# Looks +url+ up in the history. +url+ may also be any object responding to
# +href+ (such as a link), in which case its href is used.
def visited_page(url)
  url = url.href if url.respond_to? :href
  @history.visited_page(to_absolute_uri(url))
end
|
254
|
+
|
255
|
+
# Runs given block, then resets the page history as it was before. self is
|
256
|
+
# given as a parameter to the block. Returns the value of the block.
|
257
|
+
# Yields the agent to the block and returns the block's value. A copy of
# the history taken beforehand is put back afterwards, even when the block
# raises.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
|
265
|
+
|
266
|
+
alias :page :current_page
|
267
|
+
|
268
|
+
class << self
  # Replaces HTML character references in +s+ with the characters they
  # stand for and returns the new string. Returns +s+ untouched when it is
  # nil or false.
  #
  # Handles named references found in Hpricot::NamedCharacters (&amp;amp;),
  # decimal references (&amp;#65;) and hexadecimal references (&amp;#x41;).
  # References that are unknown, or whose code point cannot be packed as
  # UTF-8, are left as they appear in the input.
  def html_unescape(s)
    return s unless s
    s.gsub(/&(\w+|#[0-9]+|#[xX][0-9a-fA-F]+);/) { |match|
      number = case match
               when /&(\w+);/
                 Hpricot::NamedCharacters[$1]
               when /&#([0-9]+);/
                 $1.to_i
               when /&#[xX]([0-9a-fA-F]+);/
                 $1.hex
               end

      # pack raises for out-of-range code points; keep the original text then
      number ? ([number].pack('U') rescue match) : match
    }
  end
end
|
283
|
+
|
284
|
+
protected
|
285
|
+
# Adds the standard request headers to +request+ and returns it:
# connection handling, accepted encodings/languages/charsets, cookies for
# +uri+, the referer taken from +cur_page+, the user agent, a conditional
# If-Modified-Since header, and credentials for hosts recorded in
# @auth_hash.
def set_headers(uri, request, cur_page)
  if @keep_alive
    request.add_field('Connection', 'keep-alive')
    request.add_field('Keep-Alive', keep_alive_time.to_s)
  else
    request.add_field('Connection', 'close')
  end

  [
    ['Accept-Encoding', 'gzip,identity'],
    ['Accept-Language', 'en-us,en;q=0.5'],
    ['Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7']
  ].each { |name, value| request.add_field(name, value) }

  unless @cookie_jar.empty?(uri)
    jar_cookies = @cookie_jar.cookies(uri)
    cookie_header = jar_cookies.length > 0 ? jar_cookies.join("; ") : nil
    jar_cookies.each { |c| log.debug("using cookie: #{c}") } if log
    request.add_field('Cookie', cookie_header)
  end

  # Referer header
  request.add_field('Referer', cur_page.uri.to_s) unless cur_page.uri.nil?

  # User-Agent header
  request.add_field('User-Agent', @user_agent) if @user_agent

  # If-Modified-Since, when the page is already in the history
  if @conditional_requests
    cached = visited_page(uri)
    last_modified = cached && cached.response['Last-Modified']
    request.add_field('If-Modified-Since', last_modified) if last_modified
  end

  # Credentials, for hosts that have already challenged us
  case @auth_hash[uri.host]
  when :basic
    request.basic_auth(@user, @password)
  when :digest
    @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
    request.add_field('Authorization', @digest_response) if @digest_response
  end

  request
end
|
334
|
+
|
335
|
+
# Builds the value of an Authorization header answering a Digest challenge.
#
# uri::         URI being requested; its path is used as the digest uri.
# request::     the request object; its HTTP method goes into the digest.
# auth_header:: the challenge string (the WWW-Authenticate value).
# is_IIS::      when true the qop value is emitted quoted.
#
# Every call increments the class-wide nonce counter. When the challenge
# carries a qop the digest covers nonce, nonce count, client nonce and qop;
# when it carries none, the legacy form defined by RFC 2617 (nonce and
# hashed request only) is produced and qop/nc/cnonce are left out of the
# header.
#
# Raises RuntimeError when the challenge cannot be parsed or has no nonce.
def gen_auth_header(uri, request, auth_header, is_IIS = false)
  unless auth_header =~ /^(\w+) (.*)/
    raise "malformed authentication challenge: #{auth_header.inspect}"
  end
  challenge = $2

  @@nonce_count += 1

  # Collect the quoted key="value" pairs of the challenge
  params = {}
  challenge.scan(/(\w+)="(.*?)"/) { |key, value| params[key] = value }

  nonce = params['nonce']
  raise "authentication challenge has no nonce" unless nonce
  qop = params['qop']
  nc  = '%08x' % @@nonce_count

  ha1 = Digest::MD5.hexdigest("#{@user}:#{params['realm']}:#{@password}")
  ha2 = Digest::MD5.hexdigest("#{request.method}:#{uri.path}")

  digest_input =
    if qop
      [ha1, nonce, nc, CNONCE, qop, ha2].join(':')
    else
      [ha1, nonce, ha2].join(':')
    end

  parts = []
  parts << "Digest username=\"#{@user}\""
  parts << "realm=\"#{params['realm']}\""
  parts << (is_IIS ? "qop=\"#{qop}\"" : "qop=#{qop}") if qop
  parts << "uri=\"#{uri.path}\""
  parts << "algorithm=MD5"
  parts << "nonce=\"#{nonce}\""
  if qop
    parts << "nc=#{nc}"
    parts << "cnonce=\"#{CNONCE}\""
  end
  parts << "response=\"#{Digest::MD5.hexdigest(digest_input)}\""

  parts.join(', ')
end
|
373
|
+
|
374
|
+
private
|
375
|
+
|
376
|
+
# Converts +url+ into an absolute URI and returns it.
#
# Anything that is not already a URI is converted to a string, stripped,
# has characters outside the 0..125 range percent-encoded, is escaped piece
# by piece (existing %XX escapes and '#' are kept as they are), run through
# Mechanize.html_unescape and parsed. An empty path becomes '/'.
#
# A relative result is resolved against the last <base> of +cur_page+ when
# the page has one with an absolute URI, otherwise against the URI of
# +cur_page+. Raises when the URL is relative and +cur_page+ has no URI.
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
      sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
    }

    url = URI.parse(
            Mechanize.html_unescape(
              url.split(/%[0-9A-Fa-f]{2}|#/).zip(
                url.scan(/%[0-9A-Fa-f]{2}|#/)
              ).map { |x,y|
                "#{URI.escape(x)}#{y}"
              }.join('')
            )
          )
  end

  url.path = '/' if url.path.length == 0

  # construct an absolute uri
  if url.relative?
    raise 'no history. please specify an absolute URL' unless cur_page.uri
    base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
    # One merge is enough: the result is absolute, so merging it with the
    # page URI a second time would simply return it again.
    url = ((base && base.uri && base.uri.absolute?) ?
            base.uri :
            cur_page.uri) + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  return url
end
|
409
|
+
|
410
|
+
# Sends +form+ to +url+ as a POST request, records the resulting page in
# the history and returns it. The referring page is the form's own page,
# else the current page, else a blank HTML page.
def post_form(url, form)
  referring_page = form.page || current_page
  referring_page ||= Page.new( nil, {'content-type'=>'text/html'})

  payload = form.request_data

  target = to_absolute_uri(url, referring_page)
  request = fetch_request(target, :post)
  request.add_field('Content-Type', form.enctype)
  request.add_field('Content-Length', payload.size.to_s)

  log.debug("query: #{ payload.inspect }") if log

  # fetch the page
  fetched = fetch_page(target, request, referring_page, [payload])
  add_to_history(fetched)
  fetched
end
|
428
|
+
|
429
|
+
# Creates a new request object based on the scheme and type
|
430
|
+
# Builds the Net::HTTP request object for +uri+.
#
# uri::  an absolute http or https URI.
# type:: :get builds a Net::HTTP::Get; any other value builds a
#        Net::HTTP::Post.
#
# Raises RuntimeError, naming the offending scheme, when +uri+ is neither
# http nor https.
def fetch_request(uri, type = :get)
  unless ['http', 'https'].include?(uri.scheme)
    raise "unsupported scheme: #{uri.scheme}"
  end

  request_class = (type == :get) ? Net::HTTP::Get : Net::HTTP::Post
  request_class.new(uri.request_uri)
end
|
438
|
+
|
439
|
+
# uri is an absolute URI
|
440
|
+
# Performs +request+ against +uri+ (an absolute http/https URI) and returns
# the resulting page.
#
# uri::          absolute URI being fetched.
# request::      Net::HTTP request object; headers are added by set_headers.
# cur_page::     referring page, handed to set_headers.
# request_data:: extra arguments for Net::HTTP#request (the POST body).
#
# Connections are cached per "host:port" and reused. Bodies sent with
# Content-Encoding gzip or x-gzip are decompressed, identity bodies are
# used as they are, and any other encoding raises. The body is handed to
# the pluggable parser registered for the response content type.
#
# After the response is read: keep-alive options and cookies are stored, a
# meta refresh is followed when follow_meta_refresh is set, a 304 returns
# the page from the history, redirects are followed when follow_redirect?
# is true, and a 401 is retried once per host with credentials when a user
# or password is set. Any other non-success response raises
# ResponseCodeError.
def fetch_page(uri, request, cur_page=current_page(), request_data=[])
  raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

  log.info("#{ request.class }: #{ request.path }") if log

  page = nil

  cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
    :connection         => nil,
    :keep_alive_options => {},
  })
  http_obj = cache_obj[:connection]
  if http_obj.nil? || ! http_obj.started?
    http_obj = cache_obj[:connection] =
      Net::HTTP.new( uri.host,
                     uri.port,
                     @proxy_addr,
                     @proxy_port,
                     @proxy_user,
                     @proxy_pass
                   )
    cache_obj[:keep_alive_options] = {}

    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
  end

  if uri.scheme == 'https' && ! http_obj.started?
    http_obj.use_ssl = true
    http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
    if @ca_file
      http_obj.ca_file = @ca_file
      http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http_obj.verify_callback = @verify_callback if @verify_callback
    end
    if @cert && @key
      http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
      http_obj.key  = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
    end
  end

  # If we're keeping connections alive and the last request time is too
  # long ago, stop the connection. Or, if the max requests left is 1,
  # reset the connection.
  if @keep_alive && http_obj.started?
    opts = cache_obj[:keep_alive_options]
    if((opts[:timeout] &&
       Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
        opts[:max] && opts[:max].to_i == 1)

      log.debug('Finishing stale connection') if log
      http_obj.finish

    end
  end

  http_obj.start unless http_obj.started?

  request = set_headers(uri, request, cur_page)

  # Log specified headers for the request
  if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end
  end

  cache_obj[:last_request_time] = Time.now.to_i

  # Send the request. The block reads the body while the connection is
  # open and assigns the parsed result to +page+.
  response = http_obj.request(request, *request_data) { |res|

    body = StringIO.new
    total = 0
    res.read_body { |part|
      total += part.length
      body.write(part)
      log.debug("Read #{total} bytes") if log
    }
    body.rewind

    res.each_header { |k,v|
      log.debug("response-header: #{ k } => #{ v }")
    } if log

    content_type = nil
    unless res['Content-Type'].nil?
      data = res['Content-Type'].match(/^([^;]*)/)
      content_type = data[1].downcase unless data.nil?
    end

    response_body =
      if encoding = res['Content-Encoding']
        case encoding.downcase
        when 'gzip', 'x-gzip'
          # x-gzip is the historical name of the gzip coding
          log.debug('gunzip body') if log
          Zlib::GzipReader.new(body).read
        when 'identity'
          # no transformation was applied to the body
          body.read
        else
          raise 'Unsupported content encoding'
        end
      else
        body.read
      end

    # Find our pluggable parser
    page = @pluggable_parser.parser(content_type).new(
      uri,
      res,
      response_body,
      res.code
    ) { |parser|
      parser.mech = self if parser.respond_to? :mech=
      if parser.respond_to?(:watch_for_set=) && @watch_for_set
        parser.watch_for_set = @watch_for_set
      end
    }

  }

  # If the server sends back keep alive options, save them
  if keep_alive_info = response['keep-alive']
    keep_alive_info.split(/,\s*/).each do |option|
      k, v = option.split(/=/)
      cache_obj[:keep_alive_options] ||= {}
      cache_obj[:keep_alive_options][k.intern] = v
    end
  end

  (response.get_fields('Set-Cookie')||[]).each do |cookie|
    Cookie::parse(uri, cookie, log) { |c|
      log.debug("saved cookie: #{c}") if log
      @cookie_jar.add(uri, c)
    }
  end

  log.info("status: #{ page.code }") if log

  res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]

  if follow_meta_refresh && page.respond_to?(:meta) &&
    (redirect = page.meta.first)
    return redirect.click
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri)
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri = page.uri
    abs_uri = to_absolute_uri(response['Location'].to_s, page)
    page = fetch_page(abs_uri, fetch_request(abs_uri), page)
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # a host already recorded here has been sent credentials before
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page( uri,
                       fetch_request(uri, request.method.downcase.to_sym),
                       cur_page,
                       request_data
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
|
618
|
+
|
619
|
+
# Turns +parameters+ (a hash, or an array of [key, value] pairs) into a
# form-encoded query string such as "a=1&b=2".
#
# Pairs whose key is nil are skipped. Keys and values are both converted
# with to_s before being escaped, so symbols and numbers are accepted on
# either side; a nil value yields an empty value ("key=").
def self.build_query_string(parameters)
  vals = []
  parameters.each { |k,v|
    next if k.nil?
    vals <<
      [WEBrick::HTTPUtils.escape_form(k.to_s),
       WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
  }

  vals.join("&")
end
|
630
|
+
|
631
|
+
# Pushes +page+ onto the history, keyed by the absolute form of its URI.
def add_to_history(page)
  page_uri = to_absolute_uri(page.uri)
  @history.push(page, page_uri)
end
|
634
|
+
end
|
635
|
+
end
|