mechanize 0.9.2 → 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/CHANGELOG.rdoc +16 -0
- data/README.rdoc +5 -5
- data/Rakefile +0 -1
- data/lib/www/mechanize.rb +67 -56
- data/lib/www/mechanize/chain/body_decoding_handler.rb +5 -0
- data/lib/www/mechanize/chain/custom_headers.rb +3 -3
- data/lib/www/mechanize/chain/header_resolver.rb +6 -1
- data/lib/www/mechanize/chain/response_header_handler.rb +0 -1
- data/lib/www/mechanize/chain/ssl_resolver.rb +7 -1
- data/lib/www/mechanize/chain/uri_resolver.rb +5 -1
- data/lib/www/mechanize/cookie_jar.rb +63 -63
- data/lib/www/mechanize/file.rb +1 -1
- data/lib/www/mechanize/form.rb +1 -0
- data/lib/www/mechanize/form/select_list.rb +1 -1
- data/lib/www/mechanize/page.rb +41 -10
- data/lib/www/mechanize/page/meta.rb +42 -1
- data/lib/www/mechanize/util.rb +1 -1
- data/mechanize.gemspec +14 -10
- data/test/chain/test_header_resolver.rb +28 -0
- data/test/helper.rb +2 -0
- data/test/servlets.rb +26 -0
- data/test/test_cookie_class.rb +4 -4
- data/test/test_cookie_jar.rb +28 -9
- data/test/test_follow_meta.rb +39 -0
- data/test/test_forms.rb +0 -11
- data/test/test_mech.rb +15 -6
- data/test/test_meta.rb +65 -0
- data/test/test_page.rb +52 -0
- metadata +16 -7
data/CHANGELOG.rdoc
CHANGED
@@ -1,5 +1,21 @@
|
|
1
1
|
= Mechanize CHANGELOG
|
2
2
|
|
3
|
+
=== HEAD
|
4
|
+
|
5
|
+
* Bug Fixes:
|
6
|
+
|
7
|
+
* Do not apply encoding if encoding equals 'none'. (Thanks Akinori MUSHA!)
|
8
|
+
* Custom request headers may be supplied via WWW::Mechanize#request_headers
|
9
|
+
RF #24516
|
10
|
+
* HTML Parser may be set on a per-instance level via WWW::Mechanize#html_parser
|
11
|
+
RF #24693
|
12
|
+
* Fixed string encoding in ruby 1.9. RF #2433
|
13
|
+
* Rescuing Zlib::DataErrors (Thanks Kelley Reynolds)
|
14
|
+
* Fixing a problem with frozen SSL objects. RF #24950
|
15
|
+
* Do not send a referer on meta refresh. RF #24945
|
16
|
+
* Fixed a bug with double semi-colons in Content-Disposition headers
|
17
|
+
* Properly handling cookies that specify a path. RF #25259
|
18
|
+
|
3
19
|
=== 0.9.2 / 2009/03/05
|
4
20
|
|
5
21
|
* New Features:
|
data/README.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= WWW::Mechanize
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
* http://mechanize.rubyforge.org/
|
4
|
+
* http://github.com/tenderlove/mechanize/tree/master
|
5
5
|
|
6
6
|
== DESCRIPTION
|
7
7
|
|
@@ -28,8 +28,8 @@ The bug tracker is available here:
|
|
28
28
|
|
29
29
|
== Examples
|
30
30
|
|
31
|
-
If you are just starting, check out the GUIDE
|
32
|
-
Also, check out the EXAMPLES
|
31
|
+
If you are just starting, check out the GUIDE.
|
32
|
+
Also, check out the EXAMPLES file.
|
33
33
|
|
34
34
|
== Authors
|
35
35
|
|
@@ -56,5 +56,5 @@ library!
|
|
56
56
|
|
57
57
|
== License
|
58
58
|
|
59
|
-
This library is distributed under the GPL. Please see the LICENSE
|
59
|
+
This library is distributed under the GPL. Please see the LICENSE file.
|
60
60
|
|
data/Rakefile
CHANGED
@@ -5,7 +5,6 @@ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "lib")
|
|
5
5
|
require 'mechanize'
|
6
6
|
|
7
7
|
HOE = Hoe.new('mechanize', WWW::Mechanize::VERSION) do |p|
|
8
|
-
p.rubyforge_name = 'mechanize'
|
9
8
|
p.developer('Aaron Patterson','aaronp@rubyforge.org')
|
10
9
|
p.developer('Mike Dalessio','mike.dalessio@gmail.com')
|
11
10
|
p.readme_file = 'README.rdoc'
|
data/lib/www/mechanize.rb
CHANGED
@@ -37,7 +37,7 @@ module WWW
|
|
37
37
|
# require 'rubygems'
|
38
38
|
# require 'mechanize'
|
39
39
|
# require 'logger'
|
40
|
-
#
|
40
|
+
#
|
41
41
|
# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
|
42
42
|
# agent.user_agent_alias = 'Mac Safari'
|
43
43
|
# page = agent.get("http://www.google.com/")
|
@@ -48,8 +48,8 @@ module WWW
|
|
48
48
|
class Mechanize
|
49
49
|
##
|
50
50
|
# The version of Mechanize you are using.
|
51
|
-
VERSION = '0.9.2'
|
52
|
-
|
51
|
+
VERSION = '0.9.3'
|
52
|
+
|
53
53
|
##
|
54
54
|
# User Agent aliases
|
55
55
|
AGENT_ALIASES = {
|
@@ -64,7 +64,7 @@ module WWW
|
|
64
64
|
'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
|
65
65
|
'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
|
66
66
|
}
|
67
|
-
|
67
|
+
|
68
68
|
attr_accessor :cookie_jar
|
69
69
|
attr_accessor :open_timeout, :read_timeout
|
70
70
|
attr_accessor :user_agent
|
@@ -82,15 +82,21 @@ module WWW
|
|
82
82
|
attr_accessor :history_added
|
83
83
|
attr_accessor :scheme_handlers
|
84
84
|
attr_accessor :redirection_limit
|
85
|
-
|
85
|
+
|
86
|
+
# A hash of custom request headers
|
87
|
+
attr_accessor :request_headers
|
88
|
+
|
89
|
+
# The HTML parser to be used when parsing documents
|
90
|
+
attr_accessor :html_parser
|
91
|
+
|
86
92
|
attr_reader :history
|
87
93
|
attr_reader :pluggable_parser
|
88
|
-
|
94
|
+
|
89
95
|
alias :follow_redirect? :redirect_ok
|
90
|
-
|
96
|
+
|
91
97
|
@html_parser = Nokogiri::HTML
|
92
98
|
class << self; attr_accessor :html_parser, :log end
|
93
|
-
|
99
|
+
|
94
100
|
def initialize
|
95
101
|
# attr_accessors
|
96
102
|
@cookie_jar = CookieJar.new
|
@@ -110,28 +116,29 @@ module WWW
|
|
110
116
|
@key = nil # OpenSSL Private Key
|
111
117
|
@pass = nil # OpenSSL Password
|
112
118
|
@redirect_ok = true # Should we follow redirects?
|
113
|
-
|
119
|
+
|
114
120
|
# attr_readers
|
115
121
|
@history = WWW::Mechanize::History.new
|
116
122
|
@pluggable_parser = PluggableParser.new
|
117
|
-
|
123
|
+
|
118
124
|
# Auth variables
|
119
125
|
@user = nil # Auth User
|
120
126
|
@password = nil # Auth Password
|
121
127
|
@digest = nil # DigestAuth Digest
|
122
128
|
@auth_hash = {} # Keep track of urls for sending auth
|
123
|
-
|
129
|
+
@request_headers= {} # A hash of request headers to be used
|
130
|
+
|
124
131
|
# Proxy settings
|
125
132
|
@proxy_addr = nil
|
126
133
|
@proxy_pass = nil
|
127
134
|
@proxy_port = nil
|
128
135
|
@proxy_user = nil
|
129
|
-
|
136
|
+
|
130
137
|
@conditional_requests = true
|
131
|
-
|
138
|
+
|
132
139
|
@follow_meta_refresh = false
|
133
140
|
@redirection_limit = 20
|
134
|
-
|
141
|
+
|
135
142
|
# Connection Cache & Keep alive
|
136
143
|
@connection_cache = {}
|
137
144
|
@keep_alive_time = 300
|
@@ -149,7 +156,9 @@ module WWW
|
|
149
156
|
|
150
157
|
@pre_connect_hook = Chain::PreConnectHook.new
|
151
158
|
@post_connect_hook = Chain::PostConnectHook.new
|
152
|
-
|
159
|
+
|
160
|
+
@html_parser = self.class.html_parser
|
161
|
+
|
153
162
|
yield self if block_given?
|
154
163
|
end
|
155
164
|
|
@@ -165,31 +174,31 @@ module WWW
|
|
165
174
|
def post_connect_hooks
|
166
175
|
@post_connect_hook.hooks
|
167
176
|
end
|
168
|
-
|
177
|
+
|
169
178
|
# Sets the proxy address, port, user, and password
|
170
179
|
# +addr+ should be a host, with no "http://"
|
171
180
|
def set_proxy(addr, port, user = nil, pass = nil)
|
172
181
|
@proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
|
173
182
|
end
|
174
|
-
|
183
|
+
|
175
184
|
# Set the user agent for the Mechanize object.
|
176
185
|
# See AGENT_ALIASES
|
177
186
|
def user_agent_alias=(al)
|
178
187
|
self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
|
179
188
|
end
|
180
|
-
|
189
|
+
|
181
190
|
# Returns a list of cookies stored in the cookie jar.
|
182
191
|
def cookies
|
183
192
|
@cookie_jar.to_a
|
184
193
|
end
|
185
|
-
|
194
|
+
|
186
195
|
# Sets the user and password to be used for authentication.
|
187
196
|
def auth(user, password)
|
188
197
|
@user = user
|
189
198
|
@password = password
|
190
199
|
end
|
191
200
|
alias :basic_auth :auth
|
192
|
-
|
201
|
+
|
193
202
|
# Fetches the URL passed in and returns a page.
|
194
203
|
def get(options, parameters = [], referer = nil)
|
195
204
|
unless options.is_a? Hash
|
@@ -272,12 +281,12 @@ module WWW
|
|
272
281
|
yield page if block_given?
|
273
282
|
page
|
274
283
|
end
|
275
|
-
|
284
|
+
|
276
285
|
# Fetch a file and return the contents of the file.
|
277
286
|
def get_file(url)
|
278
287
|
get(url).body
|
279
288
|
end
|
280
|
-
|
289
|
+
|
281
290
|
# Clicks the WWW::Mechanize::Link object passed in and returns the
|
282
291
|
# page fetched.
|
283
292
|
def click(link)
|
@@ -286,13 +295,13 @@ module WWW
|
|
286
295
|
(link['href'] || link['src'])
|
287
296
|
get(:url => href, :referer => (referer || current_page()))
|
288
297
|
end
|
289
|
-
|
298
|
+
|
290
299
|
# Equivalent to the browser back button. Returns the most recent page
|
291
300
|
# visited.
|
292
301
|
def back
|
293
302
|
@history.pop
|
294
303
|
end
|
295
|
-
|
304
|
+
|
296
305
|
# Posts to the given URL with the query parameters passed in. Query
|
297
306
|
# parameters can be passed as a hash, or as an array of arrays.
|
298
307
|
# Example:
|
@@ -307,7 +316,7 @@ module WWW
|
|
307
316
|
end
|
308
317
|
node['method'] = 'POST'
|
309
318
|
node['enctype'] = 'application/x-www-form-urlencoded'
|
310
|
-
|
319
|
+
|
311
320
|
form = Form.new(node)
|
312
321
|
query.each { |k,v|
|
313
322
|
if v.is_a?(IO)
|
@@ -321,7 +330,7 @@ module WWW
|
|
321
330
|
}
|
322
331
|
post_form(url, form)
|
323
332
|
end
|
324
|
-
|
333
|
+
|
325
334
|
# Submit a form with an optional button.
|
326
335
|
# Without a button:
|
327
336
|
# page = agent.get('http://example.com')
|
@@ -343,17 +352,17 @@ module WWW
|
|
343
352
|
raise "unsupported method: #{form.method.upcase}"
|
344
353
|
end
|
345
354
|
end
|
346
|
-
|
355
|
+
|
347
356
|
# Returns the current page loaded by Mechanize
|
348
357
|
def current_page
|
349
358
|
@history.last
|
350
359
|
end
|
351
|
-
|
360
|
+
|
352
361
|
# Returns whether or not a url has been visited
|
353
362
|
def visited?(url)
|
354
363
|
! visited_page(url).nil?
|
355
364
|
end
|
356
|
-
|
365
|
+
|
357
366
|
# Returns a visited page for the url passed in, otherwise nil
|
358
367
|
def visited_page(url)
|
359
368
|
if url.respond_to? :href
|
@@ -361,7 +370,7 @@ module WWW
|
|
361
370
|
end
|
362
371
|
@history.visited_page(resolve(url))
|
363
372
|
end
|
364
|
-
|
373
|
+
|
365
374
|
# Runs given block, then resets the page history as it was before. self is
|
366
375
|
# given as a parameter to the block. Returns the value of the block.
|
367
376
|
def transact
|
@@ -372,11 +381,11 @@ module WWW
|
|
372
381
|
@history = history_backup
|
373
382
|
end
|
374
383
|
end
|
375
|
-
|
384
|
+
|
376
385
|
alias :page :current_page
|
377
386
|
|
378
387
|
private
|
379
|
-
|
388
|
+
|
380
389
|
def resolve(url, referer = current_page())
|
381
390
|
hash = { :uri => url, :referer => referer }
|
382
391
|
chain = Chain.new([
|
@@ -384,15 +393,15 @@ module WWW
|
|
384
393
|
]).handle(hash)
|
385
394
|
hash[:uri].to_s
|
386
395
|
end
|
387
|
-
|
396
|
+
|
388
397
|
def post_form(url, form, headers = {})
|
389
398
|
cur_page = form.page || current_page ||
|
390
399
|
Page.new( nil, {'content-type'=>'text/html'})
|
391
|
-
|
400
|
+
|
392
401
|
request_data = form.request_data
|
393
|
-
|
402
|
+
|
394
403
|
log.debug("query: #{ request_data.inspect }") if log
|
395
|
-
|
404
|
+
|
396
405
|
# fetch the page
|
397
406
|
page = fetch_page( :uri => url,
|
398
407
|
:referer => cur_page,
|
@@ -402,10 +411,10 @@ module WWW
|
|
402
411
|
'Content-Type' => form.enctype,
|
403
412
|
'Content-Length' => request_data.size.to_s,
|
404
413
|
}.merge(headers))
|
405
|
-
add_to_history(page)
|
414
|
+
add_to_history(page)
|
406
415
|
page
|
407
416
|
end
|
408
|
-
|
417
|
+
|
409
418
|
# uri is an absolute URI
|
410
419
|
def fetch_page(params)
|
411
420
|
options = {
|
@@ -435,10 +444,13 @@ module WWW
|
|
435
444
|
),
|
436
445
|
Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
|
437
446
|
Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
|
438
|
-
Chain::HeaderResolver.new(
|
439
|
-
|
440
|
-
|
441
|
-
|
447
|
+
Chain::HeaderResolver.new(
|
448
|
+
@keep_alive,
|
449
|
+
@keep_alive_time,
|
450
|
+
@cookie_jar,
|
451
|
+
@user_agent,
|
452
|
+
{}
|
453
|
+
),
|
442
454
|
Chain::CustomHeaders.new,
|
443
455
|
@pre_connect_hook,
|
444
456
|
])
|
@@ -499,37 +511,36 @@ module WWW
|
|
499
511
|
page = options[:page]
|
500
512
|
|
501
513
|
log.info("status: #{ page.code }") if log
|
502
|
-
|
514
|
+
|
503
515
|
if follow_meta_refresh
|
504
|
-
redirect_uri
|
516
|
+
redirect_uri = nil
|
517
|
+
referer = page
|
505
518
|
if (page.respond_to?(:meta) && (redirect = page.meta.first))
|
506
519
|
redirect_uri = redirect.uri.to_s
|
520
|
+
sleep redirect.node['delay'].to_f
|
521
|
+
referer = Page.new(nil, {'content-type'=>'text/html'})
|
507
522
|
elsif refresh = response['refresh']
|
508
|
-
|
509
|
-
raise StandardError, "Invalid refresh http header" unless
|
510
|
-
delay = parsed_refresh[1]
|
511
|
-
location = parsed_refresh[3]
|
512
|
-
location = "http://#{uri.host}#{location}" unless location.include?("http")
|
523
|
+
delay, redirect_uri = Page::Meta.parse(refresh, uri)
|
524
|
+
raise StandardError, "Invalid refresh http header" unless delay
|
513
525
|
if redirects + 1 > redirection_limit
|
514
526
|
raise RedirectLimitReachedError.new(page, redirects)
|
515
527
|
end
|
516
|
-
sleep delay.
|
517
|
-
redirect_uri = location
|
528
|
+
sleep delay.to_f
|
518
529
|
end
|
519
530
|
if redirect_uri
|
520
531
|
@history.push(page, page.uri)
|
521
532
|
return fetch_page(
|
522
533
|
:uri => redirect_uri,
|
523
|
-
:referer =>
|
534
|
+
:referer => referer,
|
524
535
|
:params => [],
|
525
536
|
:verb => :get,
|
526
537
|
:redirects => redirects + 1
|
527
538
|
)
|
528
539
|
end
|
529
540
|
end
|
530
|
-
|
541
|
+
|
531
542
|
return page if res_klass <= Net::HTTPSuccess
|
532
|
-
|
543
|
+
|
533
544
|
if res_klass == Net::HTTPNotModified
|
534
545
|
log.debug("Got cached page") if log
|
535
546
|
return visited_page(uri) || page
|
@@ -566,10 +577,10 @@ module WWW
|
|
566
577
|
:headers => options[:headers]
|
567
578
|
)
|
568
579
|
end
|
569
|
-
|
580
|
+
|
570
581
|
raise ResponseCodeError.new(page), "Unhandled response", caller
|
571
582
|
end
|
572
|
-
|
583
|
+
|
573
584
|
def add_to_history(page)
|
574
585
|
@history.push(page, resolve(page.uri))
|
575
586
|
history_added.call(page) if history_added
|
@@ -23,6 +23,11 @@ module WWW
|
|
23
23
|
body.rewind
|
24
24
|
body.read(10)
|
25
25
|
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
|
26
|
+
rescue Zlib::DataError
|
27
|
+
if Mechanize.log
|
28
|
+
Mechanize.log.error("Caught a Zlib::DataError, unable to decode page: #{$!.to_s}")
|
29
|
+
end
|
30
|
+
''
|
26
31
|
end
|
27
32
|
else
|
28
33
|
''
|
@@ -8,11 +8,11 @@ module WWW
|
|
8
8
|
request = params[:request]
|
9
9
|
params[:headers].each do |k,v|
|
10
10
|
case k
|
11
|
-
when :etag then request
|
12
|
-
when :if_modified_since then request
|
11
|
+
when :etag then request["ETag"] = v
|
12
|
+
when :if_modified_since then request["If-Modified-Since"] = v
|
13
13
|
else
|
14
14
|
raise ArgumentError.new("unknown header symbol #{k}") if k.is_a? Symbol
|
15
|
-
request
|
15
|
+
request[k] = v
|
16
16
|
end
|
17
17
|
end
|
18
18
|
super
|
@@ -3,11 +3,12 @@ module WWW
|
|
3
3
|
class Chain
|
4
4
|
class HeaderResolver
|
5
5
|
include WWW::Handler
|
6
|
-
def initialize(keep_alive, keep_alive_time, cookie_jar, user_agent)
|
6
|
+
def initialize(keep_alive, keep_alive_time, cookie_jar, user_agent, headers)
|
7
7
|
@keep_alive = keep_alive
|
8
8
|
@keep_alive_time = keep_alive_time
|
9
9
|
@cookie_jar = cookie_jar
|
10
10
|
@user_agent = user_agent
|
11
|
+
@headers = headers
|
11
12
|
end
|
12
13
|
|
13
14
|
def handle(ctx, params)
|
@@ -40,6 +41,10 @@ module WWW
|
|
40
41
|
|
41
42
|
# Add User-Agent header to request
|
42
43
|
request['User-Agent'] = @user_agent if @user_agent
|
44
|
+
|
45
|
+
@headers.each do |k,v|
|
46
|
+
request[k] = v
|
47
|
+
end if request
|
43
48
|
super
|
44
49
|
end
|
45
50
|
end
|