mechanize 2.0.pre.2 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +22 -0
- data/Manifest.txt +11 -8
- data/Rakefile +2 -2
- data/examples/flickr_upload.rb +6 -7
- data/examples/mech-dump.rb +0 -2
- data/examples/proxy_req.rb +0 -2
- data/examples/rubyforge.rb +1 -3
- data/examples/spider.rb +2 -3
- data/lib/mechanize.rb +228 -680
- data/lib/mechanize/form/field.rb +1 -1
- data/lib/mechanize/history.rb +23 -5
- data/lib/mechanize/http.rb +3 -0
- data/lib/mechanize/http/agent.rb +738 -0
- data/lib/mechanize/inspect.rb +2 -2
- data/lib/mechanize/page.rb +101 -42
- data/lib/mechanize/page/frame.rb +24 -17
- data/lib/mechanize/page/link.rb +72 -54
- data/lib/mechanize/page/meta_refresh.rb +56 -0
- data/lib/mechanize/response_read_error.rb +27 -0
- data/test/htdocs/frame_referer_test.html +10 -0
- data/test/htdocs/tc_referer.html +4 -0
- data/test/test_frames.rb +9 -0
- data/test/test_history.rb +74 -98
- data/test/test_mechanize.rb +334 -812
- data/test/test_mechanize_form.rb +32 -3
- data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
- data/test/test_mechanize_http_agent.rb +697 -0
- data/test/test_mechanize_link.rb +83 -0
- data/test/test_mechanize_page_encoding.rb +147 -0
- data/test/test_mechanize_page_link.rb +379 -0
- data/test/test_mechanize_page_meta_refresh.rb +115 -0
- data/test/test_pretty_print.rb +1 -1
- data/test/test_referer.rb +29 -5
- data/test/test_response_code.rb +21 -20
- data/test/test_robots.rb +13 -17
- data/test/test_scheme.rb +1 -1
- metadata +30 -31
- metadata.gz.sig +0 -0
- data/lib/mechanize/page/meta.rb +0 -48
- data/test/test_form_no_inputname.rb +0 -15
- data/test/test_links.rb +0 -146
- data/test/test_mechanize_page.rb +0 -224
- data/test/test_meta.rb +0 -67
- data/test/test_upload.rb +0 -109
- data/test/test_verbs.rb +0 -25
data/lib/mechanize/form/field.rb
CHANGED
data/lib/mechanize/history.rb
CHANGED
@@ -14,26 +14,40 @@ class Mechanize
|
|
14
14
|
@history_index = orig.instance_variable_get(:@history_index).dup
|
15
15
|
end
|
16
16
|
|
17
|
+
def inspect # :nodoc:
|
18
|
+
uris = map { |page| page.uri }.join ', '
|
19
|
+
|
20
|
+
"[#{uris}]"
|
21
|
+
end
|
22
|
+
|
17
23
|
def push(page, uri = nil)
|
18
24
|
super(page)
|
25
|
+
|
19
26
|
@history_index[(uri ? uri : page.uri).to_s] = page
|
27
|
+
|
20
28
|
if @max_size && self.length > @max_size
|
21
29
|
while self.length > @max_size
|
22
30
|
self.shift
|
23
31
|
end
|
24
32
|
end
|
33
|
+
|
25
34
|
self
|
26
35
|
end
|
27
36
|
alias :<< :push
|
28
37
|
|
29
|
-
def
|
30
|
-
|
31
|
-
end
|
38
|
+
def visited_page(uri)
|
39
|
+
page = @history_index[uri.to_s]
|
32
40
|
|
33
|
-
|
34
|
-
|
41
|
+
return page if page # HACK
|
42
|
+
|
43
|
+
uri = uri.dup
|
44
|
+
uri.path = '/' if uri.path.empty?
|
45
|
+
|
46
|
+
@history_index[uri.to_s]
|
35
47
|
end
|
36
48
|
|
49
|
+
alias visited? visited_page
|
50
|
+
|
37
51
|
def clear
|
38
52
|
@history_index.clear
|
39
53
|
super
|
@@ -43,7 +57,9 @@ class Mechanize
|
|
43
57
|
return nil if length == 0
|
44
58
|
page = self[0]
|
45
59
|
self[0] = nil
|
60
|
+
|
46
61
|
super
|
62
|
+
|
47
63
|
remove_from_index(page)
|
48
64
|
page
|
49
65
|
end
|
@@ -56,10 +72,12 @@ class Mechanize
|
|
56
72
|
end
|
57
73
|
|
58
74
|
private
|
75
|
+
|
59
76
|
def remove_from_index(page)
|
60
77
|
@history_index.each do |k,v|
|
61
78
|
@history_index.delete(k) if v == page
|
62
79
|
end
|
63
80
|
end
|
81
|
+
|
64
82
|
end
|
65
83
|
end
|
@@ -0,0 +1,738 @@
|
|
1
|
+
##
|
2
|
+
# An HTTP (and local disk access) user agent
|
3
|
+
|
4
|
+
class Mechanize::HTTP::Agent
|
5
|
+
|
6
|
+
attr_reader :cookie_jar
|
7
|
+
|
8
|
+
# Controls If-Modified-Since conditional requests. Enabled by default; set to false to disable.
|
9
|
+
attr_accessor :conditional_requests
|
10
|
+
attr_accessor :context
|
11
|
+
|
12
|
+
# Follow HTML meta refresh. If set to +:anywhere+ meta refresh tags outside
|
13
|
+
# of the head element will be followed.
|
14
|
+
attr_accessor :follow_meta_refresh
|
15
|
+
attr_accessor :gzip_enabled
|
16
|
+
attr_accessor :history
|
17
|
+
|
18
|
+
# Length of time to wait until a connection is opened in seconds
|
19
|
+
attr_accessor :open_timeout
|
20
|
+
|
21
|
+
attr_accessor :password
|
22
|
+
attr_reader :proxy_uri
|
23
|
+
|
24
|
+
# A list of hooks to call after retrieving a response. Hooks are called with
|
25
|
+
# the agent, the request URI, the response and the response body.
|
26
|
+
|
27
|
+
attr_reader :post_connect_hooks
|
28
|
+
|
29
|
+
# A list of hooks to call before making a request. Hooks are called with
|
30
|
+
# the agent and the request to be performed.
|
31
|
+
|
32
|
+
attr_reader :pre_connect_hooks
|
33
|
+
|
34
|
+
# Length of time to attempt to read data from the server
|
35
|
+
attr_accessor :read_timeout
|
36
|
+
|
37
|
+
# Controls how this agent deals with redirects. The following values are
|
38
|
+
# allowed:
|
39
|
+
#
|
40
|
+
# :all, true:: All 3xx redirects are followed (default)
|
41
|
+
# :permanent:: Only 301 Moved Permanently redirects are followed
|
42
|
+
# false:: No redirects are followed
|
43
|
+
|
44
|
+
attr_accessor :redirect_ok
|
45
|
+
attr_accessor :redirection_limit
|
46
|
+
|
47
|
+
# A hash of request headers to be used
|
48
|
+
|
49
|
+
attr_accessor :request_headers
|
50
|
+
|
51
|
+
# When true, this agent will consult the site's robots.txt for each access.
|
52
|
+
|
53
|
+
attr_reader :robots
|
54
|
+
|
55
|
+
attr_accessor :scheme_handlers
|
56
|
+
|
57
|
+
attr_accessor :user
|
58
|
+
attr_reader :user_agent
|
59
|
+
|
60
|
+
# Path to an OpenSSL server certificate file
|
61
|
+
attr_accessor :ca_file
|
62
|
+
|
63
|
+
# An OpenSSL private key or the path to a private key
|
64
|
+
attr_accessor :key
|
65
|
+
|
66
|
+
# An OpenSSL client certificate or the path to a certificate file.
|
67
|
+
attr_accessor :cert
|
68
|
+
|
69
|
+
# OpenSSL key password
|
70
|
+
attr_accessor :pass
|
71
|
+
|
72
|
+
# A callback for additional certificate verification. See
|
73
|
+
# OpenSSL::SSL::SSLContext#verify_callback
|
74
|
+
#
|
75
|
+
# The callback can be used for debugging or to ignore errors by always
|
76
|
+
# returning +true+. Specifying nil uses the default method that was valid
|
77
|
+
# when the SSLContext was created
|
78
|
+
attr_accessor :verify_callback
|
79
|
+
|
80
|
+
attr_reader :http # :nodoc:
|
81
|
+
|
82
|
+
def initialize
|
83
|
+
@auth_hash = {} # Keep track of urls for sending auth
|
84
|
+
@conditional_requests = true
|
85
|
+
@context = nil
|
86
|
+
@cookie_jar = Mechanize::CookieJar.new
|
87
|
+
@digest = nil # DigestAuth Digest
|
88
|
+
@digest_auth = Net::HTTP::DigestAuth.new
|
89
|
+
@follow_meta_refresh = false
|
90
|
+
@gzip_enabled = true
|
91
|
+
@history = Mechanize::History.new
|
92
|
+
@keep_alive_time = 300
|
93
|
+
@open_timeout = nil
|
94
|
+
@password = nil # HTTP auth password
|
95
|
+
@post_connect_hooks = []
|
96
|
+
@pre_connect_hooks = []
|
97
|
+
@proxy_uri = nil
|
98
|
+
@read_timeout = nil
|
99
|
+
@redirect_ok = true
|
100
|
+
@redirection_limit = 20
|
101
|
+
@request_headers = {}
|
102
|
+
@robots = false
|
103
|
+
@user = nil # HTTP auth user
|
104
|
+
@user_agent = nil
|
105
|
+
@webrobots = nil
|
106
|
+
|
107
|
+
@ca_file = nil # OpenSSL server certificate file
|
108
|
+
@cert = nil # OpenSSL Certificate
|
109
|
+
@key = nil # OpenSSL Private Key
|
110
|
+
@pass = nil # OpenSSL Password
|
111
|
+
@verify_callback = nil
|
112
|
+
|
113
|
+
@scheme_handlers = Hash.new { |h, scheme|
|
114
|
+
h[scheme] = lambda { |link, page|
|
115
|
+
raise Mechanize::UnsupportedSchemeError, scheme
|
116
|
+
}
|
117
|
+
}
|
118
|
+
|
119
|
+
@scheme_handlers['http'] = lambda { |link, page| link }
|
120
|
+
@scheme_handlers['https'] = @scheme_handlers['http']
|
121
|
+
@scheme_handlers['relative'] = @scheme_handlers['http']
|
122
|
+
@scheme_handlers['file'] = @scheme_handlers['http']
|
123
|
+
end
|
124
|
+
|
125
|
+
# Equivalent to the browser back button. Returns the most recent page
|
126
|
+
# visited.
|
127
|
+
def back
|
128
|
+
@history.pop
|
129
|
+
end
|
130
|
+
|
131
|
+
def certificate
|
132
|
+
@http.certificate
|
133
|
+
end
|
134
|
+
|
135
|
+
def connection_for uri
|
136
|
+
case uri.scheme.downcase
|
137
|
+
when 'http', 'https' then
|
138
|
+
return @http
|
139
|
+
when 'file' then
|
140
|
+
return Mechanize::FileConnection.new
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
##
|
145
|
+
# Returns the latest page loaded by the agent
|
146
|
+
|
147
|
+
def current_page
|
148
|
+
@history.last
|
149
|
+
end
|
150
|
+
|
151
|
+
def enable_gzip request
|
152
|
+
request['accept-encoding'] = if @gzip_enabled
|
153
|
+
'gzip,deflate,identity'
|
154
|
+
else
|
155
|
+
'identity'
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
# uri is an absolute URI
|
160
|
+
def fetch uri, method = :get, headers = {}, params = [],
|
161
|
+
referer = current_page, redirects = 0
|
162
|
+
referer_uri = referer ? referer.uri : nil
|
163
|
+
|
164
|
+
uri = resolve uri, referer
|
165
|
+
|
166
|
+
uri, params = resolve_parameters uri, method, params
|
167
|
+
|
168
|
+
request = http_request uri, method, params
|
169
|
+
|
170
|
+
connection = connection_for uri
|
171
|
+
|
172
|
+
request_auth request, uri
|
173
|
+
|
174
|
+
enable_gzip request
|
175
|
+
|
176
|
+
request_language_charset request
|
177
|
+
request_cookies request, uri
|
178
|
+
request_host request, uri
|
179
|
+
request_referer request, uri, referer_uri
|
180
|
+
request_user_agent request
|
181
|
+
request_add_headers request, headers
|
182
|
+
|
183
|
+
pre_connect request
|
184
|
+
|
185
|
+
# Consult robots.txt
|
186
|
+
if robots && uri.is_a?(URI::HTTP)
|
187
|
+
robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
|
188
|
+
end
|
189
|
+
|
190
|
+
# Add If-Modified-Since if page is in history
|
191
|
+
if (page = visited_page(uri)) and page.response['Last-Modified']
|
192
|
+
request['If-Modified-Since'] = page.response['Last-Modified']
|
193
|
+
end if(@conditional_requests)
|
194
|
+
|
195
|
+
# Specify timeouts if given
|
196
|
+
connection.open_timeout = @open_timeout if @open_timeout
|
197
|
+
connection.read_timeout = @read_timeout if @read_timeout
|
198
|
+
|
199
|
+
request_log request
|
200
|
+
|
201
|
+
response_body_io = nil
|
202
|
+
|
203
|
+
# Send the request
|
204
|
+
response = connection.request(uri, request) { |res|
|
205
|
+
response_log res
|
206
|
+
|
207
|
+
response_body_io = response_read res, request
|
208
|
+
|
209
|
+
res
|
210
|
+
}
|
211
|
+
|
212
|
+
response_body = response_content_encoding response, response_body_io
|
213
|
+
|
214
|
+
post_connect uri, response, response_body
|
215
|
+
|
216
|
+
page = response_parse response, response_body, uri
|
217
|
+
|
218
|
+
response_cookies response, uri, page
|
219
|
+
|
220
|
+
meta = response_follow_meta_refresh response, uri, page, redirects
|
221
|
+
return meta if meta
|
222
|
+
|
223
|
+
case response
|
224
|
+
when Net::HTTPSuccess
|
225
|
+
if robots && page.is_a?(Mechanize::Page)
|
226
|
+
page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
|
227
|
+
end
|
228
|
+
|
229
|
+
page
|
230
|
+
when Mechanize::FileResponse
|
231
|
+
page
|
232
|
+
when Net::HTTPNotModified
|
233
|
+
log.debug("Got cached page") if log
|
234
|
+
visited_page(uri) || page
|
235
|
+
when Net::HTTPRedirection
|
236
|
+
response_redirect response, method, page, redirects
|
237
|
+
when Net::HTTPUnauthorized
|
238
|
+
response_authenticate(response, page, uri, request, headers, params,
|
239
|
+
referer)
|
240
|
+
else
|
241
|
+
raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def max_history
|
246
|
+
@history.max_size
|
247
|
+
end
|
248
|
+
|
249
|
+
def max_history=(length)
|
250
|
+
@history.max_size = length
|
251
|
+
end
|
252
|
+
|
253
|
+
def http_request uri, method, params = nil
|
254
|
+
case uri.scheme.downcase
|
255
|
+
when 'http', 'https' then
|
256
|
+
klass = Net::HTTP.const_get(method.to_s.capitalize)
|
257
|
+
|
258
|
+
request ||= klass.new(uri.request_uri)
|
259
|
+
request.body = params.first if params
|
260
|
+
|
261
|
+
request
|
262
|
+
when 'file' then
|
263
|
+
Mechanize::FileRequest.new uri
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
def log
|
268
|
+
Mechanize.log
|
269
|
+
end
|
270
|
+
|
271
|
+
##
|
272
|
+
# Invokes hooks added to post_connect_hooks after a +response+ is returned
|
273
|
+
# and the response +body+ is handled.
|
274
|
+
#
|
275
|
+
# Yields the +agent+, the +uri+ for the request, the +response+ and the
|
276
|
+
# response +body+.
|
277
|
+
|
278
|
+
def post_connect uri, response, body # :yields: agent, uri, response, body
|
279
|
+
@post_connect_hooks.each do |hook|
|
280
|
+
hook.call self, uri, response, body
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
##
|
285
|
+
# Invokes hooks added to pre_connect_hooks before a +request+ is made.
|
286
|
+
# Yields the +agent+ and the +request+ that will be performed to each hook.
|
287
|
+
|
288
|
+
def pre_connect request # :yields: agent, request
|
289
|
+
@pre_connect_hooks.each do |hook|
|
290
|
+
hook.call self, request
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
294
|
+
def request_auth request, uri
|
295
|
+
auth_type = @auth_hash[uri.host]
|
296
|
+
|
297
|
+
return unless auth_type
|
298
|
+
|
299
|
+
case auth_type
|
300
|
+
when :basic
|
301
|
+
request.basic_auth @user, @password
|
302
|
+
when :digest, :iis_digest
|
303
|
+
uri.user = @user
|
304
|
+
uri.password = @password
|
305
|
+
|
306
|
+
iis = auth_type == :iis_digest
|
307
|
+
|
308
|
+
auth = @digest_auth.auth_header uri, @digest, request.method, iis
|
309
|
+
|
310
|
+
request['Authorization'] = auth
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
def request_cookies request, uri
|
315
|
+
return if @cookie_jar.empty? uri
|
316
|
+
|
317
|
+
cookies = @cookie_jar.cookies uri
|
318
|
+
|
319
|
+
return if cookies.empty?
|
320
|
+
|
321
|
+
request.add_field 'Cookie', cookies.join('; ')
|
322
|
+
end
|
323
|
+
|
324
|
+
##
# Sets the Host header on +request+ from +uri+.
#
# The port is appended only when it differs from the default port of the
# URI's own scheme.  Comparing against a fixed list of 80 and 443 would drop
# the port for mismatched pairs such as http on port 443 or https on port 80,
# producing a Host header that names the wrong endpoint.

def request_host request, uri
  # nil is removed by #compact below, leaving just the host name
  port = uri.port == uri.default_port ? nil : uri.port
  host = uri.host

  request['Host'] = [host, port].compact.join ':'
end
|
330
|
+
|
331
|
+
def request_language_charset request
|
332
|
+
request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
|
333
|
+
request['accept-language'] = 'en-us,en;q=0.5'
|
334
|
+
end
|
335
|
+
|
336
|
+
# Log specified headers for the request
|
337
|
+
def request_log request
|
338
|
+
return unless log
|
339
|
+
|
340
|
+
log.info("#{request.class}: #{request.path}")
|
341
|
+
|
342
|
+
request.each_header do |k, v|
|
343
|
+
log.debug("request-header: #{k} => #{v}")
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
def request_add_headers request, headers = {}
|
348
|
+
@request_headers.each do |k,v|
|
349
|
+
request[k] = v
|
350
|
+
end
|
351
|
+
|
352
|
+
headers.each do |field, value|
|
353
|
+
case field
|
354
|
+
when :etag then request["ETag"] = value
|
355
|
+
when :if_modified_since then request["If-Modified-Since"] = value
|
356
|
+
when Symbol then
|
357
|
+
raise ArgumentError, "unknown header symbol #{field}"
|
358
|
+
else
|
359
|
+
request[field] = value
|
360
|
+
end
|
361
|
+
end
|
362
|
+
end
|
363
|
+
|
364
|
+
def request_referer request, uri, referer
|
365
|
+
return unless referer
|
366
|
+
return if 'https' == referer.scheme.downcase and
|
367
|
+
'https' != uri.scheme.downcase
|
368
|
+
|
369
|
+
request['Referer'] = referer
|
370
|
+
end
|
371
|
+
|
372
|
+
def request_user_agent request
|
373
|
+
request['User-Agent'] = @user_agent if @user_agent
|
374
|
+
end
|
375
|
+
|
376
|
+
def resolve(uri, referer = current_page)
|
377
|
+
uri = uri.dup if uri.is_a?(URI)
|
378
|
+
|
379
|
+
unless uri.is_a?(URI)
|
380
|
+
uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
|
381
|
+
if RUBY_VERSION >= "1.9.0"
|
382
|
+
Mechanize::Util.uri_escape(match)
|
383
|
+
else
|
384
|
+
sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
|
385
|
+
end
|
386
|
+
}
|
387
|
+
|
388
|
+
unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
|
389
|
+
escaped = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
|
390
|
+
|
391
|
+
escaped_uri = Mechanize::Util.html_unescape(
|
392
|
+
unescaped.zip(escaped).map { |x,y|
|
393
|
+
"#{WEBrick::HTTPUtils.escape(x)}#{y}"
|
394
|
+
}.join('')
|
395
|
+
)
|
396
|
+
|
397
|
+
begin
|
398
|
+
uri = URI.parse(escaped_uri)
|
399
|
+
rescue
|
400
|
+
uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
|
401
|
+
end
|
402
|
+
end
|
403
|
+
|
404
|
+
scheme = uri.relative? ? 'relative' : uri.scheme.downcase
|
405
|
+
uri = @scheme_handlers[scheme].call(uri, referer)
|
406
|
+
|
407
|
+
if referer && referer.uri
|
408
|
+
if uri.path.length == 0 && uri.relative?
|
409
|
+
uri.path = referer.uri.path
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
uri.path = '/' if uri.path.length == 0
|
414
|
+
|
415
|
+
if uri.relative?
|
416
|
+
raise ArgumentError, "absolute URL needed (not #{uri})" unless
|
417
|
+
referer && referer.uri
|
418
|
+
|
419
|
+
base = nil
|
420
|
+
if referer.respond_to?(:bases) && referer.parser
|
421
|
+
base = referer.bases.last
|
422
|
+
end
|
423
|
+
|
424
|
+
uri = ((base && base.uri && base.uri.absolute?) ?
|
425
|
+
base.uri :
|
426
|
+
referer.uri) + uri
|
427
|
+
uri = referer.uri + uri
|
428
|
+
# Strip initial "/.." bits from the path
|
429
|
+
uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
|
430
|
+
end
|
431
|
+
|
432
|
+
unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
|
433
|
+
raise ArgumentError, "unsupported scheme: #{uri.scheme}"
|
434
|
+
end
|
435
|
+
|
436
|
+
uri
|
437
|
+
end
|
438
|
+
|
439
|
+
def resolve_parameters uri, method, parameters
|
440
|
+
case method
|
441
|
+
when :head, :get, :delete, :trace then
|
442
|
+
if parameters and parameters.length > 0
|
443
|
+
uri.query ||= ''
|
444
|
+
uri.query << '&' if uri.query.length > 0
|
445
|
+
uri.query << Mechanize::Util.build_query_string(parameters)
|
446
|
+
end
|
447
|
+
|
448
|
+
return uri, nil
|
449
|
+
end
|
450
|
+
|
451
|
+
return uri, parameters
|
452
|
+
end
|
453
|
+
|
454
|
+
##
# Decodes the body in +body_io+ according to the Content-Encoding header of
# +response+ and returns the decoded String.
#
# * No encoding, 'none' or '7bit' returns the body unchanged.
# * 'deflate' is inflated as a zlib stream, then retried as a raw deflate
#   stream if that fails.
# * 'gzip' and 'x-gzip' are gunzipped, then retried as a raw deflate stream
#   if that fails.
#
# Returns nil for a zero-length deflate or gzip body, and an empty String
# when the body cannot be decoded by any of the attempts above.
#
# Raises Mechanize::Error for any other Content-Encoding.

def response_content_encoding response, body_io
  length = response.content_length || body_io.length

  case response['Content-Encoding']
  when nil, 'none', '7bit' then
    body_io.string
  when 'deflate' then
    log.debug('deflate body') if log

    return if length.zero?

    begin
      Zlib::Inflate.inflate body_io.string
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      begin
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.string)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to inflate page: #{$!}") if log
        ''
      end
    end
  when 'gzip', 'x-gzip' then
    log.debug('gzip body') if log

    return if length.zero?

    begin
      zio = Zlib::GzipReader.new body_io
      zio.read
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log
      body_io.rewind
      body_io.read 10 # presumably skips the fixed-size gzip header

      # An exception raised inside a rescue clause is not handled by the
      # sibling rescue clauses of the same begin, so the fallback needs its
      # own handler to honour the "return '' when undecodable" contract.
      begin
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.read)
      rescue Zlib::BufError, Zlib::DataError
        log.error("unable to gunzip page: #{$!}") if log
        ''
      end
    rescue Zlib::DataError
      log.error("unable to gunzip page: #{$!}") if log
      ''
    ensure
      zio.close if zio and not zio.closed?
    end
  else
    raise Mechanize::Error,
          "Unsupported Content-Encoding: #{response['Content-Encoding']}"
  end
end
|
500
|
+
|
501
|
+
def response_cookies response, uri, page
|
502
|
+
if Mechanize::Page === page and page.body =~ /Set-Cookie/n
|
503
|
+
page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
|
504
|
+
Mechanize::Cookie.parse(uri, meta['content']) { |c|
|
505
|
+
log.debug("saved cookie: #{c}") if log
|
506
|
+
@cookie_jar.add(uri, c)
|
507
|
+
}
|
508
|
+
end
|
509
|
+
end
|
510
|
+
|
511
|
+
header_cookies = response.get_fields 'Set-Cookie'
|
512
|
+
|
513
|
+
return unless header_cookies
|
514
|
+
|
515
|
+
header_cookies.each do |cookie|
|
516
|
+
Mechanize::Cookie.parse(uri, cookie) { |c|
|
517
|
+
log.debug("saved cookie: #{c}") if log
|
518
|
+
@cookie_jar.add(uri, c)
|
519
|
+
}
|
520
|
+
end
|
521
|
+
end
|
522
|
+
|
523
|
+
##
# Follows a refresh requested either by the first meta refresh element of
# +page+ or by a Refresh header on +response+, when +follow_meta_refresh+ is
# enabled.
#
# Returns nil when refreshes are not followed or none is present, otherwise
# the result of fetching the refresh target.  +page+ is pushed onto the
# history before the target is fetched.
#
# Raises Mechanize::RedirectLimitReachedError when following the refresh
# would exceed +redirection_limit+, and Mechanize::Error when a Refresh
# header cannot be parsed.

def response_follow_meta_refresh response, uri, page, redirects
  return unless @follow_meta_refresh

  redirect_uri = nil
  referer      = page

  if page.respond_to?(:meta_refresh) and (redirect = page.meta_refresh.first)
    # Bound HTML meta refresh chains the same way as header refreshes below;
    # without this check a page refreshing to itself recurses without limit.
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit

    redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
    sleep redirect.node['delay'].to_f
    referer = Mechanize::Page.new(nil, {'content-type'=>'text/html'})
  elsif refresh = response['refresh']
    delay, redirect_uri = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    sleep delay.to_f
  end

  if redirect_uri
    @history.push(page, page.uri)
    fetch redirect_uri, :get, {}, [], referer, redirects + 1
  end
end
|
546
|
+
|
547
|
+
def response_log response
|
548
|
+
return unless log
|
549
|
+
|
550
|
+
log.info("status: #{response.class} #{response.http_version} " \
|
551
|
+
"#{response.code} #{response.message}")
|
552
|
+
|
553
|
+
response.each_header do |k, v|
|
554
|
+
log.debug("response-header: #{k} => #{v}")
|
555
|
+
end
|
556
|
+
end
|
557
|
+
|
558
|
+
def response_parse response, body, uri
|
559
|
+
@context.parse uri, response, body
|
560
|
+
end
|
561
|
+
|
562
|
+
def response_read response, request
|
563
|
+
body_io = StringIO.new
|
564
|
+
body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
|
565
|
+
total = 0
|
566
|
+
|
567
|
+
begin
|
568
|
+
response.read_body { |part|
|
569
|
+
total += part.length
|
570
|
+
body_io.write(part)
|
571
|
+
log.debug("Read #{part.length} bytes (#{total} total)") if log
|
572
|
+
}
|
573
|
+
rescue Net::HTTP::Persistent::Error => e
|
574
|
+
body_io.rewind
|
575
|
+
raise Mechanize::ResponseReadError.new(e, response, body_io)
|
576
|
+
end
|
577
|
+
|
578
|
+
body_io.rewind
|
579
|
+
|
580
|
+
raise Mechanize::ResponseCodeError, response if
|
581
|
+
Net::HTTPUnknownResponse === response
|
582
|
+
|
583
|
+
content_length = response.content_length
|
584
|
+
|
585
|
+
unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
|
586
|
+
raise EOFError, "Content-Length (#{content_length}) does not match " \
|
587
|
+
"response body length (#{body_io.length})" if
|
588
|
+
content_length and content_length != body_io.length
|
589
|
+
end
|
590
|
+
|
591
|
+
body_io
|
592
|
+
end
|
593
|
+
|
594
|
+
##
# Handles a redirection +response+ according to +redirect_ok+:
#
# true, :all::  every redirect is followed
# false, nil::  +page+ is returned as-is
# :permanent::  only a 301 Moved Permanently response is followed, any other
#               redirect returns +page+ as-is
#
# A followed redirect fetches the Location header with :head when the
# original +method+ was :head and :get otherwise, pushes the resulting page
# onto the history under the URI of the redirecting +page+ and returns it.
#
# Raises Mechanize::RedirectLimitReachedError when following would exceed
# +redirection_limit+.

def response_redirect response, method, page, redirects
  case @redirect_ok
  when true, :all
    # shortcut
  when false, nil
    return page
  when :permanent
    # Test the response object itself; there is no +response_class+ local or
    # method, so referring to one raised NameError for every redirect.
    return page unless Net::HTTPMovedPermanently === response
  end

  log.info("follow redirect to: #{response['Location']}") if log

  from_uri = page.uri

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    redirects + 1 > @redirection_limit

  redirect_method = method == :head ? :head : :get

  page = fetch(response['Location'].to_s, redirect_method, {}, [], page,
               redirects + 1)

  @history.push(page, from_uri)

  return page
end
|
620
|
+
|
621
|
+
def response_authenticate(response, page, uri, request, headers, params,
|
622
|
+
referer)
|
623
|
+
raise Mechanize::ResponseCodeError, page unless @user || @password
|
624
|
+
raise Mechanize::ResponseCodeError, page if @auth_hash.has_key?(uri.host)
|
625
|
+
|
626
|
+
if response['www-authenticate'] =~ /Digest/i
|
627
|
+
@auth_hash[uri.host] = :digest
|
628
|
+
if response['server'] =~ /Microsoft-IIS/
|
629
|
+
@auth_hash[uri.host] = :iis_digest
|
630
|
+
end
|
631
|
+
@digest = response['www-authenticate']
|
632
|
+
else
|
633
|
+
@auth_hash[uri.host] = :basic
|
634
|
+
end
|
635
|
+
|
636
|
+
fetch uri, request.method.downcase.to_sym, headers, params, referer
|
637
|
+
end
|
638
|
+
|
639
|
+
def robots= value
|
640
|
+
require 'webrobots' if value
|
641
|
+
@webrobots = nil if value != @robots
|
642
|
+
@robots = value
|
643
|
+
end
|
644
|
+
|
645
|
+
##
|
646
|
+
# Tests if this agent is allowed to access +uri+, consulting the site's
|
647
|
+
# robots.txt.
|
648
|
+
|
649
|
+
def robots_allowed? uri
|
650
|
+
return true if uri.request_uri == '/robots.txt'
|
651
|
+
|
652
|
+
webrobots.allowed? uri
|
653
|
+
end
|
654
|
+
|
655
|
+
# Opposite of robots_allowed?
|
656
|
+
|
657
|
+
def robots_disallowed? url
|
658
|
+
!robots_allowed? url
|
659
|
+
end
|
660
|
+
|
661
|
+
# Returns an error object if there is an error in fetching or parsing
|
662
|
+
# robots.txt of the site +url+.
|
663
|
+
def robots_error(url)
|
664
|
+
webrobots.error(url)
|
665
|
+
end
|
666
|
+
|
667
|
+
# Raises the error if there is an error in fetching or parsing robots.txt of
|
668
|
+
# the site +url+.
|
669
|
+
def robots_error!(url)
|
670
|
+
webrobots.error!(url)
|
671
|
+
end
|
672
|
+
|
673
|
+
# Removes robots.txt cache for the site +url+.
|
674
|
+
def robots_reset(url)
|
675
|
+
webrobots.reset(url)
|
676
|
+
end
|
677
|
+
|
678
|
+
def set_http
|
679
|
+
@http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri
|
680
|
+
|
681
|
+
@http.keep_alive = @keep_alive_time
|
682
|
+
|
683
|
+
@http.ca_file = @ca_file
|
684
|
+
@http.verify_callback = @verify_callback
|
685
|
+
|
686
|
+
if @cert and @key then
|
687
|
+
cert = if OpenSSL::X509::Certificate === @cert then
|
688
|
+
@cert
|
689
|
+
else
|
690
|
+
OpenSSL::X509::Certificate.new ::File.read @cert
|
691
|
+
end
|
692
|
+
|
693
|
+
key = if OpenSSL::PKey::PKey === @key then
|
694
|
+
@key
|
695
|
+
else
|
696
|
+
OpenSSL::PKey::RSA.new ::File.read(@key), @pass
|
697
|
+
end
|
698
|
+
|
699
|
+
@http.certificate = cert
|
700
|
+
@http.private_key = key
|
701
|
+
end
|
702
|
+
end
|
703
|
+
|
704
|
+
# Sets the proxy address, port, user, and password. +addr+ should be a host,
|
705
|
+
# with no "http://"
|
706
|
+
def set_proxy(addr, port, user = nil, pass = nil)
|
707
|
+
return unless addr and port
|
708
|
+
@proxy_uri = URI "http://#{addr}"
|
709
|
+
@proxy_uri.port = port
|
710
|
+
@proxy_uri.user = user if user
|
711
|
+
@proxy_uri.password = pass if pass
|
712
|
+
|
713
|
+
@proxy_uri
|
714
|
+
end
|
715
|
+
|
716
|
+
def user_agent= user_agent
|
717
|
+
@webrobots = nil if user_agent != @user_agent
|
718
|
+
@user_agent = user_agent
|
719
|
+
end
|
720
|
+
|
721
|
+
# Returns a visited page for the url passed in, otherwise nil
|
722
|
+
def visited_page url
|
723
|
+
@history.visited_page resolve url
|
724
|
+
end
|
725
|
+
|
726
|
+
def get_robots(uri) # :nodoc:
|
727
|
+
fetch(uri).body
|
728
|
+
rescue Mechanize::ResponseCodeError => e
|
729
|
+
return '' if e.response_code == '404'
|
730
|
+
raise e
|
731
|
+
end
|
732
|
+
|
733
|
+
def webrobots
|
734
|
+
@webrobots ||= WebRobots.new(@user_agent, :http_get => method(:get_robots))
|
735
|
+
end
|
736
|
+
|
737
|
+
end
|
738
|
+
|