mechanize 2.1 → 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +28 -0
- data/Manifest.txt +1 -1
- data/README.rdoc +1 -1
- data/Rakefile +1 -1
- data/examples/wikipedia_links_to_philosophy.rb +159 -0
- data/lib/mechanize.rb +68 -5
- data/lib/mechanize/download.rb +9 -8
- data/lib/mechanize/form.rb +8 -0
- data/lib/mechanize/form/field.rb +8 -0
- data/lib/mechanize/http/agent.rb +107 -65
- data/lib/mechanize/http/www_authenticate_parser.rb +14 -0
- data/lib/mechanize/page.rb +8 -10
- data/lib/mechanize/page/meta_refresh.rb +8 -1
- data/lib/mechanize/parser.rb +1 -1
- data/lib/mechanize/response_read_error.rb +15 -4
- data/lib/mechanize/test_case.rb +10 -0
- data/lib/mechanize/util.rb +23 -15
- data/test/htdocs/tc_referer.html +1 -1
- data/test/test_mechanize.rb +48 -2
- data/test/test_mechanize_download.rb +11 -1
- data/test/test_mechanize_file.rb +7 -0
- data/test/test_mechanize_form.rb +16 -1
- data/test/test_mechanize_http_agent.rb +155 -26
- data/test/test_mechanize_page_encoding.rb +6 -0
- data/test/test_mechanize_page_meta_refresh.rb +10 -0
- data/test/test_mechanize_parser.rb +10 -0
- data/test/test_mechanize_response_read_error.rb +28 -0
- data/test/test_mechanize_util.rb +5 -0
- metadata +47 -30
- metadata.gz.sig +0 -0
- data/FAQ.rdoc +0 -11
data.tar.gz.sig
CHANGED
Binary file
|
data/CHANGELOG.rdoc
CHANGED
@@ -1,5 +1,33 @@
|
|
1
1
|
= Mechanize CHANGELOG
|
2
2
|
|
3
|
+
=== 2.1.1 / 2012-02-03
|
4
|
+
|
5
|
+
* Bug fixes
|
6
|
+
* Set missing idle_timeout default. Issue #196
|
7
|
+
* Meta refresh URIs are now escaped (excluding %). Issue #177
|
8
|
+
* Fix charset name extraction. Issue #180
|
9
|
+
* A Referer URI sent on request no longer includes user information
|
10
|
+
or fragment part.
|
11
|
+
* Tempfiles for storing response bodies are unlinked upon creation to avoid
|
12
|
+
possible lack of finalization. Issue #183
|
13
|
+
* The default maximum history size is now 50 pages to avoid filling up a
|
14
|
+
disk with tempfiles accidentally. Related to Issue #183
|
15
|
+
* Errors in bodies with deflate and gzip responses now result in a
|
16
|
+
Mechanize::Error instead of silently being ignored and causing future
|
17
|
+
errors. Issue #185
|
18
|
+
* Mechanize now raises an UnauthorizedError instead of crashing when a 401
|
19
|
+
response does not contain a www-authenticate header. Issue #181
|
20
|
+
* Mechanize gives a useful exception when attempting to click buttons across
|
21
|
+
pages. Issue #186
|
22
|
+
* Added note to Mechanize#cert_store describing how to add certificates in
|
23
|
+
case your system does not come with a default set. Issue #179
|
24
|
+
* Invalid content-disposition headers are now ignored. Issue #191
|
25
|
+
* Fix NTLM by recognizing the "Negotiation" challenge instead of endlessly
|
26
|
+
looping. Issue #192
|
27
|
+
* Allow specification of the NTLM domain through Mechanize#auth. Issue #193
|
28
|
+
* Documented how to convert a Mechanize::ResponseReadError into a File or
|
29
|
+
Page, along with a new method #force_parse. Issue #176
|
30
|
+
|
3
31
|
=== 2.1 / 2011-12-20
|
4
32
|
|
5
33
|
* Deprecations
|
data/Manifest.txt
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
.autotest
|
2
2
|
CHANGELOG.rdoc
|
3
3
|
EXAMPLES.rdoc
|
4
|
-
FAQ.rdoc
|
5
4
|
GUIDE.rdoc
|
6
5
|
LICENSE.rdoc
|
7
6
|
Manifest.txt
|
@@ -12,6 +11,7 @@ examples/mech-dump.rb
|
|
12
11
|
examples/proxy_req.rb
|
13
12
|
examples/rubyforge.rb
|
14
13
|
examples/spider.rb
|
14
|
+
examples/wikipedia_links_to_philosophy.rb
|
15
15
|
lib/mechanize.rb
|
16
16
|
lib/mechanize/content_type_error.rb
|
17
17
|
lib/mechanize/cookie.rb
|
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -17,7 +17,7 @@ hoe = Hoe.spec 'mechanize' do
|
|
17
17
|
rdoc_locations << 'drbrain@rubyforge.org:/var/www/gforge-projects/mechanize/'
|
18
18
|
|
19
19
|
self.extra_deps << ['net-http-digest_auth', '~> 1.1', '>= 1.1.1']
|
20
|
-
self.extra_deps << ['net-http-persistent', '~> 2.
|
20
|
+
self.extra_deps << ['net-http-persistent', '~> 2.4', '>= 2.4.1']
|
21
21
|
self.extra_deps << ['nokogiri', '~> 1.4']
|
22
22
|
self.extra_deps << ['ntlm-http', '~> 0.1', '>= 0.1.1']
|
23
23
|
self.extra_deps << ['webrobots', '~> 0.0', '>= 0.0.9']
|
@@ -0,0 +1,159 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'tsort'
|
3
|
+
|
4
|
+
##
|
5
|
+
# This example implements the alt-text of http://xkcd.com/903/ which states:
|
6
|
+
#
|
7
|
+
# Wikipedia trivia: if you take any article, click on the first link in the
|
8
|
+
# article text not in parentheses or italics, and then repeat, you will
|
9
|
+
# eventually end up at "Philosophy".
|
10
|
+
|
11
|
+
class WikipediaLinksToPhilosophy
|
12
|
+
|
13
|
+
def initialize
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@agent.user_agent_alias = 'Mac Safari' # Wikipedia blocks "mechanize"
|
16
|
+
|
17
|
+
@history = @agent.history
|
18
|
+
|
19
|
+
@wiki_url = URI 'http://en.wikipedia.org'
|
20
|
+
@search_url = @wiki_url + '/w/index.php'
|
21
|
+
@random_url = @wiki_url + '/wiki/Special:Random'
|
22
|
+
|
23
|
+
@title = nil
|
24
|
+
@seen = nil
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# Retrieves the title of the current page
|
29
|
+
|
30
|
+
def extract_title
|
31
|
+
@page.title =~ /(.*) - Wikipedia/
|
32
|
+
|
33
|
+
@title = $1
|
34
|
+
end
|
35
|
+
|
36
|
+
##
|
37
|
+
# Retrieves the initial page. If +query+ is not given a random page is
|
38
|
+
# chosen
|
39
|
+
|
40
|
+
def fetch_first_page query
|
41
|
+
if query then
|
42
|
+
search query
|
43
|
+
else
|
44
|
+
random
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
##
|
49
|
+
# The search is finished if we've seen the page before or we've reached
|
50
|
+
# Philosophy
|
51
|
+
|
52
|
+
def finished?
|
53
|
+
@seen or @title == 'Philosophy'
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Follows the first non-parenthetical, non-italic link in the main body of
|
58
|
+
# the article.
|
59
|
+
|
60
|
+
def follow_first_link
|
61
|
+
puts @title
|
62
|
+
|
63
|
+
# > p > a rejects italics
|
64
|
+
links = @page.root.css('.mw-content-ltr > p > a[href^="/wiki/"]')
|
65
|
+
|
66
|
+
# reject disambiguation and special pages, images and files
|
67
|
+
links = links.reject do |link_node|
|
68
|
+
link_node['href'] =~ %r%/wiki/\w+:|\(disambiguation\)%
|
69
|
+
end
|
70
|
+
|
71
|
+
links = links.reject do |link_node|
|
72
|
+
in_parenthetical? link_node
|
73
|
+
end
|
74
|
+
|
75
|
+
link = links.first
|
76
|
+
|
77
|
+
unless link then
|
78
|
+
# disambiguation page? try the first item in the list
|
79
|
+
link =
|
80
|
+
@page.root.css('.mw-content-ltr > ul > li > a[href^="/wiki/"]').first
|
81
|
+
end
|
82
|
+
|
83
|
+
# convert a Nokogiri HTML element back to a mechanize link
|
84
|
+
link = Mechanize::Page::Link.new link, @agent, @page
|
85
|
+
|
86
|
+
return if @seen = @agent.visited?(link)
|
87
|
+
|
88
|
+
@page = link.click
|
89
|
+
|
90
|
+
extract_title
|
91
|
+
end
|
92
|
+
|
93
|
+
##
|
94
|
+
# Is +link_node+ in an open parenthetical section?
|
95
|
+
|
96
|
+
def in_parenthetical? link_node
|
97
|
+
siblings = link_node.parent.children
|
98
|
+
|
99
|
+
seen = false
|
100
|
+
|
101
|
+
before = siblings.reject do |node|
|
102
|
+
seen or (seen = node == link_node)
|
103
|
+
end
|
104
|
+
|
105
|
+
preceding_text = before.map { |node| node.text }.join
|
106
|
+
|
107
|
+
open = preceding_text.count '('
|
108
|
+
close = preceding_text.count ')'
|
109
|
+
|
110
|
+
open > close
|
111
|
+
end
|
112
|
+
|
113
|
+
##
|
114
|
+
# Prints the result of the search
|
115
|
+
|
116
|
+
def print_result
|
117
|
+
if @seen then
|
118
|
+
puts "[Loop detected]"
|
119
|
+
else
|
120
|
+
puts @title
|
121
|
+
end
|
122
|
+
puts
|
123
|
+
# subtract initial search or Special:Random
|
124
|
+
puts "After #{@agent.history.length - 1} pages"
|
125
|
+
end
|
126
|
+
|
127
|
+
##
|
128
|
+
# Retrieves a random page from wikipedia
|
129
|
+
|
130
|
+
def random
|
131
|
+
@page = @agent.get @random_url
|
132
|
+
|
133
|
+
extract_title
|
134
|
+
end
|
135
|
+
|
136
|
+
##
|
137
|
+
# Entry point
|
138
|
+
|
139
|
+
def run query = nil
|
140
|
+
fetch_first_page query
|
141
|
+
|
142
|
+
follow_first_link until finished?
|
143
|
+
|
144
|
+
print_result
|
145
|
+
end
|
146
|
+
|
147
|
+
##
|
148
|
+
# Searches for +query+ on wikipedia
|
149
|
+
|
150
|
+
def search query
|
151
|
+
@page = @agent.get @search_url, search: query
|
152
|
+
|
153
|
+
extract_title
|
154
|
+
end
|
155
|
+
|
156
|
+
end
|
157
|
+
|
158
|
+
WikipediaLinksToPhilosophy.new.run ARGV.shift if $0 == __FILE__
|
159
|
+
|
data/lib/mechanize.rb
CHANGED
@@ -4,7 +4,6 @@ require 'iconv' if RUBY_VERSION < '1.9.2'
|
|
4
4
|
require 'mutex_m'
|
5
5
|
require 'net/http/digest_auth'
|
6
6
|
require 'net/http/persistent'
|
7
|
-
require 'nkf'
|
8
7
|
require 'nokogiri'
|
9
8
|
require 'openssl'
|
10
9
|
require 'pp'
|
@@ -16,7 +15,7 @@ require 'zlib'
|
|
16
15
|
##
|
17
16
|
# The Mechanize library is used for automating interactions with a website. It
|
18
17
|
# can follow links and submit forms. Form fields can be populated and
|
19
|
-
# submitted. A history of
|
18
|
+
# submitted. A history of URLs is maintained and can be queried.
|
20
19
|
#
|
21
20
|
# == Example
|
22
21
|
#
|
@@ -33,13 +32,47 @@ require 'zlib'
|
|
33
32
|
#
|
34
33
|
# search_results = agent.submit search_form
|
35
34
|
# puts search_results.body
|
35
|
+
#
|
36
|
+
# == Issues with mechanize
|
37
|
+
#
|
38
|
+
# If you think you have a bug with mechanize, but aren't sure, please file a
|
39
|
+
# ticket at https://github.com/tenderlove/mechanize/issues
|
40
|
+
#
|
41
|
+
# Here are some common problems you may experience with mechanize
|
42
|
+
#
|
43
|
+
# === Problems connecting to SSL sites
|
44
|
+
#
|
45
|
+
# Mechanize defaults to validating SSL certificates using the default CA
|
46
|
+
# certificates for your platform. At this time, Windows users do not have
|
47
|
+
# integration between the OS default CA certificates and OpenSSL. #cert_store
|
48
|
+
# explains how to download and use Mozilla's CA certificates to allow SSL
|
49
|
+
# sites to work.
|
50
|
+
#
|
51
|
+
# === Problems with content-length
|
52
|
+
#
|
53
|
+
# Some sites return an incorrect content-length value. Unlike a browser,
|
54
|
+
# mechanize raises an error when the content-length header does not match the
|
55
|
+
# response length since it does not know if there was a connection problem or
|
56
|
+
# if the mismatch is a server bug.
|
57
|
+
#
|
58
|
+
# The error raised, Mechanize::ResponseReadError, can be converted to a parsed
|
59
|
+
# Page, File, etc. depending upon the content-type:
|
60
|
+
#
|
61
|
+
# agent = Mechanize.new
|
62
|
+
# uri = URI 'http://example/invalid_content_length'
|
63
|
+
#
|
64
|
+
# begin
|
65
|
+
# page = agent.get uri
|
66
|
+
# rescue Mechanize::ResponseReadError => e
|
67
|
+
# page = e.force_parse
|
68
|
+
# end
|
36
69
|
|
37
70
|
class Mechanize
|
38
71
|
|
39
72
|
##
|
40
73
|
# The version of Mechanize you are using.
|
41
74
|
|
42
|
-
VERSION = '2.1'
|
75
|
+
VERSION = '2.1.1'
|
43
76
|
|
44
77
|
##
|
45
78
|
# Base mechanize error class
|
@@ -137,6 +170,9 @@ class Mechanize
|
|
137
170
|
@default_encoding = nil
|
138
171
|
@force_default_encoding = false
|
139
172
|
|
173
|
+
# defaults
|
174
|
+
@agent.max_history = 50
|
175
|
+
|
140
176
|
yield self if block_given?
|
141
177
|
|
142
178
|
@agent.set_proxy @proxy_addr, @proxy_port, @proxy_user, @proxy_pass
|
@@ -179,6 +215,11 @@ class Mechanize
|
|
179
215
|
|
180
216
|
##
|
181
217
|
# Sets the maximum number of items allowed in the history to +length+.
|
218
|
+
#
|
219
|
+
# Setting the maximum history length to nil will make the history size
|
220
|
+
# unlimited. Take care when doing this: mechanize stores page bodies in the
|
221
|
+
# temporary files directory for pages in the history. For a long-running
|
222
|
+
# mechanize program this can be quite large.
|
182
223
|
|
183
224
|
def max_history= length
|
184
225
|
@agent.history.max_size = length
|
@@ -518,10 +559,12 @@ class Mechanize
|
|
518
559
|
|
519
560
|
##
|
520
561
|
# Sets the user and password to be used for HTTP authentication.
|
562
|
+
# Also sets the optional +domain+ for NTLM authentication.
|
521
563
|
|
522
|
-
def auth(user, password)
|
564
|
+
def auth(user, password, domain = nil)
|
523
565
|
@agent.user = user
|
524
566
|
@agent.password = password
|
567
|
+
@agent.domain = domain
|
525
568
|
end
|
526
569
|
|
527
570
|
alias basic_auth auth
|
@@ -869,7 +912,25 @@ class Mechanize
|
|
869
912
|
|
870
913
|
##
|
871
914
|
# An OpenSSL certificate store for verifying server certificates. This
|
872
|
-
# defaults to the default certificate store.
|
915
|
+
# defaults to the default certificate store for your system.
|
916
|
+
#
|
917
|
+
# If your system does not ship with a default set of certificates you can
|
918
|
+
# retrieve a copy of the set from Mozilla here:
|
919
|
+
# http://curl.haxx.se/docs/caextract.html
|
920
|
+
#
|
921
|
+
# (Note that this set does not have an HTTPS download option so you may
|
922
|
+
# wish to use the firefox-db2pem.sh script to extract the certificates
|
923
|
+
# from a local install to avoid man-in-the-middle attacks.)
|
924
|
+
#
|
925
|
+
# After downloading or generating a cacert.pem from the above link you
|
926
|
+
# can create a certificate store from the pem file like this:
|
927
|
+
#
|
928
|
+
# cert_store = OpenSSL::X509::Store.new
|
929
|
+
# cert_store.add_file 'cacert.pem'
|
930
|
+
#
|
931
|
+
# And have mechanize use it with:
|
932
|
+
#
|
933
|
+
# agent.cert_store = cert_store
|
873
934
|
|
874
935
|
def cert_store
|
875
936
|
@agent.cert_store
|
@@ -877,6 +938,8 @@ class Mechanize
|
|
877
938
|
|
878
939
|
##
|
879
940
|
# Sets the OpenSSL certificate store to +store+.
|
941
|
+
#
|
942
|
+
# See also #cert_store
|
880
943
|
|
881
944
|
def cert_store= cert_store
|
882
945
|
@agent.cert_store = cert_store
|
data/lib/mechanize/download.rb
CHANGED
@@ -9,6 +9,12 @@ class Mechanize::Download
|
|
9
9
|
|
10
10
|
include Mechanize::Parser
|
11
11
|
|
12
|
+
##
|
13
|
+
# The filename for this file based on the content-disposition of the
|
14
|
+
# response or the basename of the URL
|
15
|
+
|
16
|
+
attr_accessor :filename
|
17
|
+
|
12
18
|
##
|
13
19
|
# Accessor for the IO-like that contains the body
|
14
20
|
|
@@ -43,15 +49,10 @@ class Mechanize::Download
|
|
43
49
|
dirname = File.dirname filename
|
44
50
|
FileUtils.mkdir_p dirname
|
45
51
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
until @body_io.eof? do
|
50
|
-
io.write @body_io.read 16384
|
51
|
-
end
|
52
|
+
open filename, 'wb' do |io|
|
53
|
+
until @body_io.eof? do
|
54
|
+
io.write @body_io.read 16384
|
52
55
|
end
|
53
|
-
else
|
54
|
-
FileUtils.mv @body_io.path, filename
|
55
56
|
end
|
56
57
|
end
|
57
58
|
|
data/lib/mechanize/form.rb
CHANGED
@@ -255,6 +255,14 @@ class Mechanize::Form
|
|
255
255
|
# This method adds a button to the query. If the form needs to be
|
256
256
|
# submitted with multiple buttons, pass each button to this method.
|
257
257
|
def add_button_to_query(button)
|
258
|
+
unless button.node.document == @form_node.document then
|
259
|
+
message =
|
260
|
+
"#{button.inspect} does not belong to the same page as " \
|
261
|
+
"the form #{@name.inspect} in #{@page.uri}"
|
262
|
+
|
263
|
+
raise ArgumentError, message
|
264
|
+
end
|
265
|
+
|
258
266
|
@clicked_buttons << button
|
259
267
|
end
|
260
268
|
|
data/lib/mechanize/form/field.rb
CHANGED
@@ -50,5 +50,13 @@ class Mechanize::Form::Field
|
|
50
50
|
def dom_class
|
51
51
|
node['class']
|
52
52
|
end
|
53
|
+
|
54
|
+
def inspect # :nodoc:
|
55
|
+
"[%s:0x%x type: %s name: %s value: %s]" % [
|
56
|
+
self.class.name.sub(/Mechanize::Form::/, '').downcase,
|
57
|
+
object_id, @type, @name, @value
|
58
|
+
]
|
59
|
+
end
|
60
|
+
|
53
61
|
end
|
54
62
|
|
data/lib/mechanize/http/agent.rb
CHANGED
@@ -47,6 +47,7 @@ class Mechanize::HTTP::Agent
|
|
47
47
|
attr_reader :digest_challenges # :nodoc:
|
48
48
|
attr_accessor :user
|
49
49
|
attr_accessor :password
|
50
|
+
attr_accessor :domain
|
50
51
|
|
51
52
|
# :section: Redirection
|
52
53
|
|
@@ -156,7 +157,7 @@ class Mechanize::HTTP::Agent
|
|
156
157
|
@follow_meta_refresh_self = false
|
157
158
|
@gzip_enabled = true
|
158
159
|
@history = Mechanize::History.new
|
159
|
-
@idle_timeout =
|
160
|
+
@idle_timeout = 5
|
160
161
|
@keep_alive = true
|
161
162
|
@keep_alive_time = 300
|
162
163
|
@max_file_buffer = 10240
|
@@ -184,6 +185,7 @@ class Mechanize::HTTP::Agent
|
|
184
185
|
@digest_challenges = {}
|
185
186
|
@password = nil # HTTP auth password
|
186
187
|
@user = nil # HTTP auth user
|
188
|
+
@domain = nil # NTLM HTTP domain
|
187
189
|
|
188
190
|
# SSL
|
189
191
|
@ca_file = nil
|
@@ -264,7 +266,7 @@ class Mechanize::HTTP::Agent
|
|
264
266
|
response = connection.request(uri, request) { |res|
|
265
267
|
response_log res
|
266
268
|
|
267
|
-
response_body_io = response_read res, request
|
269
|
+
response_body_io = response_read res, request, uri
|
268
270
|
|
269
271
|
res
|
270
272
|
}
|
@@ -392,6 +394,62 @@ class Mechanize::HTTP::Agent
|
|
392
394
|
end
|
393
395
|
end
|
394
396
|
|
397
|
+
##
|
398
|
+
# Decodes a gzip-encoded +body_io+. If it cannot be decoded, inflate is
|
399
|
+
# tried, and an error is raised if that also fails.
|
400
|
+
|
401
|
+
def content_encoding_gunzip body_io
|
402
|
+
log.debug('gzip response') if log
|
403
|
+
|
404
|
+
zio = Zlib::GzipReader.new body_io
|
405
|
+
out_io = Tempfile.new 'mechanize-decode'
|
406
|
+
out_io.unlink
|
407
|
+
out_io.binmode
|
408
|
+
|
409
|
+
until zio.eof? do
|
410
|
+
out_io.write zio.read 16384
|
411
|
+
end
|
412
|
+
|
413
|
+
zio.finish
|
414
|
+
|
415
|
+
return out_io
|
416
|
+
rescue Zlib::Error
|
417
|
+
log.error('unable to gunzip response, trying raw inflate') if log
|
418
|
+
|
419
|
+
body_io.rewind
|
420
|
+
body_io.read 10
|
421
|
+
|
422
|
+
begin
|
423
|
+
return inflate body_io, -Zlib::MAX_WBITS
|
424
|
+
rescue Zlib::Error => e
|
425
|
+
log.error("unable to gunzip response: #{e}") if log
|
426
|
+
raise
|
427
|
+
end
|
428
|
+
ensure
|
429
|
+
zio.close if zio and not zio.closed?
|
430
|
+
end
|
431
|
+
|
432
|
+
##
|
433
|
+
# Decodes a deflate-encoded +body_io+. If it cannot be decoded, raw inflate
|
434
|
+
# is tried, and an error is raised if that also fails.
|
435
|
+
|
436
|
+
def content_encoding_inflate body_io
|
437
|
+
log.debug('deflate body') if log
|
438
|
+
|
439
|
+
return inflate body_io
|
440
|
+
rescue Zlib::Error
|
441
|
+
log.error('unable to inflate response, trying raw deflate') if log
|
442
|
+
|
443
|
+
body_io.rewind
|
444
|
+
|
445
|
+
begin
|
446
|
+
return inflate body_io, -Zlib::MAX_WBITS
|
447
|
+
rescue Zlib::Error => e
|
448
|
+
log.error("unable to inflate response: #{e}") if log
|
449
|
+
raise
|
450
|
+
end
|
451
|
+
end
|
452
|
+
|
395
453
|
def disable_keep_alive request
|
396
454
|
request['connection'] = 'close' unless @keep_alive
|
397
455
|
end
|
@@ -491,11 +549,17 @@ class Mechanize::HTTP::Agent
|
|
491
549
|
end
|
492
550
|
end
|
493
551
|
|
552
|
+
# Sets a Referer header. Fragment part is removed as demanded by
|
553
|
+
# RFC 2616 14.36, and user information part is removed just like
|
554
|
+
# major browsers do.
|
494
555
|
def request_referer request, uri, referer
|
495
556
|
return unless referer
|
496
557
|
return if 'https' == referer.scheme.downcase and
|
497
558
|
'https' != uri.scheme.downcase
|
498
|
-
|
559
|
+
if referer.fragment || referer.user || referer.password
|
560
|
+
referer = referer.dup
|
561
|
+
referer.fragment = referer.user = referer.password = nil
|
562
|
+
end
|
499
563
|
request['Referer'] = referer
|
500
564
|
end
|
501
565
|
|
@@ -602,7 +666,11 @@ class Mechanize::HTTP::Agent
|
|
602
666
|
referer)
|
603
667
|
raise Mechanize::UnauthorizedError, page unless @user || @password
|
604
668
|
|
605
|
-
|
669
|
+
www_authenticate = response['www-authenticate']
|
670
|
+
|
671
|
+
raise Mechanize::UnauthorizedError, page unless www_authenticate
|
672
|
+
|
673
|
+
challenges = @authenticate_parser.parse www_authenticate
|
606
674
|
|
607
675
|
if challenge = challenges.find { |c| c.scheme =~ /^Digest$/i } then
|
608
676
|
realm = challenge.realm uri
|
@@ -631,7 +699,7 @@ class Mechanize::HTTP::Agent
|
|
631
699
|
if challenge.params then
|
632
700
|
type_2 = Net::NTLM::Message.decode64 challenge.params
|
633
701
|
|
634
|
-
type_3 = type_2.response({ :user => @user, :password => @password, },
|
702
|
+
type_3 = type_2.response({ :user => @user, :password => @password, :domain => @domain },
|
635
703
|
{ :ntlmv2 => true }).encode64
|
636
704
|
|
637
705
|
headers['Authorization'] = "NTLM #{type_3}"
|
@@ -656,71 +724,42 @@ class Mechanize::HTTP::Agent
|
|
656
724
|
end
|
657
725
|
|
658
726
|
def response_content_encoding response, body_io
|
659
|
-
length = response.content_length
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
body_io.length
|
666
|
-
end unless length
|
667
|
-
|
668
|
-
out_io = nil
|
669
|
-
|
670
|
-
case response['Content-Encoding']
|
671
|
-
when nil, 'none', '7bit' then
|
672
|
-
out_io = body_io
|
673
|
-
when 'deflate' then
|
674
|
-
log.debug('deflate body') if log
|
675
|
-
|
676
|
-
return if length.zero?
|
677
|
-
|
678
|
-
begin
|
679
|
-
out_io = inflate body_io
|
680
|
-
rescue Zlib::BufError, Zlib::DataError
|
681
|
-
log.error('Unable to inflate page, retrying with raw deflate') if log
|
682
|
-
body_io.rewind
|
683
|
-
begin
|
684
|
-
out_io = inflate body_io, -Zlib::MAX_WBITS
|
685
|
-
rescue Zlib::BufError, Zlib::DataError
|
686
|
-
log.error("unable to inflate page: #{$!}") if log
|
687
|
-
nil
|
688
|
-
end
|
727
|
+
length = response.content_length ||
|
728
|
+
case body_io
|
729
|
+
when Tempfile, IO then
|
730
|
+
body_io.stat.size
|
731
|
+
else
|
732
|
+
body_io.length
|
689
733
|
end
|
690
|
-
when 'gzip', 'x-gzip' then
|
691
|
-
log.debug('gzip body') if log
|
692
734
|
|
693
|
-
|
735
|
+
return body_io if length.zero?
|
694
736
|
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
body_io.read 10
|
707
|
-
|
708
|
-
out_io = inflate body_io, -Zlib::MAX_WBITS
|
709
|
-
rescue Zlib::DataError
|
710
|
-
log.error("unable to gunzip page: #{$!}") if log
|
711
|
-
''
|
712
|
-
ensure
|
713
|
-
zio.close if zio and not zio.closed?
|
714
|
-
end
|
715
|
-
else
|
716
|
-
raise Mechanize::Error,
|
717
|
-
"Unsupported Content-Encoding: #{response['Content-Encoding']}"
|
718
|
-
end
|
737
|
+
out_io = case response['Content-Encoding']
|
738
|
+
when nil, 'none', '7bit' then
|
739
|
+
body_io
|
740
|
+
when 'deflate' then
|
741
|
+
content_encoding_inflate body_io
|
742
|
+
when 'gzip', 'x-gzip' then
|
743
|
+
content_encoding_gunzip body_io
|
744
|
+
else
|
745
|
+
raise Mechanize::Error,
|
746
|
+
"unsupported content-encoding: #{response['Content-Encoding']}"
|
747
|
+
end
|
719
748
|
|
720
749
|
out_io.flush
|
721
750
|
out_io.rewind
|
722
751
|
|
723
752
|
out_io
|
753
|
+
rescue Zlib::Error => e
|
754
|
+
message = "error handling content-encoding #{response['Content-Encoding']}:"
|
755
|
+
message << " #{e.message} (#{e.class})"
|
756
|
+
raise Mechanize::Error, message
|
757
|
+
ensure
|
758
|
+
begin
|
759
|
+
body_io.close! if Tempfile === body_io and out_io.path != body_io.path
|
760
|
+
rescue IOError
|
761
|
+
# HACK ruby 1.8 raises IOError when closing the stream
|
762
|
+
end
|
724
763
|
end
|
725
764
|
|
726
765
|
def response_cookies response, uri, page
|
@@ -778,11 +817,12 @@ class Mechanize::HTTP::Agent
|
|
778
817
|
@context.parse uri, response, body_io
|
779
818
|
end
|
780
819
|
|
781
|
-
def response_read response, request
|
820
|
+
def response_read response, request, uri
|
782
821
|
content_length = response.content_length
|
783
822
|
|
784
823
|
if content_length and content_length > @max_file_buffer then
|
785
824
|
body_io = Tempfile.new 'mechanize-raw'
|
825
|
+
body_io.unlink
|
786
826
|
body_io.binmode if defined? body_io.binmode
|
787
827
|
else
|
788
828
|
body_io = StringIO.new
|
@@ -797,7 +837,8 @@ class Mechanize::HTTP::Agent
|
|
797
837
|
|
798
838
|
if StringIO === body_io and total > @max_file_buffer then
|
799
839
|
new_io = Tempfile.new 'mechanize-raw'
|
800
|
-
new_io.
|
840
|
+
new_io.unlink
|
841
|
+
new_io.binmode
|
801
842
|
|
802
843
|
new_io.write body_io.string
|
803
844
|
|
@@ -809,7 +850,8 @@ class Mechanize::HTTP::Agent
|
|
809
850
|
}
|
810
851
|
rescue Net::HTTP::Persistent::Error => e
|
811
852
|
body_io.rewind
|
812
|
-
raise Mechanize::ResponseReadError.new(e, response, body_io)
|
853
|
+
raise Mechanize::ResponseReadError.new(e, response, body_io, uri,
|
854
|
+
@context)
|
813
855
|
end
|
814
856
|
|
815
857
|
body_io.flush
|