mechanize 2.0.pre.2 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +22 -0
- data/Manifest.txt +11 -8
- data/Rakefile +2 -2
- data/examples/flickr_upload.rb +6 -7
- data/examples/mech-dump.rb +0 -2
- data/examples/proxy_req.rb +0 -2
- data/examples/rubyforge.rb +1 -3
- data/examples/spider.rb +2 -3
- data/lib/mechanize.rb +228 -680
- data/lib/mechanize/form/field.rb +1 -1
- data/lib/mechanize/history.rb +23 -5
- data/lib/mechanize/http.rb +3 -0
- data/lib/mechanize/http/agent.rb +738 -0
- data/lib/mechanize/inspect.rb +2 -2
- data/lib/mechanize/page.rb +101 -42
- data/lib/mechanize/page/frame.rb +24 -17
- data/lib/mechanize/page/link.rb +72 -54
- data/lib/mechanize/page/meta_refresh.rb +56 -0
- data/lib/mechanize/response_read_error.rb +27 -0
- data/test/htdocs/frame_referer_test.html +10 -0
- data/test/htdocs/tc_referer.html +4 -0
- data/test/test_frames.rb +9 -0
- data/test/test_history.rb +74 -98
- data/test/test_mechanize.rb +334 -812
- data/test/test_mechanize_form.rb +32 -3
- data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
- data/test/test_mechanize_http_agent.rb +697 -0
- data/test/test_mechanize_link.rb +83 -0
- data/test/test_mechanize_page_encoding.rb +147 -0
- data/test/test_mechanize_page_link.rb +379 -0
- data/test/test_mechanize_page_meta_refresh.rb +115 -0
- data/test/test_pretty_print.rb +1 -1
- data/test/test_referer.rb +29 -5
- data/test/test_response_code.rb +21 -20
- data/test/test_robots.rb +13 -17
- data/test/test_scheme.rb +1 -1
- metadata +30 -31
- metadata.gz.sig +0 -0
- data/lib/mechanize/page/meta.rb +0 -48
- data/test/test_form_no_inputname.rb +0 -15
- data/test/test_links.rb +0 -146
- data/test/test_mechanize_page.rb +0 -224
- data/test/test_meta.rb +0 -67
- data/test/test_upload.rb +0 -109
- data/test/test_verbs.rb +0 -25
data.tar.gz.sig
CHANGED
Binary file
|
data/CHANGELOG.rdoc
CHANGED
@@ -27,6 +27,12 @@ Mechanize is now under the MIT license
|
|
27
27
|
* Mechanize#get no longer accepts the referer as the second argument.
|
28
28
|
* Mechanize#get no longer allows the HTTP method to be changed (:verb
|
29
29
|
option).
|
30
|
+
* Mechanize::Page::Meta is now Mechanize::Page::MetaRefresh to accurately
|
31
|
+
depict its responsibilities.
|
32
|
+
* Mechanize::Page#meta is now Mechanize::Page#meta_refresh as it only
|
33
|
+
contains meta elements with http-equiv of "refresh"
|
34
|
+
* Mechanize::Page#charset is now Mechanize::Page::charset. GH #112, patch
|
35
|
+
by Godfrey Chan.
|
30
36
|
|
31
37
|
* Deprecations
|
32
38
|
* Mechanize#get with an options hash is deprecated and will be removed after
|
@@ -42,6 +48,7 @@ Mechanize is now under the MIT license
|
|
42
48
|
current page. It works as expected when not passed a string or regexp.
|
43
49
|
* Provide a way to only follow permanent redirects (301)
|
44
50
|
automatically: <tt>agent.redirect_ok = :permanent</tt> GH #73
|
51
|
+
* Mechanize now supports HTML5 meta charset. GH #113
|
45
52
|
* Documented various Mechanize accessors. GH #66
|
46
53
|
* Mechanize now uses net-http-digest_auth. GH #31
|
47
54
|
* Mechanize now implements session cookies. GH #78
|
@@ -60,6 +67,18 @@ Mechanize is now under the MIT license
|
|
60
67
|
* Manual robots.txt test can be performed with
|
61
68
|
Mechanize#robots_allowed? and #robots_disallowed?.
|
62
69
|
* Mechanize::Form now supports the accept-charset attribute. GH #96
|
70
|
+
* Mechanize::ResponseReadError is raised if there is an exception while
|
71
|
+
reading the response body. This allows recovery from broken HTTP servers
|
72
|
+
(or connections). GH #90
|
73
|
+
* Mechanize#follow_meta_refresh set to :anywhere will follow meta refresh
|
74
|
+
found outside of a document's head. GH #99
|
75
|
+
* Add support for HTML5's rel="noreferrer" attribute which indicates
|
76
|
+
no "Referer" information should be sent when following the link.
|
77
|
+
* A frame will now load its content when #content is called. GH #111
|
78
|
+
* Added Mechanize#default_encoding to provide a default for pages with no
|
79
|
+
encoding specified. GH #104
|
80
|
+
* Added Mechanize#force_default_encoding which only uses
|
81
|
+
Mechanize#default_encoding for parsing HTML. GH #104
|
63
82
|
|
64
83
|
* Bug Fixes:
|
65
84
|
|
@@ -85,6 +104,9 @@ Mechanize is now under the MIT license
|
|
85
104
|
* Mechanize now unescapes URIs for meta refresh. GH #68
|
86
105
|
* Mechanize now has more robust HTML charset detection. GH #43
|
87
106
|
* Mechanize::Form::Textarea is now created from a textarea element. GH #94
|
107
|
+
* A meta content-type now overrides the HTTP content type. GH #114
|
108
|
+
* Mechanize::Page::Link#uri now handles both escaped and unescaped hrefs.
|
109
|
+
GH #107
|
88
110
|
|
89
111
|
=== 1.0.0
|
90
112
|
|
data/Manifest.txt
CHANGED
@@ -33,6 +33,8 @@ lib/mechanize/form/radio_button.rb
|
|
33
33
|
lib/mechanize/form/select_list.rb
|
34
34
|
lib/mechanize/headers.rb
|
35
35
|
lib/mechanize/history.rb
|
36
|
+
lib/mechanize/http.rb
|
37
|
+
lib/mechanize/http/agent.rb
|
36
38
|
lib/mechanize/inspect.rb
|
37
39
|
lib/mechanize/monkey_patch.rb
|
38
40
|
lib/mechanize/page.rb
|
@@ -41,11 +43,12 @@ lib/mechanize/page/frame.rb
|
|
41
43
|
lib/mechanize/page/image.rb
|
42
44
|
lib/mechanize/page/label.rb
|
43
45
|
lib/mechanize/page/link.rb
|
44
|
-
lib/mechanize/page/meta.rb
|
46
|
+
lib/mechanize/page/meta_refresh.rb
|
45
47
|
lib/mechanize/pluggable_parsers.rb
|
46
48
|
lib/mechanize/redirect_limit_reached_error.rb
|
47
49
|
lib/mechanize/redirect_not_get_or_head_error.rb
|
48
50
|
lib/mechanize/response_code_error.rb
|
51
|
+
lib/mechanize/response_read_error.rb
|
49
52
|
lib/mechanize/robots_disallowed_error.rb
|
50
53
|
lib/mechanize/unsupported_scheme_error.rb
|
51
54
|
lib/mechanize/util.rb
|
@@ -73,6 +76,7 @@ test/htdocs/form_select_none.html
|
|
73
76
|
test/htdocs/form_select_noopts.html
|
74
77
|
test/htdocs/form_set_fields.html
|
75
78
|
test/htdocs/form_test.html
|
79
|
+
test/htdocs/frame_referer_test.html
|
76
80
|
test/htdocs/frame_test.html
|
77
81
|
test/htdocs/google.html
|
78
82
|
test/htdocs/iframe_test.html
|
@@ -117,7 +121,6 @@ test/test_cookies.rb
|
|
117
121
|
test/test_form_action.rb
|
118
122
|
test/test_form_as_hash.rb
|
119
123
|
test/test_form_button.rb
|
120
|
-
test/test_form_no_inputname.rb
|
121
124
|
test/test_frames.rb
|
122
125
|
test/test_headers.rb
|
123
126
|
test/test_history.rb
|
@@ -125,7 +128,6 @@ test/test_history_added.rb
|
|
125
128
|
test/test_html_unscape_forms.rb
|
126
129
|
test/test_if_modified_since.rb
|
127
130
|
test/test_images.rb
|
128
|
-
test/test_links.rb
|
129
131
|
test/test_mechanize.rb
|
130
132
|
test/test_mechanize_cookie.rb
|
131
133
|
test/test_mechanize_cookie_jar.rb
|
@@ -137,11 +139,15 @@ test/test_mechanize_form_check_box.rb
|
|
137
139
|
test/test_mechanize_form_encoding.rb
|
138
140
|
test/test_mechanize_form_field.rb
|
139
141
|
test/test_mechanize_form_image_button.rb
|
140
|
-
test/test_mechanize_page.rb
|
142
|
+
test/test_mechanize_form_textarea.rb
|
143
|
+
test/test_mechanize_http_agent.rb
|
144
|
+
test/test_mechanize_link.rb
|
145
|
+
test/test_mechanize_page_encoding.rb
|
146
|
+
test/test_mechanize_page_link.rb
|
147
|
+
test/test_mechanize_page_meta_refresh.rb
|
141
148
|
test/test_mechanize_redirect_not_get_or_head_error.rb
|
142
149
|
test/test_mechanize_subclass.rb
|
143
150
|
test/test_mechanize_util.rb
|
144
|
-
test/test_meta.rb
|
145
151
|
test/test_multi_select.rb
|
146
152
|
test/test_no_attributes.rb
|
147
153
|
test/test_option.rb
|
@@ -163,6 +169,3 @@ test/test_select_none.rb
|
|
163
169
|
test/test_select_noopts.rb
|
164
170
|
test/test_set_fields.rb
|
165
171
|
test/test_ssl_server.rb
|
166
|
-
test/test_textarea.rb
|
167
|
-
test/test_upload.rb
|
168
|
-
test/test_verbs.rb
|
data/Rakefile
CHANGED
@@ -14,9 +14,9 @@ Hoe.spec 'mechanize' do
|
|
14
14
|
self.extra_rdoc_files += Dir['*.rdoc']
|
15
15
|
|
16
16
|
self.extra_deps << ['nokogiri', '~> 1.4']
|
17
|
-
self.extra_deps << ['net-http-persistent', '~> 1.
|
17
|
+
self.extra_deps << ['net-http-persistent', '~> 1.8']
|
18
18
|
self.extra_deps << ['net-http-digest_auth', '~> 1.1', '>= 1.1.1']
|
19
|
-
self.extra_deps << ['webrobots', '~> 0.0', '>= 0.0.
|
19
|
+
self.extra_deps << ['webrobots', '~> 0.0', '>= 0.0.9']
|
20
20
|
|
21
21
|
self.spec_extras[:required_ruby_version] = '>= 1.8.7'
|
22
22
|
end
|
data/examples/flickr_upload.rb
CHANGED
@@ -1,23 +1,22 @@
|
|
1
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
2
|
-
|
3
1
|
require 'rubygems'
|
4
2
|
require 'mechanize'
|
5
3
|
|
6
4
|
agent = Mechanize.new
|
7
5
|
|
8
6
|
# Get the flickr sign in page
|
9
|
-
page = agent.get
|
7
|
+
page = agent.get 'http://flickr.com/signin/flickr/'
|
10
8
|
|
11
9
|
# Fill out the login form
|
12
|
-
form = page.form_with
|
10
|
+
form = page.form_with :name => 'flickrloginform'
|
13
11
|
form.email = ARGV[0]
|
14
12
|
form.password = ARGV[1]
|
15
|
-
|
13
|
+
form.submit
|
16
14
|
|
17
15
|
# Go to the upload page
|
18
|
-
page =
|
16
|
+
page = page.link_with(:text => 'Upload').click
|
19
17
|
|
20
18
|
# Fill out the form
|
21
19
|
form = page.forms.action('/photos_upload_process.gne').first
|
22
20
|
form.file_uploads.name('file1').first.file_name = ARGV[2]
|
23
|
-
|
21
|
+
form.submit
|
22
|
+
|
data/examples/mech-dump.rb
CHANGED
data/examples/proxy_req.rb
CHANGED
data/examples/rubyforge.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
2
|
-
|
3
1
|
# This example logs a user in to rubyforge and prints out the body of the
|
4
2
|
# page after logging the user in.
|
5
3
|
require 'rubygems'
|
@@ -17,6 +15,6 @@ form.form_loginname = ARGV[0]
|
|
17
15
|
form.form_pw = ARGV[1]
|
18
16
|
|
19
17
|
# Submit the form
|
20
|
-
page =
|
18
|
+
page = form.submit form.buttons.first
|
21
19
|
|
22
20
|
puts page.body # Print out the body
|
data/examples/spider.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
2
|
-
|
3
1
|
require 'rubygems'
|
4
2
|
require 'mechanize'
|
5
3
|
|
@@ -7,13 +5,14 @@ agent = Mechanize.new
|
|
7
5
|
stack = agent.get(ARGV[0]).links
|
8
6
|
|
9
7
|
while l = stack.pop
|
8
|
+
next unless l.uri
|
10
9
|
host = l.uri.host
|
11
10
|
next unless host.nil? or host == agent.history.first.uri.host
|
12
11
|
next if agent.visited? l.href
|
13
12
|
|
14
13
|
puts "crawling #{l.uri}"
|
15
14
|
begin
|
16
|
-
page =
|
15
|
+
page = l.click
|
17
16
|
next unless Mechanize::Page === page
|
18
17
|
stack.push(*page.links)
|
19
18
|
rescue Mechanize::ResponseCodeError
|
data/lib/mechanize.rb
CHANGED
@@ -43,6 +43,7 @@ class Mechanize
|
|
43
43
|
else
|
44
44
|
"#{RUBY_VERSION}dev#{RUBY_REVISION}"
|
45
45
|
end
|
46
|
+
|
46
47
|
##
|
47
48
|
# User Agent aliases
|
48
49
|
|
@@ -61,74 +62,177 @@ class Mechanize
|
|
61
62
|
}
|
62
63
|
|
63
64
|
# A Mechanize::CookieJar which stores cookies
|
64
|
-
|
65
|
+
|
66
|
+
def cookie_jar
|
67
|
+
@agent.cookie_jar
|
68
|
+
end
|
69
|
+
|
70
|
+
def cookie_jar= cookie_jar
|
71
|
+
@agent.cookie_jar = cookie_jar
|
72
|
+
end
|
65
73
|
|
66
74
|
# Length of time to wait until a connection is opened in seconds
|
67
|
-
|
75
|
+
def open_timeout
|
76
|
+
@agent.open_timeout
|
77
|
+
end
|
78
|
+
|
79
|
+
def open_timeout= open_timeout
|
80
|
+
@agent.open_timeout = open_timeout
|
81
|
+
end
|
68
82
|
|
69
83
|
# Length of time to attempt to read data from the server
|
70
|
-
|
84
|
+
def read_timeout
|
85
|
+
@agent.read_timeout
|
86
|
+
end
|
87
|
+
|
88
|
+
def read_timeout= read_timeout
|
89
|
+
@agent.read_timeout = read_timeout
|
90
|
+
end
|
71
91
|
|
72
92
|
# The identification string for the client initiating a web request
|
73
|
-
|
93
|
+
def user_agent
|
94
|
+
@agent.user_agent
|
95
|
+
end
|
74
96
|
|
75
97
|
# The value of watch_for_set is passed to pluggable parsers for retrieved
|
76
98
|
# content
|
77
99
|
attr_accessor :watch_for_set
|
78
100
|
|
79
101
|
# Path to an OpenSSL server certificate file
|
80
|
-
|
102
|
+
def ca_file
|
103
|
+
@agent.ca_file
|
104
|
+
end
|
105
|
+
|
106
|
+
def ca_file= ca_file
|
107
|
+
@agent.ca_file = ca_file
|
108
|
+
end
|
109
|
+
|
110
|
+
def certificate
|
111
|
+
@agent.certificate
|
112
|
+
end
|
81
113
|
|
82
114
|
# An OpenSSL private key or the path to a private key
|
83
|
-
|
115
|
+
def key
|
116
|
+
@agent.key
|
117
|
+
end
|
118
|
+
|
119
|
+
def key= key
|
120
|
+
@agent.key = key
|
121
|
+
end
|
84
122
|
|
85
123
|
# An OpenSSL client certificate or the path to a certificate file.
|
86
|
-
|
124
|
+
def cert
|
125
|
+
@agent.cert
|
126
|
+
end
|
127
|
+
|
128
|
+
def cert= cert
|
129
|
+
@agent.cert = cert
|
130
|
+
end
|
87
131
|
|
88
132
|
# OpenSSL key password
|
89
|
-
|
133
|
+
def pass
|
134
|
+
@agent.pass
|
135
|
+
end
|
136
|
+
|
137
|
+
def pass= pass
|
138
|
+
@agent.pass = pass
|
139
|
+
end
|
140
|
+
|
141
|
+
# Controls how this agent deals with redirects. The following values are
|
142
|
+
# allowed:
|
143
|
+
#
|
144
|
+
# :all, true:: All 3xx redirects are followed (default)
|
145
|
+
# :permanent:: Only 301 Moved Permanently redirects are followed
|
146
|
+
# false:: No redirects are followed
|
90
147
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
# Permanently) redirects are followed. If it is a false value, no
|
95
|
-
# redirects are followed.
|
96
|
-
attr_accessor :redirect_ok
|
148
|
+
def redirect_ok
|
149
|
+
@agent.redirect_ok
|
150
|
+
end
|
97
151
|
|
98
|
-
|
99
|
-
|
152
|
+
def redirect_ok= follow
|
153
|
+
@agent.redirect_ok = follow
|
154
|
+
end
|
100
155
|
|
101
|
-
def
|
102
|
-
|
103
|
-
@webrobots = nil if value != @robots
|
104
|
-
@robots = value
|
156
|
+
def gzip_enabled
|
157
|
+
@agent.gzip_enabled
|
105
158
|
end
|
106
159
|
|
107
160
|
# Disables HTTP/1.1 gzip compression (enabled by default)
|
108
|
-
|
161
|
+
def gzip_enabled=enabled
|
162
|
+
@agent.gzip_enabled = enabled
|
163
|
+
end
|
109
164
|
|
110
165
|
# HTTP/1.0 keep-alive time
|
111
|
-
|
166
|
+
def keep_alive_time
|
167
|
+
@agent.keep_alive_time
|
168
|
+
end
|
169
|
+
|
170
|
+
def keep_alive_time= keep_alive_time
|
171
|
+
@agent.keep_alive_time = keep_alive_time
|
172
|
+
end
|
112
173
|
|
113
174
|
# HTTP/1.1 keep-alives are always active. This does nothing.
|
114
175
|
attr_accessor :keep_alive
|
115
176
|
|
177
|
+
def conditional_requests
|
178
|
+
@agent.conditional_requests
|
179
|
+
end
|
180
|
+
|
116
181
|
# Disables If-Modified-Since conditional requests (enabled by default)
|
117
|
-
|
182
|
+
def conditional_requests= enabled
|
183
|
+
@agent.conditional_requests = enabled
|
184
|
+
end
|
118
185
|
|
119
|
-
# Follow HTML meta refresh
|
120
|
-
|
186
|
+
# Follow HTML meta refresh. If set to +:anywhere+ meta refresh tags outside
|
187
|
+
# of the head element will be followed.
|
188
|
+
def follow_meta_refresh
|
189
|
+
@agent.follow_meta_refresh
|
190
|
+
end
|
191
|
+
|
192
|
+
def follow_meta_refresh= follow
|
193
|
+
@agent.follow_meta_refresh = follow
|
194
|
+
end
|
121
195
|
|
122
196
|
# A callback for additional certificate verification. See
|
123
197
|
# OpenSSL::SSL::SSLContext#verify_callback
|
124
|
-
|
198
|
+
#
|
199
|
+
# The callback can be used for debugging or to ignore errors by always
|
200
|
+
# returning +true+. Specifying nil uses the default method that was valid
|
201
|
+
# when the SSLContext was created
|
202
|
+
def verify_callback
|
203
|
+
@agent.verify_callback
|
204
|
+
end
|
205
|
+
|
206
|
+
def verify_callback= verify_callback
|
207
|
+
@agent.verify_callback = verify_callback
|
208
|
+
end
|
125
209
|
|
126
210
|
attr_accessor :history_added
|
127
|
-
|
128
|
-
|
211
|
+
|
212
|
+
def redirection_limit
|
213
|
+
@agent.redirection_limit
|
214
|
+
end
|
215
|
+
|
216
|
+
def redirection_limit= limit
|
217
|
+
@agent.redirection_limit = limit
|
218
|
+
end
|
219
|
+
|
220
|
+
def scheme_handlers
|
221
|
+
@agent.scheme_handlers
|
222
|
+
end
|
223
|
+
|
224
|
+
def scheme_handlers= scheme_handlers
|
225
|
+
@agent.scheme_handlers = scheme_handlers
|
226
|
+
end
|
129
227
|
|
130
228
|
# A hash of custom request headers
|
131
|
-
|
229
|
+
def request_headers
|
230
|
+
@agent.request_headers
|
231
|
+
end
|
232
|
+
|
233
|
+
def request_headers= request_headers
|
234
|
+
@agent.request_headers = request_headers
|
235
|
+
end
|
132
236
|
|
133
237
|
# Proxy settings
|
134
238
|
attr_reader :proxy_addr
|
@@ -139,22 +243,29 @@ class Mechanize
|
|
139
243
|
# The HTML parser to be used when parsing documents
|
140
244
|
attr_accessor :html_parser
|
141
245
|
|
142
|
-
attr_reader :
|
246
|
+
attr_reader :agent # :nodoc:
|
247
|
+
|
248
|
+
def history
|
249
|
+
@agent.history
|
250
|
+
end
|
143
251
|
|
144
|
-
attr_reader :history
|
145
252
|
attr_reader :pluggable_parser
|
146
253
|
|
147
254
|
# A list of hooks to call after retrieving a response. Hooks are called with
|
148
255
|
# the agent and the response returned.
|
149
256
|
|
150
|
-
|
257
|
+
def post_connect_hooks
|
258
|
+
@agent.post_connect_hooks
|
259
|
+
end
|
151
260
|
|
152
261
|
# A list of hooks to call before making a request. Hooks are called with
|
153
262
|
# the agent and the request to be performed.
|
154
263
|
|
155
|
-
|
264
|
+
def pre_connect_hooks
|
265
|
+
@agent.pre_connect_hooks
|
266
|
+
end
|
156
267
|
|
157
|
-
alias
|
268
|
+
alias follow_redirect? redirect_ok
|
158
269
|
|
159
270
|
@html_parser = Nokogiri::HTML
|
160
271
|
class << self
|
@@ -167,50 +278,27 @@ class Mechanize
|
|
167
278
|
end
|
168
279
|
end
|
169
280
|
|
170
|
-
|
171
|
-
|
172
|
-
@cookie_jar = CookieJar.new
|
173
|
-
@log = nil
|
174
|
-
@open_timeout = nil
|
175
|
-
@read_timeout = nil
|
176
|
-
@user_agent = AGENT_ALIASES['Mechanize']
|
177
|
-
@watch_for_set = nil
|
178
|
-
@history_added = nil
|
179
|
-
@ca_file = nil # OpenSSL server certificate file
|
180
|
-
|
181
|
-
# callback for OpenSSL errors while verifying the server certificate
|
182
|
-
# chain, can be used for debugging or to ignore errors by always
|
183
|
-
# returning _true_
|
184
|
-
# specifying nil uses the default method that was valid when the SSL was created
|
185
|
-
@verify_callback = nil
|
186
|
-
@cert = nil # OpenSSL Certificate
|
187
|
-
@key = nil # OpenSSL Private Key
|
188
|
-
@pass = nil # OpenSSL Password
|
189
|
-
@redirect_ok = true
|
190
|
-
@gzip_enabled = true
|
281
|
+
# A default encoding name used when parsing HTML. When set it is
|
282
|
+
# used after any other encoding. The default is nil.
|
191
283
|
|
192
|
-
|
193
|
-
@history = Mechanize::History.new
|
194
|
-
@pluggable_parser = PluggableParser.new
|
284
|
+
attr_accessor :default_encoding
|
195
285
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
@digest = nil # DigestAuth Digest
|
200
|
-
@digest_auth = Net::HTTP::DigestAuth.new
|
201
|
-
@auth_hash = {} # Keep track of urls for sending auth
|
202
|
-
@request_headers= {} # A hash of request headers to be used
|
286
|
+
# Overrides the encodings given by the HTTP server and the HTML page with
|
287
|
+
# the default_encoding when set to true.
|
288
|
+
attr_accessor :force_default_encoding
|
203
289
|
|
204
|
-
|
290
|
+
def initialize
|
291
|
+
@agent = Mechanize::HTTP::Agent.new
|
292
|
+
@agent.context = self
|
205
293
|
|
206
|
-
|
207
|
-
@
|
294
|
+
# attr_accessors
|
295
|
+
@agent.user_agent = AGENT_ALIASES['Mechanize']
|
296
|
+
@watch_for_set = nil
|
297
|
+
@history_added = nil
|
208
298
|
|
209
|
-
|
210
|
-
@
|
299
|
+
# attr_readers
|
300
|
+
@pluggable_parser = PluggableParser.new
|
211
301
|
|
212
|
-
# Connection Cache & Keep alive
|
213
|
-
@keep_alive_time = 300
|
214
302
|
@keep_alive = true
|
215
303
|
|
216
304
|
# Proxy
|
@@ -219,71 +307,49 @@ class Mechanize
|
|
219
307
|
@proxy_user = nil
|
220
308
|
@proxy_pass = nil
|
221
309
|
|
222
|
-
@
|
223
|
-
h[scheme] = lambda { |link, page|
|
224
|
-
raise Mechanize::UnsupportedSchemeError, scheme
|
225
|
-
}
|
226
|
-
}
|
310
|
+
@html_parser = self.class.html_parser
|
227
311
|
|
228
|
-
@
|
229
|
-
@
|
230
|
-
@scheme_handlers['relative'] = @scheme_handlers['http']
|
231
|
-
@scheme_handlers['file'] = @scheme_handlers['http']
|
312
|
+
@default_encoding = nil
|
313
|
+
@force_default_encoding = false
|
232
314
|
|
233
|
-
|
234
|
-
@post_connect_hooks = []
|
315
|
+
yield self if block_given?
|
235
316
|
|
236
|
-
@
|
317
|
+
@agent.set_proxy @proxy_addr, @proxy_port, @proxy_user, @proxy_pass
|
318
|
+
@agent.set_http
|
319
|
+
end
|
237
320
|
|
238
|
-
|
321
|
+
def max_history
|
322
|
+
@agent.history.max_size
|
323
|
+
end
|
239
324
|
|
240
|
-
|
241
|
-
|
242
|
-
else
|
243
|
-
set_http
|
244
|
-
end
|
325
|
+
def max_history= length
|
326
|
+
@agent.history.max_size = length
|
245
327
|
end
|
246
328
|
|
247
|
-
def max_history=(length); @history.max_size = length end
|
248
|
-
def max_history; @history.max_size end
|
249
329
|
def log=(l); self.class.log = l end
|
250
330
|
def log; self.class.log end
|
251
331
|
|
252
|
-
|
253
|
-
|
254
|
-
def set_proxy(addr, port, user = nil, pass = nil)
|
255
|
-
proxy = URI.parse "http://#{addr}"
|
256
|
-
proxy.port = port
|
257
|
-
proxy.user = user if user
|
258
|
-
proxy.password = pass if pass
|
259
|
-
|
260
|
-
set_http proxy
|
261
|
-
|
262
|
-
nil
|
332
|
+
def user_agent= user_agent
|
333
|
+
@agent.user_agent = user_agent
|
263
334
|
end
|
264
335
|
|
265
|
-
|
266
|
-
@webrobots = nil if value != @user_agent
|
267
|
-
@user_agent = value
|
268
|
-
end
|
269
|
-
|
270
|
-
# Set the user agent for the Mechanize object.
|
271
|
-
# See AGENT_ALIASES
|
336
|
+
# Set the user agent for the Mechanize object. See AGENT_ALIASES
|
272
337
|
def user_agent_alias=(al)
|
273
|
-
|
274
|
-
raise(ArgumentError, "unknown agent alias")
|
338
|
+
self.user_agent = AGENT_ALIASES[al] ||
|
339
|
+
raise(ArgumentError, "unknown agent alias #{al.inspect}")
|
275
340
|
end
|
276
341
|
|
277
342
|
# Returns a list of cookies stored in the cookie jar.
|
278
343
|
def cookies
|
279
|
-
@cookie_jar.to_a
|
344
|
+
@agent.cookie_jar.to_a
|
280
345
|
end
|
281
346
|
|
282
347
|
# Sets the user and password to be used for authentication.
|
283
348
|
def auth(user, password)
|
284
|
-
@user
|
285
|
-
@password
|
349
|
+
@agent.user = user
|
350
|
+
@agent.password = password
|
286
351
|
end
|
352
|
+
|
287
353
|
alias :basic_auth :auth
|
288
354
|
|
289
355
|
# Fetches the URL passed in and returns a page.
|
@@ -302,13 +368,12 @@ class Mechanize
|
|
302
368
|
method = options[:verb] || method
|
303
369
|
end
|
304
370
|
|
305
|
-
|
371
|
+
referer ||=
|
306
372
|
if uri.to_s =~ %r{\Ahttps?://}
|
307
|
-
|
373
|
+
Page.new(nil, {'content-type'=>'text/html'})
|
308
374
|
else
|
309
|
-
|
375
|
+
current_page || Page.new(nil, {'content-type'=>'text/html'})
|
310
376
|
end
|
311
|
-
end
|
312
377
|
|
313
378
|
# FIXME: Huge hack so that using a URI as a referer works. I need to
|
314
379
|
# refactor everything to pass around URIs but still support
|
@@ -321,7 +386,7 @@ class Mechanize
|
|
321
386
|
|
322
387
|
# fetch the page
|
323
388
|
headers ||= {}
|
324
|
-
page =
|
389
|
+
page = @agent.fetch uri, method, headers, parameters, referer
|
325
390
|
add_to_history(page)
|
326
391
|
yield page if block_given?
|
327
392
|
page
|
@@ -342,7 +407,7 @@ class Mechanize
|
|
342
407
|
# delete('http://example/', {'q' => 'foo'}, {})
|
343
408
|
#
|
344
409
|
def delete(uri, query_params = {}, headers = {})
|
345
|
-
page =
|
410
|
+
page = @agent.fetch(uri, :delete, headers, query_params)
|
346
411
|
add_to_history(page)
|
347
412
|
page
|
348
413
|
end
|
@@ -354,7 +419,7 @@ class Mechanize
|
|
354
419
|
#
|
355
420
|
def head(uri, query_params = {}, headers = {})
|
356
421
|
# fetch the page
|
357
|
-
page =
|
422
|
+
page = @agent.fetch(uri, :head, headers, query_params)
|
358
423
|
yield page if block_given?
|
359
424
|
page
|
360
425
|
end
|
@@ -371,12 +436,18 @@ class Mechanize
|
|
371
436
|
case link
|
372
437
|
when Page::Link
|
373
438
|
referer = link.page || current_page()
|
374
|
-
if robots
|
439
|
+
if @agent.robots
|
375
440
|
if (referer.is_a?(Page) && referer.parser.nofollow?) || link.rel?('nofollow')
|
376
441
|
raise RobotsDisallowedError.new(link.href)
|
377
442
|
end
|
378
443
|
end
|
379
|
-
|
444
|
+
if link.rel?('noreferrer')
|
445
|
+
href = @agent.resolve(link.href, link.page || current_page)
|
446
|
+
referer = Page.new(nil, {'content-type'=>'text/html'})
|
447
|
+
else
|
448
|
+
href = link.href
|
449
|
+
end
|
450
|
+
get href, [], referer
|
380
451
|
when String, Regexp
|
381
452
|
if real_link = page.link_with(:text => link)
|
382
453
|
click real_link
|
@@ -399,7 +470,7 @@ class Mechanize
|
|
399
470
|
# Equivalent to the browser back button. Returns the most recent page
|
400
471
|
# visited.
|
401
472
|
def back
|
402
|
-
@history.pop
|
473
|
+
@agent.history.pop
|
403
474
|
end
|
404
475
|
|
405
476
|
# Posts to the given URL with the request entity. The request
|
@@ -468,341 +539,27 @@ class Mechanize
|
|
468
539
|
'Content-Length' => entity.size.to_s,
|
469
540
|
}.update headers
|
470
541
|
|
471
|
-
page =
|
542
|
+
page = @agent.fetch uri, verb, headers, [entity], cur_page
|
472
543
|
add_to_history(page)
|
473
544
|
page
|
474
545
|
end
|
475
546
|
|
476
547
|
# Returns the current page loaded by Mechanize
|
477
548
|
def current_page
|
478
|
-
@
|
479
|
-
end
|
480
|
-
|
481
|
-
# Returns whether or not a url has been visited
|
482
|
-
def visited?(url)
|
483
|
-
! visited_page(url).nil?
|
549
|
+
@agent.current_page
|
484
550
|
end
|
485
551
|
|
486
552
|
# Returns a visited page for the url passed in, otherwise nil
|
487
553
|
def visited_page(url)
|
488
|
-
if url.respond_to? :href
|
489
|
-
url = url.href
|
490
|
-
end
|
491
|
-
@history.visited_page(resolve(url))
|
492
|
-
end
|
493
|
-
|
494
|
-
# Runs given block, then resets the page history as it was before. self is
|
495
|
-
# given as a parameter to the block. Returns the value of the block.
|
496
|
-
def transact
|
497
|
-
history_backup = @history.dup
|
498
|
-
begin
|
499
|
-
yield self
|
500
|
-
ensure
|
501
|
-
@history = history_backup
|
502
|
-
end
|
503
|
-
end
|
504
|
-
|
505
|
-
# Tests if this agent is allowed to access +url+, consulting the
|
506
|
-
# site's robots.txt.
|
507
|
-
def robots_allowed?(uri)
|
508
|
-
return true if uri.request_uri == '/robots.txt'
|
509
|
-
|
510
|
-
webrobots.allowed?(uri)
|
511
|
-
end
|
512
|
-
|
513
|
-
# Equivalent to !robots_allowed?(url).
|
514
|
-
def robots_disallowed?(url)
|
515
|
-
!webrobots.allowed?(url)
|
516
|
-
end
|
517
|
-
|
518
|
-
# Returns an error object if there is an error in fetching or
|
519
|
-
# parsing robots.txt of the site +url+.
|
520
|
-
def robots_error(url)
|
521
|
-
webrobots.error(url)
|
522
|
-
end
|
523
|
-
|
524
|
-
# Raises the error if there is an error in fetching or parsing
|
525
|
-
# robots.txt of the site +url+.
|
526
|
-
def robots_error!(url)
|
527
|
-
webrobots.error!(url)
|
528
|
-
end
|
554
|
+
url = url.href if url.respond_to? :href
|
529
555
|
|
530
|
-
|
531
|
-
def robots_reset(url)
|
532
|
-
webrobots.reset(url)
|
556
|
+
@agent.visited_page url
|
533
557
|
end
|
534
558
|
|
535
|
-
|
536
|
-
|
537
|
-
def connection_for uri
|
538
|
-
case uri.scheme.downcase
|
539
|
-
when 'http', 'https' then
|
540
|
-
return @http
|
541
|
-
when 'file' then
|
542
|
-
return Mechanize::FileConnection.new
|
543
|
-
end
|
544
|
-
end
|
545
|
-
|
546
|
-
def enable_gzip request
|
547
|
-
request['accept-encoding'] = if @gzip_enabled
|
548
|
-
'gzip,deflate,identity'
|
549
|
-
else
|
550
|
-
'identity'
|
551
|
-
end
|
552
|
-
end
|
553
|
-
|
554
|
-
def http_request uri, method, params = nil
|
555
|
-
case uri.scheme.downcase
|
556
|
-
when 'http', 'https' then
|
557
|
-
klass = Net::HTTP.const_get(method.to_s.capitalize)
|
558
|
-
|
559
|
-
request ||= klass.new(uri.request_uri)
|
560
|
-
request.body = params.first if params
|
561
|
-
|
562
|
-
request
|
563
|
-
when 'file' then
|
564
|
-
Mechanize::FileRequest.new uri
|
565
|
-
end
|
566
|
-
end
|
567
|
-
|
568
|
-
##
|
569
|
-
# Invokes hooks added to post_connect_hooks after a +response+ is returned.
|
570
|
-
# Yields the +agent+ and the +response+ returned to each hook.
|
571
|
-
|
572
|
-
def post_connect response # :yields: agent, response
|
573
|
-
@post_connect_hooks.each do |hook|
|
574
|
-
hook.call self, response
|
575
|
-
end
|
576
|
-
end
|
577
|
-
|
578
|
-
##
|
579
|
-
# Invokes hooks added to pre_connect_hooks before a +request+ is made.
|
580
|
-
# Yields the +agent+ and the +request+ that will be performed to each hook.
|
581
|
-
|
582
|
-
def pre_connect request # :yields: agent, request
|
583
|
-
@pre_connect_hooks.each do |hook|
|
584
|
-
hook.call self, request
|
585
|
-
end
|
586
|
-
end
|
587
|
-
|
588
|
-
def request_auth request, uri
|
589
|
-
auth_type = @auth_hash[uri.host]
|
590
|
-
|
591
|
-
return unless auth_type
|
592
|
-
|
593
|
-
case auth_type
|
594
|
-
when :basic
|
595
|
-
request.basic_auth @user, @password
|
596
|
-
when :digest, :iis_digest
|
597
|
-
uri.user = @user
|
598
|
-
uri.password = @password
|
599
|
-
|
600
|
-
iis = auth_type == :iis_digest
|
601
|
-
|
602
|
-
auth = @digest_auth.auth_header uri, @digest, request.method, iis
|
603
|
-
|
604
|
-
request['Authorization'] = auth
|
605
|
-
end
|
606
|
-
end
|
607
|
-
|
608
|
-
def request_cookies request, uri
|
609
|
-
return if @cookie_jar.empty? uri
|
610
|
-
|
611
|
-
cookies = @cookie_jar.cookies uri
|
612
|
-
|
613
|
-
return if cookies.empty?
|
614
|
-
|
615
|
-
request.add_field 'Cookie', cookies.join('; ')
|
616
|
-
end
|
617
|
-
|
618
|
-
def request_host request, uri
|
619
|
-
port = [80, 443].include?(uri.port.to_i) ? nil : uri.port
|
620
|
-
host = uri.host
|
621
|
-
|
622
|
-
request['Host'] = [host, port].compact.join ':'
|
623
|
-
end
|
624
|
-
|
625
|
-
def request_language_charset request
|
626
|
-
request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
|
627
|
-
request['accept-language'] = 'en-us,en;q=0.5'
|
628
|
-
end
|
629
|
-
|
630
|
-
# Log specified headers for the request
|
631
|
-
def request_log request
|
632
|
-
return unless log
|
633
|
-
|
634
|
-
log.info("#{request.class}: #{request.path}")
|
635
|
-
|
636
|
-
request.each_header do |k, v|
|
637
|
-
log.debug("request-header: #{k} => #{v}")
|
638
|
-
end
|
639
|
-
end
|
640
|
-
|
641
|
-
def request_add_headers request, headers = {}
|
642
|
-
@request_headers.each do |k,v|
|
643
|
-
request[k] = v
|
644
|
-
end
|
645
|
-
|
646
|
-
headers.each do |field, value|
|
647
|
-
case field
|
648
|
-
when :etag then request["ETag"] = value
|
649
|
-
when :if_modified_since then request["If-Modified-Since"] = value
|
650
|
-
when Symbol then
|
651
|
-
raise ArgumentError, "unknown header symbol #{field}"
|
652
|
-
else
|
653
|
-
request[field] = value
|
654
|
-
end
|
655
|
-
end
|
656
|
-
end
|
657
|
-
|
658
|
-
def request_referer request, uri, referer
|
659
|
-
return unless referer
|
660
|
-
return if 'https' == referer.scheme.downcase and
|
661
|
-
'https' != uri.scheme.downcase
|
662
|
-
|
663
|
-
request['Referer'] = referer
|
664
|
-
end
|
665
|
-
|
666
|
-
def request_user_agent request
|
667
|
-
request['User-Agent'] = @user_agent if @user_agent
|
668
|
-
end
|
669
|
-
|
670
|
-
def resolve(uri, referer = current_page())
|
671
|
-
uri = uri.dup if uri.is_a?(URI)
|
672
|
-
|
673
|
-
unless uri.is_a?(URI)
|
674
|
-
uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
|
675
|
-
if RUBY_VERSION >= "1.9.0"
|
676
|
-
Mechanize::Util.uri_escape(match)
|
677
|
-
else
|
678
|
-
sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
|
679
|
-
end
|
680
|
-
}
|
681
|
-
|
682
|
-
unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
|
683
|
-
escaped = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
|
684
|
-
|
685
|
-
escaped_uri = Mechanize::Util.html_unescape(
|
686
|
-
unescaped.zip(escaped).map { |x,y|
|
687
|
-
"#{WEBrick::HTTPUtils.escape(x)}#{y}"
|
688
|
-
}.join('')
|
689
|
-
)
|
690
|
-
|
691
|
-
begin
|
692
|
-
uri = URI.parse(escaped_uri)
|
693
|
-
rescue
|
694
|
-
uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
|
695
|
-
end
|
696
|
-
end
|
697
|
-
|
698
|
-
scheme = uri.relative? ? 'relative' : uri.scheme.downcase
|
699
|
-
uri = @scheme_handlers[scheme].call(uri, referer)
|
700
|
-
|
701
|
-
if referer && referer.uri
|
702
|
-
if uri.path.length == 0 && uri.relative?
|
703
|
-
uri.path = referer.uri.path
|
704
|
-
end
|
705
|
-
end
|
706
|
-
|
707
|
-
uri.path = '/' if uri.path.length == 0
|
708
|
-
|
709
|
-
if uri.relative?
|
710
|
-
raise ArgumentError, "absolute URL needed (not #{uri})" unless
|
711
|
-
referer && referer.uri
|
712
|
-
|
713
|
-
base = nil
|
714
|
-
if referer.respond_to?(:bases) && referer.parser
|
715
|
-
base = referer.bases.last
|
716
|
-
end
|
717
|
-
|
718
|
-
uri = ((base && base.uri && base.uri.absolute?) ?
|
719
|
-
base.uri :
|
720
|
-
referer.uri) + uri
|
721
|
-
uri = referer.uri + uri
|
722
|
-
# Strip initial "/.." bits from the path
|
723
|
-
uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
|
724
|
-
end
|
725
|
-
|
726
|
-
unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
|
727
|
-
raise ArgumentError, "unsupported scheme: #{uri.scheme}"
|
728
|
-
end
|
729
|
-
|
730
|
-
uri
|
731
|
-
end
|
732
|
-
|
733
|
-
def resolve_parameters uri, method, parameters
|
734
|
-
case method
|
735
|
-
when :head, :get, :delete, :trace then
|
736
|
-
if parameters and parameters.length > 0
|
737
|
-
uri.query ||= ''
|
738
|
-
uri.query << '&' if uri.query.length > 0
|
739
|
-
uri.query << Mechanize::Util.build_query_string(parameters)
|
740
|
-
end
|
741
|
-
|
742
|
-
return uri, nil
|
743
|
-
end
|
744
|
-
|
745
|
-
return uri, parameters
|
746
|
-
end
|
747
|
-
|
748
|
-
def response_cookies response, uri, page
|
749
|
-
if Mechanize::Page === page and page.body =~ /Set-Cookie/n
|
750
|
-
page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
|
751
|
-
Mechanize::Cookie.parse(uri, meta['content']) { |c|
|
752
|
-
log.debug("saved cookie: #{c}") if log
|
753
|
-
@cookie_jar.add(uri, c)
|
754
|
-
}
|
755
|
-
end
|
756
|
-
end
|
757
|
-
|
758
|
-
header_cookies = response.get_fields 'Set-Cookie'
|
759
|
-
|
760
|
-
return unless header_cookies
|
761
|
-
|
762
|
-
header_cookies.each do |cookie|
|
763
|
-
Mechanize::Cookie.parse(uri, cookie) { |c|
|
764
|
-
log.debug("saved cookie: #{c}") if log
|
765
|
-
@cookie_jar.add(uri, c)
|
766
|
-
}
|
767
|
-
end
|
768
|
-
end
|
769
|
-
|
770
|
-
def response_follow_meta_refresh response, uri, page, redirects
|
771
|
-
return unless @follow_meta_refresh
|
772
|
-
|
773
|
-
redirect_uri = nil
|
774
|
-
referer = page
|
775
|
-
|
776
|
-
if page.respond_to?(:meta) and (redirect = page.meta.first)
|
777
|
-
redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
|
778
|
-
sleep redirect.node['delay'].to_f
|
779
|
-
referer = Page.new(nil, {'content-type'=>'text/html'})
|
780
|
-
elsif refresh = response['refresh']
|
781
|
-
delay, redirect_uri = Page::Meta.parse(refresh, uri)
|
782
|
-
raise Mechanize::Error, 'Invalid refresh http header' unless delay
|
783
|
-
raise RedirectLimitReachedError.new(page, redirects) if
|
784
|
-
redirects + 1 > redirection_limit
|
785
|
-
sleep delay.to_f
|
786
|
-
end
|
787
|
-
|
788
|
-
if redirect_uri
|
789
|
-
@history.push(page, page.uri)
|
790
|
-
fetch_page(redirect_uri, :get, {}, [], referer, redirects + 1)
|
791
|
-
end
|
792
|
-
end
|
793
|
-
|
794
|
-
def response_log response
|
795
|
-
return unless log
|
796
|
-
|
797
|
-
log.info("status: #{response.class} #{response.http_version} " \
|
798
|
-
"#{response.code} #{response.message}")
|
799
|
-
|
800
|
-
response.each_header do |k, v|
|
801
|
-
log.debug("response-header: #{k} => #{v}")
|
802
|
-
end
|
803
|
-
end
|
559
|
+
# Returns whether or not a url has been visited
|
560
|
+
alias visited? visited_page
|
804
561
|
|
805
|
-
def
|
562
|
+
def parse uri, response, body
|
806
563
|
content_type = nil
|
807
564
|
|
808
565
|
unless response['Content-Type'].nil?
|
@@ -811,167 +568,38 @@ class Mechanize
|
|
811
568
|
end
|
812
569
|
|
813
570
|
# Find our pluggable parser
|
814
|
-
parser_klass = @pluggable_parser.parser
|
571
|
+
parser_klass = @pluggable_parser.parser content_type
|
815
572
|
|
816
|
-
parser_klass.new
|
573
|
+
parser_klass.new uri, response, body, response.code do |parser|
|
817
574
|
parser.mech = self if parser.respond_to? :mech=
|
818
|
-
if @watch_for_set and parser.respond_to?(:watch_for_set=)
|
819
|
-
parser.watch_for_set = @watch_for_set
|
820
|
-
end
|
821
|
-
}
|
822
|
-
end
|
823
|
-
|
824
|
-
def response_read response, request
|
825
|
-
body = StringIO.new
|
826
|
-
body.set_encoding Encoding::BINARY if body.respond_to? :set_encoding
|
827
|
-
total = 0
|
828
575
|
|
829
|
-
|
830
|
-
|
831
|
-
body.write(part)
|
832
|
-
log.debug("Read #{part.length} bytes (#{total} total)") if log
|
833
|
-
}
|
834
|
-
|
835
|
-
body.rewind
|
836
|
-
|
837
|
-
raise Mechanize::ResponseCodeError, response if
|
838
|
-
Net::HTTPUnknownResponse === response
|
839
|
-
|
840
|
-
content_length = response.content_length
|
841
|
-
|
842
|
-
unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
|
843
|
-
raise EOFError, "Content-Length (#{content_length}) does not match " \
|
844
|
-
"response body length (#{body.length})" if
|
845
|
-
content_length and content_length != body.length
|
846
|
-
end
|
847
|
-
|
848
|
-
case response['Content-Encoding']
|
849
|
-
when nil, 'none', '7bit' then
|
850
|
-
body.string
|
851
|
-
when 'deflate' then
|
852
|
-
log.debug('deflate body') if log
|
853
|
-
|
854
|
-
if content_length > 0 or body.length > 0 then
|
855
|
-
begin
|
856
|
-
Zlib::Inflate.inflate body.string
|
857
|
-
rescue Zlib::BufError, Zlib::DataError
|
858
|
-
log.error('Unable to inflate page, retrying with raw deflate') if log
|
859
|
-
begin
|
860
|
-
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.string)
|
861
|
-
rescue Zlib::BufError, Zlib::DataError
|
862
|
-
log.error("unable to inflate page: #{$!}") if log
|
863
|
-
''
|
864
|
-
end
|
865
|
-
end
|
866
|
-
end
|
867
|
-
when 'gzip', 'x-gzip' then
|
868
|
-
log.debug('gzip body') if log
|
869
|
-
|
870
|
-
if content_length > 0 or body.length > 0 then
|
871
|
-
begin
|
872
|
-
zio = Zlib::GzipReader.new body
|
873
|
-
zio.read
|
874
|
-
rescue Zlib::BufError, Zlib::GzipFile::Error
|
875
|
-
log.error('Unable to gunzip body, trying raw inflate') if log
|
876
|
-
body.rewind
|
877
|
-
body.read 10
|
878
|
-
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
|
879
|
-
rescue Zlib::DataError
|
880
|
-
log.error("unable to gunzip page: #{$!}") if log
|
881
|
-
''
|
882
|
-
ensure
|
883
|
-
zio.close if zio and not zio.closed?
|
884
|
-
end
|
885
|
-
end
|
886
|
-
else
|
887
|
-
raise Mechanize::Error,
|
888
|
-
"Unsupported Content-Encoding: #{response['Content-Encoding']}"
|
576
|
+
parser.watch_for_set = @watch_for_set if
|
577
|
+
@watch_for_set and parser.respond_to?(:watch_for_set=)
|
889
578
|
end
|
890
579
|
end
|
891
580
|
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
end
|
901
|
-
|
902
|
-
log.info("follow redirect to: #{response['Location']}") if log
|
903
|
-
|
904
|
-
from_uri = page.uri
|
905
|
-
|
906
|
-
raise RedirectLimitReachedError.new(page, redirects) if
|
907
|
-
redirects + 1 > redirection_limit
|
908
|
-
|
909
|
-
redirect_method = method == :head ? :head : :get
|
910
|
-
|
911
|
-
page = fetch_page(response['Location'].to_s, redirect_method, {}, [],
|
912
|
-
page, redirects + 1)
|
913
|
-
|
914
|
-
@history.push(page, from_uri)
|
915
|
-
|
916
|
-
return page
|
917
|
-
end
|
918
|
-
|
919
|
-
def response_authenticate(response, page, uri, request, headers, params,
|
920
|
-
referer)
|
921
|
-
raise ResponseCodeError, page unless @user || @password
|
922
|
-
raise ResponseCodeError, page if @auth_hash.has_key?(uri.host)
|
923
|
-
|
924
|
-
if response['www-authenticate'] =~ /Digest/i
|
925
|
-
@auth_hash[uri.host] = :digest
|
926
|
-
if response['server'] =~ /Microsoft-IIS/
|
927
|
-
@auth_hash[uri.host] = :iis_digest
|
928
|
-
end
|
929
|
-
@digest = response['www-authenticate']
|
930
|
-
else
|
931
|
-
@auth_hash[uri.host] = :basic
|
581
|
+
# Runs given block, then resets the page history as it was before. self is
|
582
|
+
# given as a parameter to the block. Returns the value of the block.
|
583
|
+
def transact
|
584
|
+
history_backup = @agent.history.dup
|
585
|
+
begin
|
586
|
+
yield self
|
587
|
+
ensure
|
588
|
+
@agent.history = history_backup
|
932
589
|
end
|
933
|
-
|
934
|
-
fetch_page(uri, request.method.downcase.to_sym, headers, params, referer)
|
935
590
|
end
|
936
591
|
|
937
|
-
|
938
|
-
|
939
|
-
def webrobots_http_get(uri)
|
940
|
-
get_file(uri)
|
941
|
-
rescue Mechanize::ResponseCodeError => e
|
942
|
-
return '' if e.response_code == '404'
|
943
|
-
raise e
|
592
|
+
def robots
|
593
|
+
@agent.robots
|
944
594
|
end
|
945
595
|
|
946
|
-
def
|
947
|
-
@
|
596
|
+
def robots= enabled
|
597
|
+
@agent.robots = enabled
|
948
598
|
end
|
949
599
|
|
950
|
-
|
951
|
-
@http = Net::HTTP::Persistent.new 'mechanize', proxy
|
952
|
-
|
953
|
-
@http.keep_alive = @keep_alive_time
|
954
|
-
|
955
|
-
@http.ca_file = @ca_file
|
956
|
-
@http.verify_callback = @verify_callback
|
957
|
-
|
958
|
-
if @cert and @key then
|
959
|
-
cert = if OpenSSL::X509::Certificate === @cert then
|
960
|
-
@cert
|
961
|
-
else
|
962
|
-
OpenSSL::X509::Certificate.new ::File.read @cert
|
963
|
-
end
|
964
|
-
|
965
|
-
key = if OpenSSL::PKey::PKey === @key then
|
966
|
-
@key
|
967
|
-
else
|
968
|
-
OpenSSL::PKey::RSA.new ::File.read(@key), @pass
|
969
|
-
end
|
600
|
+
alias :page :current_page
|
970
601
|
|
971
|
-
|
972
|
-
@http.private_key = key
|
973
|
-
end
|
974
|
-
end
|
602
|
+
private
|
975
603
|
|
976
604
|
def post_form(uri, form, headers = {})
|
977
605
|
cur_page = form.page || current_page ||
|
@@ -987,99 +615,16 @@ class Mechanize
|
|
987
615
|
}.merge headers
|
988
616
|
|
989
617
|
# fetch the page
|
990
|
-
page =
|
618
|
+
page = @agent.fetch uri, :post, headers, [request_data], cur_page
|
991
619
|
add_to_history(page)
|
992
620
|
page
|
993
621
|
end
|
994
622
|
|
995
|
-
# uri is an absolute URI
|
996
|
-
def fetch_page uri, method = :get, headers = {}, params = [],
|
997
|
-
referer = current_page, redirects = 0
|
998
|
-
referer_uri = referer ? referer.uri : nil
|
999
|
-
|
1000
|
-
uri = resolve uri, referer
|
1001
|
-
|
1002
|
-
uri, params = resolve_parameters uri, method, params
|
1003
|
-
|
1004
|
-
request = http_request uri, method, params
|
1005
|
-
|
1006
|
-
connection = connection_for uri
|
1007
|
-
|
1008
|
-
request_auth request, uri
|
1009
|
-
|
1010
|
-
enable_gzip request
|
1011
|
-
|
1012
|
-
request_language_charset request
|
1013
|
-
request_cookies request, uri
|
1014
|
-
request_host request, uri
|
1015
|
-
request_referer request, uri, referer_uri
|
1016
|
-
request_user_agent request
|
1017
|
-
request_add_headers request, headers
|
1018
|
-
|
1019
|
-
pre_connect request
|
1020
|
-
|
1021
|
-
# Consult robots.txt
|
1022
|
-
if robots && uri.is_a?(URI::HTTP)
|
1023
|
-
robots_allowed?(uri) or raise RobotsDisallowedError.new(uri)
|
1024
|
-
end
|
1025
|
-
|
1026
|
-
# Add If-Modified-Since if page is in history
|
1027
|
-
if (page = visited_page(uri)) and page.response['Last-Modified']
|
1028
|
-
request['If-Modified-Since'] = page.response['Last-Modified']
|
1029
|
-
end if(@conditional_requests)
|
1030
|
-
|
1031
|
-
# Specify timeouts if given
|
1032
|
-
connection.open_timeout = @open_timeout if @open_timeout
|
1033
|
-
connection.read_timeout = @read_timeout if @read_timeout
|
1034
|
-
|
1035
|
-
request_log request
|
1036
|
-
|
1037
|
-
response_body = nil
|
1038
|
-
|
1039
|
-
# Send the request
|
1040
|
-
response = connection.request(uri, request) { |res|
|
1041
|
-
response_log res
|
1042
|
-
|
1043
|
-
response_body = response_read res, request
|
1044
|
-
|
1045
|
-
res
|
1046
|
-
}
|
1047
|
-
|
1048
|
-
post_connect response
|
1049
|
-
|
1050
|
-
page = response_parse response, response_body, uri
|
1051
|
-
|
1052
|
-
response_cookies response, uri, page
|
1053
|
-
|
1054
|
-
meta = response_follow_meta_refresh response, uri, page, redirects
|
1055
|
-
return meta if meta
|
1056
|
-
|
1057
|
-
case response
|
1058
|
-
when Net::HTTPSuccess
|
1059
|
-
if robots && page.is_a?(Page)
|
1060
|
-
page.parser.noindex? and raise RobotsDisallowedError.new(uri)
|
1061
|
-
end
|
1062
|
-
|
1063
|
-
page
|
1064
|
-
when Mechanize::FileResponse
|
1065
|
-
page
|
1066
|
-
when Net::HTTPNotModified
|
1067
|
-
log.debug("Got cached page") if log
|
1068
|
-
visited_page(uri) || page
|
1069
|
-
when Net::HTTPRedirection
|
1070
|
-
response_redirect response, method, page, redirects
|
1071
|
-
when Net::HTTPUnauthorized
|
1072
|
-
response_authenticate(response, page, uri, request, headers, params,
|
1073
|
-
referer)
|
1074
|
-
else
|
1075
|
-
raise ResponseCodeError.new(page), "Unhandled response"
|
1076
|
-
end
|
1077
|
-
end
|
1078
|
-
|
1079
623
|
def add_to_history(page)
|
1080
|
-
@history.push(page, resolve(page.uri))
|
1081
|
-
history_added.call(page) if history_added
|
624
|
+
@agent.history.push(page, @agent.resolve(page.uri))
|
625
|
+
@history_added.call(page) if @history_added
|
1082
626
|
end
|
627
|
+
|
1083
628
|
end
|
1084
629
|
|
1085
630
|
require 'mechanize/content_type_error'
|
@@ -1091,6 +636,8 @@ require 'mechanize/file_request'
|
|
1091
636
|
require 'mechanize/file_response'
|
1092
637
|
require 'mechanize/form'
|
1093
638
|
require 'mechanize/history'
|
639
|
+
require 'mechanize/http'
|
640
|
+
require 'mechanize/http/agent'
|
1094
641
|
require 'mechanize/page'
|
1095
642
|
require 'mechanize/inspect'
|
1096
643
|
require 'mechanize/monkey_patch'
|
@@ -1098,6 +645,7 @@ require 'mechanize/pluggable_parsers'
|
|
1098
645
|
require 'mechanize/redirect_limit_reached_error'
|
1099
646
|
require 'mechanize/redirect_not_get_or_head_error'
|
1100
647
|
require 'mechanize/response_code_error'
|
648
|
+
require 'mechanize/response_read_error'
|
1101
649
|
require 'mechanize/robots_disallowed_error'
|
1102
650
|
require 'mechanize/unsupported_scheme_error'
|
1103
651
|
require 'mechanize/util'
|