mechanize 0.6.11 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/CHANGELOG.txt +8 -0
- data/Manifest.txt +31 -22
- data/lib/mechanize.rb +2 -652
- data/lib/www/mechanize.rb +635 -0
- data/lib/www/mechanize/content_type_error.rb +16 -0
- data/lib/www/mechanize/cookie.rb +64 -0
- data/lib/{mechanize/cookie.rb → www/mechanize/cookie_jar.rb} +0 -60
- data/lib/www/mechanize/file.rb +73 -0
- data/lib/www/mechanize/file_saver.rb +39 -0
- data/lib/{mechanize → www/mechanize}/form.rb +119 -137
- data/lib/www/mechanize/form/button.rb +8 -0
- data/lib/www/mechanize/form/check_box.rb +13 -0
- data/lib/www/mechanize/form/field.rb +28 -0
- data/lib/www/mechanize/form/file_upload.rb +24 -0
- data/lib/www/mechanize/form/image_button.rb +23 -0
- data/lib/www/mechanize/form/multi_select_list.rb +69 -0
- data/lib/www/mechanize/form/option.rb +51 -0
- data/lib/www/mechanize/form/radio_button.rb +38 -0
- data/lib/www/mechanize/form/select_list.rb +41 -0
- data/lib/www/mechanize/headers.rb +12 -0
- data/lib/{mechanize → www/mechanize}/history.rb +0 -0
- data/lib/{mechanize → www/mechanize}/inspect.rb +21 -28
- data/lib/{mechanize → www/mechanize}/list.rb +0 -0
- data/lib/{mechanize → www/mechanize}/monkey_patch.rb +19 -0
- data/lib/www/mechanize/page.rb +121 -0
- data/lib/www/mechanize/page/base.rb +10 -0
- data/lib/www/mechanize/page/frame.rb +22 -0
- data/lib/www/mechanize/page/link.rb +50 -0
- data/lib/www/mechanize/page/meta.rb +10 -0
- data/lib/www/mechanize/pluggable_parsers.rb +93 -0
- data/lib/{mechanize/errors.rb → www/mechanize/response_code_error.rb} +1 -13
- data/test/{test_includes.rb → helper.rb} +4 -18
- data/test/{test_servlets.rb → servlets.rb} +0 -0
- data/test/tc_authenticate.rb +1 -8
- data/test/tc_bad_links.rb +3 -10
- data/test/tc_blank_form.rb +1 -8
- data/test/tc_checkboxes.rb +1 -8
- data/test/tc_cookie_class.rb +1 -6
- data/test/tc_cookie_jar.rb +1 -7
- data/test/tc_cookies.rb +10 -17
- data/test/tc_encoded_links.rb +5 -12
- data/test/tc_errors.rb +4 -11
- data/test/tc_follow_meta.rb +1 -8
- data/test/tc_form_action.rb +6 -14
- data/test/tc_form_as_hash.rb +1 -9
- data/test/tc_form_button.rb +5 -8
- data/test/tc_form_no_inputname.rb +1 -8
- data/test/tc_forms.rb +16 -24
- data/test/tc_frames.rb +3 -10
- data/test/tc_gzipping.rb +2 -9
- data/test/tc_history.rb +5 -12
- data/test/tc_html_unscape_forms.rb +8 -15
- data/test/tc_if_modified_since.rb +1 -6
- data/test/tc_keep_alive.rb +1 -8
- data/test/tc_links.rb +12 -19
- data/test/tc_mech.rb +26 -34
- data/test/{test_mechanize_file.rb → tc_mechanize_file.rb} +1 -6
- data/test/tc_multi_select.rb +10 -17
- data/test/tc_no_attributes.rb +1 -8
- data/test/tc_page.rb +3 -10
- data/test/tc_pluggable_parser.rb +8 -15
- data/test/tc_post_form.rb +3 -10
- data/test/tc_pretty_print.rb +3 -10
- data/test/tc_radiobutton.rb +2 -9
- data/test/tc_referer.rb +13 -20
- data/test/tc_relative_links.rb +1 -8
- data/test/tc_response_code.rb +14 -21
- data/test/tc_save_file.rb +1 -9
- data/test/tc_select.rb +3 -10
- data/test/tc_select_all.rb +2 -10
- data/test/tc_select_none.rb +2 -10
- data/test/tc_select_noopts.rb +2 -9
- data/test/tc_set_fields.rb +2 -9
- data/test/tc_ssl_server.rb +5 -12
- data/test/tc_subclass.rb +2 -9
- data/test/tc_textarea.rb +2 -9
- data/test/tc_upload.rb +2 -9
- data/test/test_all.rb +4 -43
- metadata +96 -80
- data/lib/mechanize/form_elements.rb +0 -254
- data/lib/mechanize/net-overrides/net/http.rb +0 -2107
- data/lib/mechanize/net-overrides/net/https.rb +0 -172
- data/lib/mechanize/net-overrides/net/protocol.rb +0 -380
- data/lib/mechanize/page.rb +0 -138
- data/lib/mechanize/page_elements.rb +0 -77
- data/lib/mechanize/parsers/rexml_page.rb +0 -35
- data/lib/mechanize/pluggable_parsers.rb +0 -204
- data/lib/mechanize/rexml.rb +0 -236
- data/setup.rb +0 -1585
- data/test/tc_proxy.rb +0 -25
- data/test/tc_watches.rb +0 -32
data/CHANGELOG.txt
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
= Mechanize CHANGELOG
|
2
2
|
|
3
|
+
== 0.7.0
|
4
|
+
|
5
|
+
* Removed Ruby 1.8.2 support
|
6
|
+
* Changed parser to lazily parse links
|
7
|
+
* Lazily parsing document
|
8
|
+
* Adding verify_callback for SSL requests. Thanks Mike Dalessio!
|
9
|
+
* Fixed a bug with Accept-Language header. Thanks Bill Siggelkow.
|
10
|
+
|
3
11
|
== 0.6.11
|
4
12
|
|
5
13
|
* Detecting single quotes in meta redirects.
|
data/Manifest.txt
CHANGED
@@ -13,28 +13,40 @@ eg/proxy_req.rb
|
|
13
13
|
eg/rubyforge.rb
|
14
14
|
eg/spider.rb
|
15
15
|
lib/mechanize.rb
|
16
|
-
lib/mechanize
|
17
|
-
lib/mechanize/
|
18
|
-
lib/mechanize/
|
19
|
-
lib/mechanize/
|
20
|
-
lib/mechanize/
|
21
|
-
lib/mechanize/
|
22
|
-
lib/mechanize/
|
23
|
-
lib/mechanize/
|
24
|
-
lib/mechanize/
|
25
|
-
lib/mechanize/
|
26
|
-
lib/mechanize/
|
27
|
-
lib/mechanize/
|
28
|
-
lib/mechanize/
|
29
|
-
lib/mechanize/
|
30
|
-
lib/mechanize/
|
31
|
-
lib/mechanize/
|
32
|
-
|
16
|
+
lib/www/mechanize.rb
|
17
|
+
lib/www/mechanize/content_type_error.rb
|
18
|
+
lib/www/mechanize/cookie.rb
|
19
|
+
lib/www/mechanize/cookie_jar.rb
|
20
|
+
lib/www/mechanize/file.rb
|
21
|
+
lib/www/mechanize/file_saver.rb
|
22
|
+
lib/www/mechanize/form.rb
|
23
|
+
lib/www/mechanize/form/button.rb
|
24
|
+
lib/www/mechanize/form/check_box.rb
|
25
|
+
lib/www/mechanize/form/field.rb
|
26
|
+
lib/www/mechanize/form/file_upload.rb
|
27
|
+
lib/www/mechanize/form/image_button.rb
|
28
|
+
lib/www/mechanize/form/multi_select_list.rb
|
29
|
+
lib/www/mechanize/form/option.rb
|
30
|
+
lib/www/mechanize/form/radio_button.rb
|
31
|
+
lib/www/mechanize/form/select_list.rb
|
32
|
+
lib/www/mechanize/headers.rb
|
33
|
+
lib/www/mechanize/history.rb
|
34
|
+
lib/www/mechanize/inspect.rb
|
35
|
+
lib/www/mechanize/list.rb
|
36
|
+
lib/www/mechanize/monkey_patch.rb
|
37
|
+
lib/www/mechanize/page.rb
|
38
|
+
lib/www/mechanize/page/base.rb
|
39
|
+
lib/www/mechanize/page/frame.rb
|
40
|
+
lib/www/mechanize/page/link.rb
|
41
|
+
lib/www/mechanize/page/meta.rb
|
42
|
+
lib/www/mechanize/pluggable_parsers.rb
|
43
|
+
lib/www/mechanize/response_code_error.rb
|
33
44
|
test/data/htpasswd
|
34
45
|
test/data/server.crt
|
35
46
|
test/data/server.csr
|
36
47
|
test/data/server.key
|
37
48
|
test/data/server.pem
|
49
|
+
test/helper.rb
|
38
50
|
test/htdocs/alt_text.html
|
39
51
|
test/htdocs/bad_form_test.html
|
40
52
|
test/htdocs/button.jpg
|
@@ -73,6 +85,7 @@ test/htdocs/tc_referer.html
|
|
73
85
|
test/htdocs/tc_relative_links.html
|
74
86
|
test/htdocs/tc_textarea.html
|
75
87
|
test/htdocs/unusual______.html
|
88
|
+
test/servlets.rb
|
76
89
|
test/ssl_server.rb
|
77
90
|
test/tc_authenticate.rb
|
78
91
|
test/tc_bad_links.rb
|
@@ -97,13 +110,13 @@ test/tc_if_modified_since.rb
|
|
97
110
|
test/tc_keep_alive.rb
|
98
111
|
test/tc_links.rb
|
99
112
|
test/tc_mech.rb
|
113
|
+
test/tc_mechanize_file.rb
|
100
114
|
test/tc_multi_select.rb
|
101
115
|
test/tc_no_attributes.rb
|
102
116
|
test/tc_page.rb
|
103
117
|
test/tc_pluggable_parser.rb
|
104
118
|
test/tc_post_form.rb
|
105
119
|
test/tc_pretty_print.rb
|
106
|
-
test/tc_proxy.rb
|
107
120
|
test/tc_radiobutton.rb
|
108
121
|
test/tc_referer.rb
|
109
122
|
test/tc_relative_links.rb
|
@@ -118,8 +131,4 @@ test/tc_ssl_server.rb
|
|
118
131
|
test/tc_subclass.rb
|
119
132
|
test/tc_textarea.rb
|
120
133
|
test/tc_upload.rb
|
121
|
-
test/tc_watches.rb
|
122
134
|
test/test_all.rb
|
123
|
-
test/test_includes.rb
|
124
|
-
test/test_mechanize_file.rb
|
125
|
-
test/test_servlets.rb
|
data/lib/mechanize.rb
CHANGED
@@ -1,657 +1,7 @@
|
|
1
|
-
# Original Code:
|
2
1
|
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
|
3
|
-
#
|
4
|
-
# New Code:
|
5
|
-
# Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org)
|
2
|
+
# Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org)
|
6
3
|
#
|
7
4
|
# Please see the LICENSE file for licensing.
|
8
|
-
#
|
9
|
-
|
10
|
-
# required due to the missing get_fields method in Ruby 1.8.2
|
11
|
-
unless RUBY_VERSION > "1.8.2"
|
12
|
-
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
|
13
|
-
end
|
14
|
-
|
15
|
-
require 'net/http'
|
16
|
-
require 'net/https'
|
17
|
-
|
18
|
-
# Monkey patch for ruby 1.8.4
|
19
|
-
unless RUBY_VERSION > "1.8.4"
|
20
|
-
module Net # :nodoc:
|
21
|
-
class HTTPResponse # :nodoc:
|
22
|
-
CODE_TO_OBJ['500'] = HTTPInternalServerError
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
require 'uri'
|
28
|
-
require 'webrick/httputils'
|
29
|
-
require 'zlib'
|
30
|
-
require 'stringio'
|
31
|
-
require 'digest/md5'
|
32
|
-
require 'mechanize/monkey_patch'
|
33
|
-
require 'mechanize/cookie'
|
34
|
-
require 'mechanize/errors'
|
35
|
-
require 'mechanize/pluggable_parsers'
|
36
|
-
require 'mechanize/form'
|
37
|
-
require 'mechanize/form_elements'
|
38
|
-
require 'mechanize/history'
|
39
|
-
require 'mechanize/list'
|
40
|
-
require 'mechanize/page'
|
41
|
-
require 'mechanize/page_elements'
|
42
|
-
require 'mechanize/inspect'
|
43
|
-
|
44
|
-
module WWW
|
45
|
-
|
46
|
-
# = Synopsis
|
47
|
-
# The Mechanize library is used for automating interaction with a website. It
|
48
|
-
# can follow links, and submit forms. Form fields can be populated and
|
49
|
-
# submitted. A history of URL's is maintained and can be queried.
|
50
|
-
#
|
51
|
-
# == Example
|
52
|
-
# require 'rubygems'
|
53
|
-
# require 'mechanize'
|
54
|
-
# require 'logger'
|
55
|
-
#
|
56
|
-
# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
|
57
|
-
# agent.user_agent_alias = 'Mac Safari'
|
58
|
-
# page = agent.get("http://www.google.com/")
|
59
|
-
# search_form = page.forms.name("f").first
|
60
|
-
# search_form.fields.name("q").value = "Hello"
|
61
|
-
# search_results = agent.submit(search_form)
|
62
|
-
# puts search_results.body
|
63
|
-
class Mechanize
|
64
|
-
##
|
65
|
-
# The version of Mechanize you are using.
|
66
|
-
|
67
|
-
VERSION = '0.6.11'
|
68
|
-
|
69
|
-
##
|
70
|
-
# User Agent aliases
|
71
|
-
AGENT_ALIASES = {
|
72
|
-
'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
73
|
-
'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
74
|
-
'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
|
75
|
-
'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
|
76
|
-
'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
|
77
|
-
'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
|
78
|
-
'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
|
79
|
-
'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
|
80
|
-
'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
|
81
|
-
}
|
82
|
-
|
83
|
-
attr_accessor :cookie_jar
|
84
|
-
attr_accessor :log
|
85
|
-
attr_accessor :open_timeout, :read_timeout
|
86
|
-
attr_accessor :user_agent
|
87
|
-
attr_accessor :watch_for_set
|
88
|
-
attr_accessor :ca_file
|
89
|
-
attr_accessor :key
|
90
|
-
attr_accessor :cert
|
91
|
-
attr_accessor :pass
|
92
|
-
attr_accessor :redirect_ok
|
93
|
-
attr_accessor :keep_alive_time
|
94
|
-
attr_accessor :keep_alive
|
95
|
-
attr_accessor :conditional_requests
|
96
|
-
attr_accessor :follow_meta_refresh
|
97
|
-
|
98
|
-
attr_reader :history
|
99
|
-
attr_reader :pluggable_parser
|
100
|
-
|
101
|
-
alias :follow_redirect? :redirect_ok
|
102
|
-
|
103
|
-
@@nonce_count = -1
|
104
|
-
CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
|
105
|
-
|
106
|
-
def initialize
|
107
|
-
# attr_accessors
|
108
|
-
@cookie_jar = CookieJar.new
|
109
|
-
@log = nil
|
110
|
-
@open_timeout = nil
|
111
|
-
@read_timeout = nil
|
112
|
-
@user_agent = AGENT_ALIASES['Mechanize']
|
113
|
-
@watch_for_set = nil
|
114
|
-
@ca_file = nil
|
115
|
-
@cert = nil # OpenSSL Certificate
|
116
|
-
@key = nil # OpenSSL Private Key
|
117
|
-
@pass = nil # OpenSSL Password
|
118
|
-
@redirect_ok = true # Should we follow redirects?
|
119
|
-
|
120
|
-
# attr_readers
|
121
|
-
@history = WWW::Mechanize::History.new
|
122
|
-
@pluggable_parser = PluggableParser.new
|
123
|
-
|
124
|
-
# Auth variables
|
125
|
-
@user = nil # Auth User
|
126
|
-
@password = nil # Auth Password
|
127
|
-
@digest = nil # DigestAuth Digest
|
128
|
-
@auth_hash = {} # Keep track of urls for sending auth
|
129
|
-
|
130
|
-
# Proxy settings
|
131
|
-
@proxy_addr = nil
|
132
|
-
@proxy_pass = nil
|
133
|
-
@proxy_port = nil
|
134
|
-
@proxy_user = nil
|
135
|
-
|
136
|
-
@conditional_requests = true
|
137
|
-
|
138
|
-
@follow_meta_refresh = false
|
139
|
-
|
140
|
-
# Connection Cache & Keep alive
|
141
|
-
@connection_cache = {}
|
142
|
-
@keep_alive_time = 300
|
143
|
-
@keep_alive = true
|
144
|
-
|
145
|
-
yield self if block_given?
|
146
|
-
end
|
147
|
-
|
148
|
-
def max_history=(length); @history.max_size = length; end
|
149
|
-
def max_history; @history.max_size; end
|
150
|
-
|
151
|
-
# Sets the proxy address, port, user, and password
|
152
|
-
def set_proxy(addr, port, user = nil, pass = nil)
|
153
|
-
@proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
|
154
|
-
end
|
155
|
-
|
156
|
-
# Set the user agent for the Mechanize object.
|
157
|
-
# See AGENT_ALIASES
|
158
|
-
def user_agent_alias=(al)
|
159
|
-
self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
|
160
|
-
end
|
161
|
-
|
162
|
-
# Returns a list of cookies stored in the cookie jar.
|
163
|
-
def cookies
|
164
|
-
@cookie_jar.to_a
|
165
|
-
end
|
166
|
-
|
167
|
-
# Sets the user and password to be used for basic authentication.
|
168
|
-
def basic_auth(user, password)
|
169
|
-
auth(user, password)
|
170
|
-
end
|
171
|
-
|
172
|
-
def auth(user, password)
|
173
|
-
@user = user
|
174
|
-
@password = password
|
175
|
-
end
|
176
|
-
|
177
|
-
# Fetches the URL passed in and returns a page.
|
178
|
-
def get(url, referer=nil, &block)
|
179
|
-
cur_page = referer || current_page ||
|
180
|
-
Page.new( nil, {'content-type'=>'text/html'})
|
181
|
-
|
182
|
-
# fetch the page
|
183
|
-
abs_uri = to_absolute_uri(url, cur_page)
|
184
|
-
request = fetch_request(abs_uri)
|
185
|
-
page = fetch_page(abs_uri, request, cur_page, &block)
|
186
|
-
add_to_history(page)
|
187
|
-
page
|
188
|
-
end
|
189
|
-
|
190
|
-
# Fetch a file and return the contents of the file.
|
191
|
-
def get_file(url)
|
192
|
-
get(url).body
|
193
|
-
end
|
194
|
-
|
195
|
-
|
196
|
-
# Clicks the WWW::Mechanize::Link object passed in and returns the
|
197
|
-
# page fetched.
|
198
|
-
def click(link)
|
199
|
-
referer =
|
200
|
-
begin
|
201
|
-
link.page
|
202
|
-
rescue
|
203
|
-
nil
|
204
|
-
end
|
205
|
-
uri = to_absolute_uri(
|
206
|
-
link.attributes['href'] || link.attributes['src'] || link.href,
|
207
|
-
referer || current_page()
|
208
|
-
)
|
209
|
-
get(uri, referer)
|
210
|
-
end
|
211
|
-
|
212
|
-
# Equivalent to the browser back button. Returns the most recent page
|
213
|
-
# visited.
|
214
|
-
def back
|
215
|
-
@history.pop
|
216
|
-
end
|
217
|
-
|
218
|
-
# Posts to the given URL wht the query parameters passed in. Query
|
219
|
-
# parameters can be passed as a hash, or as an array of arrays.
|
220
|
-
# Example:
|
221
|
-
# agent.post('http://example.com/', "foo" => "bar")
|
222
|
-
# or
|
223
|
-
# agent.post('http://example.com/', [ ["foo", "bar"] ])
|
224
|
-
def post(url, query={})
|
225
|
-
node = Hpricot::Elem.new(Hpricot::STag.new('form'))
|
226
|
-
node['method'] = 'POST'
|
227
|
-
node['enctype'] = 'application/x-www-form-urlencoded'
|
228
|
-
|
229
|
-
form = Form.new(node)
|
230
|
-
query.each { |k,v|
|
231
|
-
form.fields << Field.new(k,v)
|
232
|
-
}
|
233
|
-
post_form(url, form)
|
234
|
-
end
|
235
|
-
|
236
|
-
# Submit a form with an optional button.
|
237
|
-
# Without a button:
|
238
|
-
# page = agent.get('http://example.com')
|
239
|
-
# agent.submit(page.forms.first)
|
240
|
-
# With a button
|
241
|
-
# agent.submit(page.forms.first, page.forms.first.buttons.first)
|
242
|
-
def submit(form, button=nil)
|
243
|
-
form.add_button_to_query(button) if button
|
244
|
-
uri = to_absolute_uri(form.action, form.page)
|
245
|
-
case form.method.upcase
|
246
|
-
when 'POST'
|
247
|
-
post_form(uri, form)
|
248
|
-
when 'GET'
|
249
|
-
uri.query = WWW::Mechanize.build_query_string(form.build_query)
|
250
|
-
get(uri)
|
251
|
-
else
|
252
|
-
raise "unsupported method: #{form.method.upcase}"
|
253
|
-
end
|
254
|
-
end
|
255
|
-
|
256
|
-
# Returns the current page loaded by Mechanize
|
257
|
-
def current_page
|
258
|
-
@history.last
|
259
|
-
end
|
260
|
-
|
261
|
-
# Returns whether or not a url has been visited
|
262
|
-
def visited?(url)
|
263
|
-
! visited_page(url).nil?
|
264
|
-
end
|
265
|
-
|
266
|
-
# Returns a visited page for the url passed in, otherwise nil
|
267
|
-
def visited_page(url)
|
268
|
-
if url.respond_to? :href
|
269
|
-
url = url.href
|
270
|
-
end
|
271
|
-
@history.visited_page(to_absolute_uri(url))
|
272
|
-
end
|
273
|
-
|
274
|
-
# Runs given block, then resets the page history as it was before. self is
|
275
|
-
# given as a parameter to the block. Returns the value of the block.
|
276
|
-
def transact
|
277
|
-
history_backup = @history.dup
|
278
|
-
begin
|
279
|
-
yield self
|
280
|
-
ensure
|
281
|
-
@history = history_backup
|
282
|
-
end
|
283
|
-
end
|
284
|
-
|
285
|
-
alias :page :current_page
|
286
|
-
|
287
|
-
protected
|
288
|
-
def set_headers(uri, request, cur_page)
|
289
|
-
if @keep_alive
|
290
|
-
request.add_field('Connection', 'keep-alive')
|
291
|
-
request.add_field('Keep-Alive', keep_alive_time.to_s)
|
292
|
-
else
|
293
|
-
request.add_field('Connection', 'close')
|
294
|
-
end
|
295
|
-
request.add_field('Accept-Encoding', 'gzip,identity')
|
296
|
-
request.add_field('Accept-Language', 'en-us,en;q0.5')
|
297
|
-
request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
|
298
|
-
|
299
|
-
unless @cookie_jar.empty?(uri)
|
300
|
-
cookies = @cookie_jar.cookies(uri)
|
301
|
-
cookie = cookies.length > 0 ? cookies.join("; ") : nil
|
302
|
-
if log
|
303
|
-
cookies.each do |c|
|
304
|
-
log.debug("using cookie: #{c}")
|
305
|
-
end
|
306
|
-
end
|
307
|
-
request.add_field('Cookie', cookie)
|
308
|
-
end
|
309
|
-
|
310
|
-
# Add Referer header to request
|
311
|
-
unless cur_page.uri.nil?
|
312
|
-
request.add_field('Referer', cur_page.uri.to_s)
|
313
|
-
end
|
314
|
-
|
315
|
-
# Add User-Agent header to request
|
316
|
-
request.add_field('User-Agent', @user_agent) if @user_agent
|
317
|
-
|
318
|
-
# Add If-Modified-Since if page is in history
|
319
|
-
if @conditional_requests
|
320
|
-
if( (page = visited_page(uri)) && page.response['Last-Modified'] )
|
321
|
-
request.add_field('If-Modified-Since', page.response['Last-Modified'])
|
322
|
-
end
|
323
|
-
end
|
324
|
-
|
325
|
-
if( @auth_hash[uri.host] )
|
326
|
-
case @auth_hash[uri.host]
|
327
|
-
when :basic
|
328
|
-
request.basic_auth(@user, @password)
|
329
|
-
when :digest
|
330
|
-
@digest_response ||= nil
|
331
|
-
@digest_response = self.gen_auth_header(uri,request,@digest) if @digest
|
332
|
-
request.add_field('Authorization', @digest_response) if @digest_response
|
333
|
-
end
|
334
|
-
end
|
335
|
-
|
336
|
-
request
|
337
|
-
end
|
338
|
-
|
339
|
-
def gen_auth_header(uri, request, auth_header, is_IIS = false)
|
340
|
-
@@nonce_count += 1
|
341
|
-
|
342
|
-
user = @digest_user
|
343
|
-
password = @digest_password
|
344
|
-
|
345
|
-
auth_header =~ /^(\w+) (.*)/
|
346
|
-
|
347
|
-
params = {}
|
348
|
-
$2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }
|
349
|
-
|
350
|
-
a_1 = "#{@user}:#{params['realm']}:#{@password}"
|
351
|
-
a_2 = "#{request.method}:#{uri.path}"
|
352
|
-
request_digest = ''
|
353
|
-
request_digest << Digest::MD5.hexdigest(a_1)
|
354
|
-
request_digest << ':' << params['nonce']
|
355
|
-
request_digest << ':' << ('%08x' % @@nonce_count)
|
356
|
-
request_digest << ':' << CNONCE
|
357
|
-
request_digest << ':' << params['qop']
|
358
|
-
request_digest << ':' << Digest::MD5.hexdigest(a_2)
|
359
|
-
|
360
|
-
header = ''
|
361
|
-
header << "Digest username=\"#{@user}\", "
|
362
|
-
header << "realm=\"#{params['realm']}\", "
|
363
|
-
if is_IIS then
|
364
|
-
header << "qop=\"#{params['qop']}\", "
|
365
|
-
else
|
366
|
-
header << "qop=#{params['qop']}, "
|
367
|
-
end
|
368
|
-
header << "uri=\"#{uri.path}\", "
|
369
|
-
header << "algorithm=MD5, "
|
370
|
-
header << "nonce=\"#{params['nonce']}\", "
|
371
|
-
header << "nc=#{'%08x' % @@nonce_count}, "
|
372
|
-
header << "cnonce=\"#{CNONCE}\", "
|
373
|
-
header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
|
374
|
-
|
375
|
-
return header
|
376
|
-
end
|
377
|
-
|
378
|
-
private
|
379
|
-
|
380
|
-
def to_absolute_uri(url, cur_page=current_page())
|
381
|
-
unless url.is_a? URI
|
382
|
-
url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
|
383
|
-
sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
|
384
|
-
}
|
385
|
-
|
386
|
-
url = URI.parse(
|
387
|
-
Util.html_unescape(
|
388
|
-
url.split(/%[0-9A-Fa-f]{2}|#/).zip(
|
389
|
-
url.scan(/%[0-9A-Fa-f]{2}|#/)
|
390
|
-
).map { |x,y|
|
391
|
-
"#{URI.escape(x)}#{y}"
|
392
|
-
}.join('')
|
393
|
-
)
|
394
|
-
)
|
395
|
-
end
|
396
|
-
|
397
|
-
url.path = '/' if url.path.length == 0
|
398
|
-
|
399
|
-
# construct an absolute uri
|
400
|
-
if url.relative?
|
401
|
-
raise 'no history. please specify an absolute URL' unless cur_page.uri
|
402
|
-
base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
|
403
|
-
url = ((base && base.uri && base.uri.absolute?) ?
|
404
|
-
base.uri :
|
405
|
-
cur_page.uri) + url
|
406
|
-
url = cur_page.uri + url
|
407
|
-
# Strip initial "/.." bits from the path
|
408
|
-
url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
|
409
|
-
end
|
410
|
-
|
411
|
-
return url
|
412
|
-
end
|
413
|
-
|
414
|
-
def post_form(url, form)
|
415
|
-
cur_page = form.page || current_page ||
|
416
|
-
Page.new( nil, {'content-type'=>'text/html'})
|
417
|
-
|
418
|
-
request_data = form.request_data
|
419
|
-
|
420
|
-
abs_url = to_absolute_uri(url, cur_page)
|
421
|
-
request = fetch_request(abs_url, :post)
|
422
|
-
request.add_field('Content-Type', form.enctype)
|
423
|
-
request.add_field('Content-Length', request_data.size.to_s)
|
424
|
-
|
425
|
-
log.debug("query: #{ request_data.inspect }") if log
|
426
|
-
|
427
|
-
# fetch the page
|
428
|
-
page = fetch_page(abs_url, request, cur_page, [request_data])
|
429
|
-
add_to_history(page)
|
430
|
-
page
|
431
|
-
end
|
432
|
-
|
433
|
-
# Creates a new request object based on the scheme and type
|
434
|
-
def fetch_request(uri, type = :get)
|
435
|
-
raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
|
436
|
-
if type == :get
|
437
|
-
Net::HTTP::Get.new(uri.request_uri)
|
438
|
-
else
|
439
|
-
Net::HTTP::Post.new(uri.request_uri)
|
440
|
-
end
|
441
|
-
end
|
442
|
-
|
443
|
-
# uri is an absolute URI
|
444
|
-
def fetch_page(uri, request, cur_page=current_page(), request_data=[])
|
445
|
-
raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
|
446
|
-
|
447
|
-
log.info("#{ request.class }: #{ request.path }") if log
|
448
|
-
|
449
|
-
page = nil
|
450
|
-
|
451
|
-
cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
|
452
|
-
:connection => nil,
|
453
|
-
:keep_alive_options => {},
|
454
|
-
})
|
455
|
-
http_obj = cache_obj[:connection]
|
456
|
-
if http_obj.nil? || ! http_obj.started?
|
457
|
-
http_obj = cache_obj[:connection] =
|
458
|
-
Net::HTTP.new( uri.host,
|
459
|
-
uri.port,
|
460
|
-
@proxy_addr,
|
461
|
-
@proxy_port,
|
462
|
-
@proxy_user,
|
463
|
-
@proxy_pass
|
464
|
-
)
|
465
|
-
cache_obj[:keep_alive_options] = {}
|
466
|
-
|
467
|
-
# Specify timeouts if given
|
468
|
-
http_obj.open_timeout = @open_timeout if @open_timeout
|
469
|
-
http_obj.read_timeout = @read_timeout if @read_timeout
|
470
|
-
end
|
471
|
-
|
472
|
-
if uri.scheme == 'https' && ! http_obj.started?
|
473
|
-
http_obj.use_ssl = true
|
474
|
-
http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
475
|
-
if @ca_file
|
476
|
-
http_obj.ca_file = @ca_file
|
477
|
-
http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
|
478
|
-
end
|
479
|
-
if @cert && @key
|
480
|
-
http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
|
481
|
-
http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
|
482
|
-
end
|
483
|
-
end
|
484
|
-
|
485
|
-
# If we're keeping connections alive and the last request time is too
|
486
|
-
# long ago, stop the connection. Or, if the max requests left is 1,
|
487
|
-
# reset the connection.
|
488
|
-
if @keep_alive && http_obj.started?
|
489
|
-
opts = cache_obj[:keep_alive_options]
|
490
|
-
if((opts[:timeout] &&
|
491
|
-
Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
|
492
|
-
opts[:max] && opts[:max].to_i == 1)
|
493
|
-
|
494
|
-
log.debug('Finishing stale connection') if log
|
495
|
-
http_obj.finish
|
496
|
-
|
497
|
-
end
|
498
|
-
end
|
499
|
-
|
500
|
-
http_obj.start unless http_obj.started?
|
501
|
-
|
502
|
-
request = set_headers(uri, request, cur_page)
|
503
|
-
|
504
|
-
# Log specified headers for the request
|
505
|
-
if log
|
506
|
-
request.each_header do |k, v|
|
507
|
-
log.debug("request-header: #{ k } => #{ v }")
|
508
|
-
end
|
509
|
-
end
|
510
|
-
|
511
|
-
cache_obj[:last_request_time] = Time.now.to_i
|
512
|
-
|
513
|
-
# Send the request
|
514
|
-
response = http_obj.request(request, *request_data) {|response|
|
515
|
-
|
516
|
-
body = StringIO.new
|
517
|
-
total = 0
|
518
|
-
response.read_body { |part|
|
519
|
-
total += part.length
|
520
|
-
body.write(part)
|
521
|
-
log.debug("Read #{total} bytes") if log
|
522
|
-
}
|
523
|
-
body.rewind
|
524
|
-
|
525
|
-
response.each_header { |k,v|
|
526
|
-
log.debug("response-header: #{ k } => #{ v }")
|
527
|
-
} if log
|
528
|
-
|
529
|
-
content_type = nil
|
530
|
-
unless response['Content-Type'].nil?
|
531
|
-
data = response['Content-Type'].match(/^([^;]*)/)
|
532
|
-
content_type = data[1].downcase unless data.nil?
|
533
|
-
end
|
534
|
-
|
535
|
-
response_body =
|
536
|
-
if encoding = response['Content-Encoding']
|
537
|
-
case encoding.downcase
|
538
|
-
when 'gzip'
|
539
|
-
log.debug('gunzip body') if log
|
540
|
-
Zlib::GzipReader.new(body).read
|
541
|
-
when 'x-gzip'
|
542
|
-
body.read
|
543
|
-
else
|
544
|
-
raise 'Unsupported content encoding'
|
545
|
-
end
|
546
|
-
else
|
547
|
-
body.read
|
548
|
-
end
|
549
|
-
|
550
|
-
# Find our pluggable parser
|
551
|
-
page = @pluggable_parser.parser(content_type).new(
|
552
|
-
uri,
|
553
|
-
response,
|
554
|
-
response_body,
|
555
|
-
response.code
|
556
|
-
) { |parser|
|
557
|
-
parser.mech = self if parser.respond_to? :mech=
|
558
|
-
if parser.respond_to?(:watch_for_set=) && @watch_for_set
|
559
|
-
parser.watch_for_set = @watch_for_set
|
560
|
-
end
|
561
|
-
}
|
562
|
-
|
563
|
-
}
|
564
|
-
|
565
|
-
# If the server sends back keep alive options, save them
|
566
|
-
if keep_alive_info = response['keep-alive']
|
567
|
-
keep_alive_info.split(/,\s*/).each do |option|
|
568
|
-
k, v = option.split(/=/)
|
569
|
-
cache_obj[:keep_alive_options] ||= {}
|
570
|
-
cache_obj[:keep_alive_options][k.intern] = v
|
571
|
-
end
|
572
|
-
end
|
573
|
-
|
574
|
-
(response.get_fields('Set-Cookie')||[]).each do |cookie|
|
575
|
-
Cookie::parse(uri, cookie, log) { |c|
|
576
|
-
log.debug("saved cookie: #{c}") if log
|
577
|
-
@cookie_jar.add(uri, c)
|
578
|
-
}
|
579
|
-
end
|
580
|
-
|
581
|
-
log.info("status: #{ page.code }") if log
|
582
|
-
|
583
|
-
res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
|
584
|
-
|
585
|
-
if follow_meta_refresh && page.respond_to?(:meta) &&
|
586
|
-
(redirect = page.meta.first)
|
587
|
-
return redirect.click
|
588
|
-
end
|
589
|
-
|
590
|
-
return page if res_klass <= Net::HTTPSuccess
|
591
|
-
|
592
|
-
if res_klass == Net::HTTPNotModified
|
593
|
-
log.debug("Got cached page") if log
|
594
|
-
return visited_page(uri)
|
595
|
-
elsif res_klass <= Net::HTTPRedirection
|
596
|
-
return page unless follow_redirect?
|
597
|
-
log.info("follow redirect to: #{ response['Location'] }") if log
|
598
|
-
from_uri = page.uri
|
599
|
-
abs_uri = to_absolute_uri(response['Location'].to_s, page)
|
600
|
-
page = fetch_page(abs_uri, fetch_request(abs_uri), page)
|
601
|
-
@history.push(page, from_uri)
|
602
|
-
return page
|
603
|
-
elsif res_klass <= Net::HTTPUnauthorized
|
604
|
-
raise ResponseCodeError.new(page) unless @user || @password
|
605
|
-
raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
|
606
|
-
if response['www-authenticate'] =~ /Digest/i
|
607
|
-
@auth_hash[uri.host] = :digest
|
608
|
-
@digest = response['www-authenticate']
|
609
|
-
else
|
610
|
-
@auth_hash[uri.host] = :basic
|
611
|
-
end
|
612
|
-
return fetch_page( uri,
|
613
|
-
fetch_request(uri, request.method.downcase.to_sym),
|
614
|
-
cur_page,
|
615
|
-
request_data
|
616
|
-
)
|
617
|
-
end
|
618
|
-
|
619
|
-
raise ResponseCodeError.new(page), "Unhandled response", caller
|
620
|
-
end
|
621
|
-
|
622
|
-
def self.build_query_string(parameters)
|
623
|
-
vals = []
|
624
|
-
parameters.each { |k,v|
|
625
|
-
next if k.nil?
|
626
|
-
vals <<
|
627
|
-
[WEBrick::HTTPUtils.escape_form(k),
|
628
|
-
WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
|
629
|
-
}
|
630
|
-
|
631
|
-
vals.join("&")
|
632
|
-
end
|
633
|
-
|
634
|
-
def add_to_history(page)
|
635
|
-
@history.push(page, to_absolute_uri(page.uri))
|
636
|
-
end
|
637
|
-
|
638
|
-
# :stopdoc:
|
639
|
-
class Util
|
640
|
-
def self.html_unescape(s)
|
641
|
-
return s unless s
|
642
|
-
s.gsub(/&(\w+|#[0-9]+);/) { |match|
|
643
|
-
number = case match
|
644
|
-
when /&(\w+);/
|
645
|
-
Hpricot::NamedCharacters[$1]
|
646
|
-
when /&#([0-9]+);/
|
647
|
-
$1.to_i
|
648
|
-
end
|
649
5
|
|
650
|
-
number ? ([number].pack('U') rescue match) : match
|
651
|
-
}
|
652
|
-
end
|
653
|
-
end
|
654
|
-
# :startdoc:
|
655
|
-
end
|
656
6
|
|
657
|
-
|
7
|
+
require 'www/mechanize'
|