tenderlove-mechanize 0.9.3.20090617085936
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +496 -0
- data/EXAMPLES.rdoc +171 -0
- data/FAQ.rdoc +11 -0
- data/GUIDE.rdoc +122 -0
- data/LICENSE.rdoc +340 -0
- data/Manifest.txt +169 -0
- data/README.rdoc +60 -0
- data/Rakefile +43 -0
- data/examples/flickr_upload.rb +23 -0
- data/examples/mech-dump.rb +7 -0
- data/examples/proxy_req.rb +9 -0
- data/examples/rubyforge.rb +21 -0
- data/examples/spider.rb +11 -0
- data/lib/mechanize.rb +7 -0
- data/lib/www/mechanize/chain/auth_headers.rb +80 -0
- data/lib/www/mechanize/chain/body_decoding_handler.rb +48 -0
- data/lib/www/mechanize/chain/connection_resolver.rb +78 -0
- data/lib/www/mechanize/chain/custom_headers.rb +23 -0
- data/lib/www/mechanize/chain/handler.rb +9 -0
- data/lib/www/mechanize/chain/header_resolver.rb +53 -0
- data/lib/www/mechanize/chain/parameter_resolver.rb +24 -0
- data/lib/www/mechanize/chain/post_connect_hook.rb +0 -0
- data/lib/www/mechanize/chain/pre_connect_hook.rb +22 -0
- data/lib/www/mechanize/chain/request_resolver.rb +32 -0
- data/lib/www/mechanize/chain/response_body_parser.rb +40 -0
- data/lib/www/mechanize/chain/response_header_handler.rb +50 -0
- data/lib/www/mechanize/chain/response_reader.rb +41 -0
- data/lib/www/mechanize/chain/ssl_resolver.rb +42 -0
- data/lib/www/mechanize/chain/uri_resolver.rb +77 -0
- data/lib/www/mechanize/chain.rb +34 -0
- data/lib/www/mechanize/content_type_error.rb +16 -0
- data/lib/www/mechanize/cookie.rb +72 -0
- data/lib/www/mechanize/cookie_jar.rb +191 -0
- data/lib/www/mechanize/file.rb +73 -0
- data/lib/www/mechanize/file_response.rb +62 -0
- data/lib/www/mechanize/file_saver.rb +39 -0
- data/lib/www/mechanize/form/button.rb +8 -0
- data/lib/www/mechanize/form/check_box.rb +13 -0
- data/lib/www/mechanize/form/field.rb +28 -0
- data/lib/www/mechanize/form/file_upload.rb +24 -0
- data/lib/www/mechanize/form/image_button.rb +23 -0
- data/lib/www/mechanize/form/multi_select_list.rb +69 -0
- data/lib/www/mechanize/form/option.rb +51 -0
- data/lib/www/mechanize/form/radio_button.rb +38 -0
- data/lib/www/mechanize/form/select_list.rb +45 -0
- data/lib/www/mechanize/form.rb +360 -0
- data/lib/www/mechanize/headers.rb +12 -0
- data/lib/www/mechanize/history.rb +67 -0
- data/lib/www/mechanize/inspect.rb +90 -0
- data/lib/www/mechanize/monkey_patch.rb +37 -0
- data/lib/www/mechanize/page/base.rb +10 -0
- data/lib/www/mechanize/page/frame.rb +22 -0
- data/lib/www/mechanize/page/link.rb +50 -0
- data/lib/www/mechanize/page/meta.rb +51 -0
- data/lib/www/mechanize/page.rb +176 -0
- data/lib/www/mechanize/pluggable_parsers.rb +103 -0
- data/lib/www/mechanize/redirect_limit_reached_error.rb +18 -0
- data/lib/www/mechanize/redirect_not_get_or_head_error.rb +20 -0
- data/lib/www/mechanize/response_code_error.rb +25 -0
- data/lib/www/mechanize/unsupported_scheme_error.rb +10 -0
- data/lib/www/mechanize/util.rb +76 -0
- data/lib/www/mechanize.rb +619 -0
- data/mechanize.gemspec +41 -0
- data/test/chain/test_argument_validator.rb +14 -0
- data/test/chain/test_auth_headers.rb +25 -0
- data/test/chain/test_custom_headers.rb +18 -0
- data/test/chain/test_header_resolver.rb +28 -0
- data/test/chain/test_parameter_resolver.rb +35 -0
- data/test/chain/test_request_resolver.rb +29 -0
- data/test/chain/test_response_reader.rb +24 -0
- data/test/data/htpasswd +1 -0
- data/test/data/server.crt +16 -0
- data/test/data/server.csr +12 -0
- data/test/data/server.key +15 -0
- data/test/data/server.pem +15 -0
- data/test/helper.rb +129 -0
- data/test/htdocs/alt_text.html +10 -0
- data/test/htdocs/bad_form_test.html +9 -0
- data/test/htdocs/button.jpg +0 -0
- data/test/htdocs/empty_form.html +6 -0
- data/test/htdocs/file_upload.html +26 -0
- data/test/htdocs/find_link.html +41 -0
- data/test/htdocs/form_multi_select.html +16 -0
- data/test/htdocs/form_multival.html +37 -0
- data/test/htdocs/form_no_action.html +18 -0
- data/test/htdocs/form_no_input_name.html +16 -0
- data/test/htdocs/form_select.html +16 -0
- data/test/htdocs/form_select_all.html +16 -0
- data/test/htdocs/form_select_none.html +17 -0
- data/test/htdocs/form_select_noopts.html +10 -0
- data/test/htdocs/form_set_fields.html +14 -0
- data/test/htdocs/form_test.html +188 -0
- data/test/htdocs/frame_test.html +30 -0
- data/test/htdocs/google.html +13 -0
- data/test/htdocs/iframe_test.html +16 -0
- data/test/htdocs/index.html +6 -0
- data/test/htdocs/link with space.html +5 -0
- data/test/htdocs/meta_cookie.html +11 -0
- data/test/htdocs/no_title_test.html +6 -0
- data/test/htdocs/relative/tc_relative_links.html +21 -0
- data/test/htdocs/tc_bad_links.html +5 -0
- data/test/htdocs/tc_base_link.html +8 -0
- data/test/htdocs/tc_blank_form.html +11 -0
- data/test/htdocs/tc_checkboxes.html +19 -0
- data/test/htdocs/tc_encoded_links.html +5 -0
- data/test/htdocs/tc_follow_meta.html +8 -0
- data/test/htdocs/tc_form_action.html +48 -0
- data/test/htdocs/tc_links.html +18 -0
- data/test/htdocs/tc_no_attributes.html +16 -0
- data/test/htdocs/tc_pretty_print.html +17 -0
- data/test/htdocs/tc_radiobuttons.html +17 -0
- data/test/htdocs/tc_referer.html +10 -0
- data/test/htdocs/tc_relative_links.html +19 -0
- data/test/htdocs/tc_textarea.html +23 -0
- data/test/htdocs/unusual______.html +5 -0
- data/test/servlets.rb +365 -0
- data/test/ssl_server.rb +48 -0
- data/test/test_authenticate.rb +71 -0
- data/test/test_bad_links.rb +25 -0
- data/test/test_blank_form.rb +16 -0
- data/test/test_checkboxes.rb +61 -0
- data/test/test_content_type.rb +13 -0
- data/test/test_cookie_class.rb +338 -0
- data/test/test_cookie_jar.rb +362 -0
- data/test/test_cookies.rb +123 -0
- data/test/test_encoded_links.rb +20 -0
- data/test/test_errors.rb +49 -0
- data/test/test_follow_meta.rb +108 -0
- data/test/test_form_action.rb +44 -0
- data/test/test_form_as_hash.rb +61 -0
- data/test/test_form_button.rb +38 -0
- data/test/test_form_no_inputname.rb +15 -0
- data/test/test_forms.rb +564 -0
- data/test/test_frames.rb +25 -0
- data/test/test_get_headers.rb +52 -0
- data/test/test_gzipping.rb +22 -0
- data/test/test_hash_api.rb +45 -0
- data/test/test_history.rb +142 -0
- data/test/test_history_added.rb +16 -0
- data/test/test_html_unscape_forms.rb +39 -0
- data/test/test_if_modified_since.rb +20 -0
- data/test/test_keep_alive.rb +31 -0
- data/test/test_links.rb +120 -0
- data/test/test_mech.rb +268 -0
- data/test/test_mechanize_file.rb +47 -0
- data/test/test_meta.rb +65 -0
- data/test/test_multi_select.rb +106 -0
- data/test/test_no_attributes.rb +13 -0
- data/test/test_option.rb +18 -0
- data/test/test_page.rb +119 -0
- data/test/test_pluggable_parser.rb +145 -0
- data/test/test_post_form.rb +34 -0
- data/test/test_pretty_print.rb +22 -0
- data/test/test_radiobutton.rb +75 -0
- data/test/test_redirect_limit_reached.rb +41 -0
- data/test/test_redirect_verb_handling.rb +45 -0
- data/test/test_referer.rb +39 -0
- data/test/test_relative_links.rb +40 -0
- data/test/test_request.rb +13 -0
- data/test/test_response_code.rb +52 -0
- data/test/test_save_file.rb +48 -0
- data/test/test_scheme.rb +48 -0
- data/test/test_select.rb +106 -0
- data/test/test_select_all.rb +15 -0
- data/test/test_select_none.rb +15 -0
- data/test/test_select_noopts.rb +16 -0
- data/test/test_set_fields.rb +44 -0
- data/test/test_ssl_server.rb +20 -0
- data/test/test_subclass.rb +14 -0
- data/test/test_textarea.rb +45 -0
- data/test/test_upload.rb +109 -0
- data/test/test_verbs.rb +25 -0
- metadata +314 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
|
3
|
+
module WWW
|
4
|
+
class Mechanize
|
5
|
+
class Util
|
6
|
+
CODE_DIC = {
|
7
|
+
:JIS => "ISO-2022-JP",
|
8
|
+
:EUC => "EUC-JP",
|
9
|
+
:SJIS => "SHIFT_JIS",
|
10
|
+
:UTF8 => "UTF-8", :UTF16 => "UTF-16", :UTF32 => "UTF-32"}
|
11
|
+
|
12
|
+
class << self
|
13
|
+
# Builds an application/x-www-form-urlencoded query string.
#
# +parameters+ is any enumerable that yields key/value pairs (a Hash or an
# array of two-element arrays). Pairs whose key is nil or false are skipped.
# +enc+ is accepted for interface compatibility but is not used.
#
# Returns the pairs, escaped with CGI.escape, joined with "&".
def build_query_string(parameters, enc=nil)
  pairs = []
  parameters.each { |k, v|
    next unless k
    # CGI.escape is used rather than WEBrick::HTTPUtils.escape_form because
    # WEBrick::HTTP.escape* has some problems about m17n on ruby-1.9.*.
    pairs << "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
  }
  pairs.join('&')
end
|
26
|
+
|
27
|
+
# Converts +s+ to UTF-8 when the configured HTML parser is Nokogiri::HTML;
# with any other parser +s+ is returned untouched.
#
# +code+ names the source charset; when it is omitted the charset is
# guessed with detect_charset. Under Nokogiri a nil/false +s+ yields nil.
def to_native_charset(s, code=nil)
  return s unless Mechanize.html_parser == Nokogiri::HTML
  return unless s
  source_charset = code || detect_charset(s)
  Iconv.iconv("UTF-8", source_charset, s).join("")
end
|
36
|
+
|
37
|
+
# Converts +s+ from UTF-8 to the charset named by +code+ when the
# configured HTML parser is Nokogiri::HTML; with any other parser +s+ is
# returned untouched. Under Nokogiri a nil/false +s+ yields nil.
def from_native_charset(s, code)
  return s unless Mechanize.html_parser == Nokogiri::HTML
  return unless s
  Iconv.iconv(code, "UTF-8", s).join("")
end
|
45
|
+
|
46
|
+
# Replaces HTML character references in +s+ with the characters they name.
#
# Handles named references (looked up in the configured parser's
# NamedCharacters table), decimal references such as "&#65;" and
# hexadecimal references such as "&#x41;". A reference that is unknown, or
# whose code point cannot be packed as UTF-8, is left in the text as is.
#
# Returns +s+ itself when it is nil or false.
def html_unescape(s)
  return s unless s
  s.gsub(/&(?:(\w+)|#([0-9]+)|#[xX]([0-9a-fA-F]+));/) { |match|
    # Copy the captures straight away: later regexp work would reset $1..$3.
    name, decimal, hex = $1, $2, $3

    number =
      if name
        Mechanize.html_parser::NamedCharacters[name]
      elsif decimal
        decimal.to_i
      else
        hex.to_i(16)
      end

    # pack('U') raises for out-of-range code points; keep the original text.
    number ? ([number].pack('U') rescue match) : match
  }
end
|
59
|
+
|
60
|
+
# Guesses the charset of +src+ with NKF.guess and returns its name as a
# String. A nil +src+ is treated as the document "<html></html>".
#
# On Ruby 1.9+ the guess is upcased via to_s. On older Rubies the guess is
# matched against the NKF constants and translated through CODE_DIC.
# Falls back to "ISO-8859-1" when no name can be determined.
def detect_charset(src)
  tmp = NKF.guess(src || "<html></html>")
  if RUBY_VERSION >= "1.9.0"
    enc = tmp.to_s.upcase
  else
    enc = NKF.constants.find{|c|
      NKF.const_get(c) == tmp
    }
    # find returns nil when no constant matches; calling intern on nil
    # would raise NoMethodError, so only translate a real name.
    enc = CODE_DIC[enc.intern] if enc
  end
  enc || "ISO-8859-1"
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,619 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'net/https'
|
3
|
+
require 'uri'
|
4
|
+
require 'webrick/httputils'
|
5
|
+
require 'zlib'
|
6
|
+
require 'stringio'
|
7
|
+
require 'digest/md5'
|
8
|
+
require 'fileutils'
|
9
|
+
require 'nokogiri'
|
10
|
+
require 'forwardable'
|
11
|
+
require 'iconv'
|
12
|
+
require 'nkf'
|
13
|
+
|
14
|
+
require 'www/mechanize/util'
|
15
|
+
require 'www/mechanize/content_type_error'
|
16
|
+
require 'www/mechanize/response_code_error'
|
17
|
+
require 'www/mechanize/unsupported_scheme_error'
|
18
|
+
require 'www/mechanize/redirect_limit_reached_error'
|
19
|
+
require 'www/mechanize/redirect_not_get_or_head_error'
|
20
|
+
require 'www/mechanize/cookie'
|
21
|
+
require 'www/mechanize/cookie_jar'
|
22
|
+
require 'www/mechanize/history'
|
23
|
+
require 'www/mechanize/form'
|
24
|
+
require 'www/mechanize/pluggable_parsers'
|
25
|
+
require 'www/mechanize/file_response'
|
26
|
+
require 'www/mechanize/inspect'
|
27
|
+
require 'www/mechanize/chain'
|
28
|
+
require 'www/mechanize/monkey_patch'
|
29
|
+
|
30
|
+
module WWW
|
31
|
+
# = Synopsis
|
32
|
+
# The Mechanize library is used for automating interaction with a website. It
|
33
|
+
# can follow links, and submit forms. Form fields can be populated and
|
34
|
+
# submitted. A history of URL's is maintained and can be queried.
|
35
|
+
#
|
36
|
+
# == Example
|
37
|
+
# require 'rubygems'
|
38
|
+
# require 'mechanize'
|
39
|
+
# require 'logger'
|
40
|
+
#
|
41
|
+
# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
|
42
|
+
# agent.user_agent_alias = 'Mac Safari'
|
43
|
+
# page = agent.get("http://www.google.com/")
|
44
|
+
# search_form = page.form_with(:name => "f")
|
45
|
+
# search_form.field_with(:name => "q").value = "Hello"
|
46
|
+
# search_results = agent.submit(search_form)
|
47
|
+
# puts search_results.body
|
48
|
+
class Mechanize
|
49
|
+
##
|
50
|
+
# The version of Mechanize you are using.
|
51
|
+
VERSION = '0.9.3'
|
52
|
+
|
53
|
+
##
|
54
|
+
# User Agent aliases
|
55
|
+
AGENT_ALIASES = {
|
56
|
+
'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
|
57
|
+
'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
|
58
|
+
'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
|
59
|
+
'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
|
60
|
+
'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
|
61
|
+
'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
|
62
|
+
'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
|
63
|
+
'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
|
64
|
+
'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
|
65
|
+
'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
|
66
|
+
}
|
67
|
+
|
68
|
+
attr_accessor :cookie_jar
|
69
|
+
attr_accessor :open_timeout, :read_timeout
|
70
|
+
attr_accessor :user_agent
|
71
|
+
attr_accessor :watch_for_set
|
72
|
+
attr_accessor :ca_file
|
73
|
+
attr_accessor :key
|
74
|
+
attr_accessor :cert
|
75
|
+
attr_accessor :pass
|
76
|
+
attr_accessor :redirect_ok
|
77
|
+
attr_accessor :keep_alive_time
|
78
|
+
attr_accessor :keep_alive
|
79
|
+
attr_accessor :conditional_requests
|
80
|
+
attr_accessor :follow_meta_refresh
|
81
|
+
attr_accessor :verify_callback
|
82
|
+
attr_accessor :history_added
|
83
|
+
attr_accessor :scheme_handlers
|
84
|
+
attr_accessor :redirection_limit
|
85
|
+
|
86
|
+
# A hash of custom request headers
|
87
|
+
attr_accessor :request_headers
|
88
|
+
|
89
|
+
# The HTML parser to be used when parsing documents
|
90
|
+
attr_accessor :html_parser
|
91
|
+
|
92
|
+
attr_reader :history
|
93
|
+
attr_reader :pluggable_parser
|
94
|
+
|
95
|
+
alias :follow_redirect? :redirect_ok
|
96
|
+
|
97
|
+
@html_parser = Nokogiri::HTML
|
98
|
+
class << self; attr_accessor :html_parser, :log end
|
99
|
+
|
100
|
+
# Builds an agent with default settings, then yields it to the optional
# block so the caller can adjust the configuration in place.
def initialize
  # State exposed through the attr_accessors
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = @read_timeout = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil
  @ca_file       = nil # OpenSSL server certificate file

  # Callback for OpenSSL errors while verifying the server certificate
  # chain; can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil
  @cert        = nil  # OpenSSL Certificate
  @key         = nil  # OpenSSL Private Key
  @pass        = nil  # OpenSSL Password
  @redirect_ok = true # Should we follow redirects?

  # State exposed through the attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication
  @user            = nil # Auth User
  @password        = nil # Auth Password
  @digest          = nil # DigestAuth Digest
  @auth_hash       = {}  # Keep track of urls for sending auth
  @request_headers = {}  # A hash of request headers to be used

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # Connection cache & keep alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Any scheme without a registered handler gets one that raises.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end
  # The supported schemes all share one handler that returns the link as is.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  @html_parser = self.class.html_parser

  yield self if block_given?
end
|
164
|
+
|
165
|
+
# Sets max_size on the history object.
def max_history=(length)
  @history.max_size = length
end

# Returns max_size of the history object.
def max_history
  @history.max_size
end

# Sets the logger. It is stored on the class, not on this instance.
def log=(l)
  self.class.log = l
end

# Returns the logger stored on the class.
def log
  self.class.log
end
|
169
|
+
|
170
|
+
# Returns the hooks held by the pre-connect chain handler.
def pre_connect_hooks
  handler = @pre_connect_hook
  handler.hooks
end
|
173
|
+
|
174
|
+
# Returns the hooks held by the post-connect chain handler.
def post_connect_hooks
  handler = @post_connect_hook
  handler.hooks
end
|
177
|
+
|
178
|
+
# Sets the proxy address, port, user, and password.
# +addr+ should be a bare host, with no "http://" prefix.
# Returns the four values as an array.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
|
183
|
+
|
184
|
+
# Sets the user agent from one of the keys of AGENT_ALIASES.
# Raises a RuntimeError ("unknown agent alias") for an unrecognised key.
def user_agent_alias=(al)
  agent_string = AGENT_ALIASES[al]
  raise("unknown agent alias") unless agent_string
  self.user_agent = agent_string
end
|
189
|
+
|
190
|
+
# Returns the contents of the cookie jar as an array (via its to_a).
def cookies
  jar = @cookie_jar
  jar.to_a
end
|
194
|
+
|
195
|
+
# Stores the user and password to be used for authentication.
# Returns +password+.
def auth(user, password)
  @user, @password = user, password
  password
end
|
200
|
+
alias :basic_auth :auth
|
201
|
+
|
202
|
+
# Fetches a URL and returns the resulting page. The page is also added to
# the history and yielded to the block when one is given.
#
# Two calling conventions are accepted:
#
#   get(url, parameters = [], referer = nil)
#   get(:url => url, :params => [...], :referer => ..., :headers => {...})
#
# In the positional form, a second argument that does not respond to
# +each+ is taken to be the referer. Raises ArgumentError when the hash
# form carries no :url.
def get(options, parameters = [], referer = nil)
  headers = nil
  if options.is_a? Hash
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
  else
    url = options
    # FIXME: Remove this in 0.8.0
    unless parameters.respond_to?(:each)
      referer    = parameters
      parameters = []
    end
  end

  unless referer
    # URLs starting with "http" get a blank referer page; anything else
    # uses the current page when there is one.
    referer = current_page unless url.to_s =~ /^http/
    referer ||= Page.new(nil, {'content-type'=>'text/html'})
  end

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  request = {
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :params  => parameters
  }
  page = fetch_page(request)
  add_to_history(page)
  yield page if block_given?
  page
end
|
244
|
+
|
245
|
+
####
# PUT to +url+ with +entity+, and setting +options+:
#
#   put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
#
# Delegates to request_with_entity and returns its result.
def put(url, entity, options = {})
  verb = :put
  request_with_entity(verb, url, entity, options)
end
|
253
|
+
|
254
|
+
####
# DELETE to +url+ with +query_params+, and setting +options+:
#
#   delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
#
# Implemented on top of #head with the verb forced to :delete. The page
# is added to the history and returned.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  page = head(url, query_params, delete_options)
  add_to_history(page)
  page
end
|
264
|
+
|
265
|
+
####
# HEAD to +url+ with +query_params+, and setting +options+:
#
#   head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
#
# Keys given in +options+ override the defaults built here (including
# :verb). The page is yielded to the block, if any, and returned. It is
# not added to the history by this method.
def head(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :head
  }
  # fetch the page
  page = fetch_page(defaults.merge(options))
  yield page if block_given?
  page
end
|
282
|
+
|
283
|
+
# Fetches +url+ with #get and returns the body of the result.
def get_file(url)
  fetched = get(url)
  fetched.body
end
|
287
|
+
|
288
|
+
# Clicks the WWW::Mechanize::Link object passed in and returns the page
# fetched.
#
# +link+ may also be any object that answers [] for 'href' or 'src'.
# The referer is link.page when the link exposes a +page+ method and it
# returns a value, otherwise the current page.
def click(link)
  # Ask whether the link has a page instead of rescuing every error from
  # link.page, so real failures are no longer silently swallowed.
  referer = link.respond_to?(:page) ? link.page : nil
  href = link.respond_to?(:href) ? link.href :
    (link['href'] || link['src'])
  get(:url => href, :referer => (referer || current_page()))
end
|
296
|
+
|
297
|
+
# Equivalent to the browser back button: pops the most recent entry off
# the history and returns it.
def back
  visited = @history
  visited.pop
end
|
302
|
+
|
303
|
+
# Posts to the given URL with the request entity.  The request
# entity is specified by either a string, or a list of key-value
# pairs represented by a hash or an array of arrays.
#
# A String entity is sent as is through request_with_entity. Otherwise a
# form is assembled from the pairs; any IO value becomes a file upload and
# switches the form to multipart/form-data.
#
# Examples:
#  agent.post('http://example.com/', "foo" => "bar")
#
#  agent.post('http://example.com/', [ ["foo", "bar"] ])
#
#  agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
def post(url, query={}, headers={})
  return request_with_entity(:post, url, query, :headers => headers) if query.is_a?(String)

  # Create a fake form: a Hash of attributes that also answers #search
  # with an empty list.
  node = {}
  def node.search(*args); []; end
  node['method']  = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each do |name, value|
    if value.is_a?(IO)
      form.enctype = 'multipart/form-data'
      upload = Form::FileUpload.new(name.to_s, ::File.basename(value.path))
      upload.file_data = value.read
      form.file_uploads << upload
    else
      form.fields << Form::Field.new(name.to_s, value)
    end
  end
  post_form(url, form, headers)
end
|
338
|
+
|
339
|
+
# Submit a form with an optional button.
# Without a button:
#  page = agent.get('http://example.com')
#  agent.submit(page.forms.first)
# With a button
#  agent.submit(page.forms.first, page.forms.first.buttons.first)
#
# POST forms go through post_form. GET forms are fetched with #get after
# any query string is stripped from the action. Any other method raises a
# RuntimeError.
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  if 'POST' == http_method
    post_form(form.action, form, headers)
  elsif 'GET' == http_method
    action_without_query = form.action.gsub(/\?[^\?]*$/, '')
    get(
      :url     => action_without_query,
      :params  => form.build_query,
      :headers => headers,
      :referer => form.page
    )
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
|
360
|
+
|
361
|
+
# Sends +entity+ to +url+ using the HTTP +verb+ and returns the page,
# which is also added to the history.
#
# +options+ may override :uri, :referer and :headers. The headers default
# to an application/octet-stream Content-Type and a Content-Length equal
# to the byte length of +entity+; caller-supplied headers take precedence.
def request_with_entity(verb, url, entity, options={})
  cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})

  options = {
    :uri     => url,
    :referer => cur_page,
    :headers => {},
  }.update(options)

  # Content-Length counts bytes. On Ruby 1.9+ String#size counts
  # characters, which under-reports multibyte entities.
  entity_length = entity.respond_to?(:bytesize) ? entity.bytesize : entity.size

  headers = {
    'Content-Type'   => 'application/octet-stream',
    'Content-Length' => entity_length.to_s,
  }.update(options[:headers] || {}) # tolerate an explicit :headers => nil

  options.update({
    :verb    => verb,
    :params  => [entity],
    :headers => headers,
  })

  page = fetch_page(options)
  add_to_history(page)
  page
end
|
385
|
+
|
386
|
+
# Returns the current page loaded by Mechanize, i.e. the last entry of
# the history.
def current_page
  visited = @history
  visited.last
end
|
390
|
+
|
391
|
+
# Returns true when visited_page finds a page for +url+, false otherwise.
def visited?(url)
  found = visited_page(url)
  !found.nil?
end
|
395
|
+
|
396
|
+
# Returns a visited page for the url passed in, otherwise nil.
# +url+ may be an object answering +href+ (its href is used) or the url
# itself; it is resolved before the history is consulted.
def visited_page(url)
  target = url.respond_to?(:href) ? url.href : url
  @history.visited_page(resolve(target))
end
|
403
|
+
|
404
|
+
# Runs the given block, then restores the page history to what it was
# before, even if the block raises. self is passed to the block, and the
# value of the block is returned.
def transact
  saved_history = @history.dup
  begin
    yield(self)
  ensure
    @history = saved_history
  end
end
|
414
|
+
|
415
|
+
alias :page :current_page
|
416
|
+
|
417
|
+
private
|
418
|
+
|
419
|
+
# Runs +url+ (with +referer+, by default the current page) through a
# chain holding only the URI resolver, and returns the resulting :uri
# entry as a String.
def resolve(url, referer = current_page())
  request = { :uri => url, :referer => referer }
  resolver = Chain::URIResolver.new(@scheme_handlers)
  Chain.new([resolver]).handle(request)
  request[:uri].to_s
end
|
426
|
+
|
427
|
+
# POSTs the request data of +form+ to +url+ and returns the page, which
# is also added to the history.
#
# The referer is the form's page, else the current page, else a blank
# page. Content-Type comes from the form's enctype and Content-Length is
# the byte length of the request data; +headers+ override both.
def post_form(url, form, headers = {})
  cur_page = form.page || current_page ||
    Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length counts bytes. On Ruby 1.9+ String#size counts
  # characters, which under-reports multibyte form data.
  content_length =
    request_data.respond_to?(:bytesize) ? request_data.bytesize : request_data.size

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => cur_page,
    :verb    => :post,
    :params  => [request_data],
    :headers => {
      'Content-Type'   => form.enctype,
      'Content-Length' => content_length.to_s,
    }.merge(headers)
  )
  add_to_history(page)
  page
end
|
447
|
+
|
448
|
+
# Performs one HTTP exchange described by +params+ and returns the
# resulting page. Meta refreshes, redirects and authentication challenges
# are followed by calling this method again where the agent is configured
# to do so.
#
# uri is an absolute URI
#
# +params+ is merged over the defaults below. Raises
# RedirectLimitReachedError when following a redirect or refresh would
# exceed redirection_limit, StandardError for an unparsable Refresh
# header, and ResponseCodeError for failed authentication or any response
# not otherwise handled.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  # The values read back from options below start out as nil, so these
  # handlers are expected to fill them in.
  before_connect = Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(
      @keep_alive,
      @keep_alive_time,
      @cookie_jar,
      @user_agent,
      # NOTE(review): an empty hash is passed here although the agent keeps
      # @request_headers; confirm against HeaderResolver whether that hash
      # was meant to be handed over instead.
      {}
    ),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ])
  before_connect.handle(options)

  uri           = options[:uri]
  request       = options[:request]
  cur_page      = options[:referer]
  request_data  = options[:params]
  redirects     = options[:redirects]
  http_obj      = options[:connection]

  # Add If-Modified-Since if page is in history
  if( (page = visited_page(uri)) && page.response['Last-Modified'] )
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  # Specify timeouts if given
  http_obj.open_timeout = @open_timeout if @open_timeout
  http_obj.read_timeout = @read_timeout if @read_timeout
  http_obj.start unless http_obj.started?

  # Log specified headers for the request
  log.info("#{ request.class }: #{ request.path }") if log
  request.each_header do |k, v|
    log.debug("request-header: #{ k } => #{ v }")
  end if log

  # Send the request; a dropped connection is restarted and the request
  # retried, and the error is re-raised once two retries have been used.
  attempts = 0
  begin
    response = http_obj.request(request, *request_data) { |r|
      connection_chain = Chain.new([
        Chain::ResponseReader.new(r),
        Chain::BodyDecodingHandler.new,
      ])
      connection_chain.handle(options)
    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
    log.error("Rescuing EOF error") if log
    http_obj.finish
    raise x if attempts >= 2
    request.body = nil
    http_obj.start
    attempts += 1
    retry
  end

  after_connect = Chain.new([
    @post_connect_hook,
    Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
    Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
  ])
  after_connect.handle(options)

  res_klass = options[:res_klass]
  page      = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri  = nil
    referer       = page
    if (page.respond_to?(:meta) && (redirect = page.meta.first))
      redirect_uri = redirect.uri.to_s
      # Apply the redirection limit to meta-tag refreshes too; without it
      # a page that keeps refreshing would recurse without bound.
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep redirect.node['delay'].to_f
      referer = Page.new(nil, {'content-type'=>'text/html'})
    elsif refresh = response['refresh']
      delay, redirect_uri = Page::Meta.parse(refresh, uri)
      raise StandardError, "Invalid refresh http header" unless delay
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep delay.to_f
    end
    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri        => redirect_uri,
        :referer    => referer,
        :params     => [],
        :verb       => :get,
        :redirects  => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri  = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb is followed with GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(  :uri => response['Location'].to_s,
                        :referer => page,
                        :params  => [],
                        :verb => redirect_verb,
                        :redirects => redirects + 1
                     )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # A host already present in the auth hash has been tried before.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page(  :uri      => uri,
                        :referer  => cur_page,
                        :verb     => request.method.downcase.to_sym,
                        :params   => request_data,
                        :headers  => options[:headers]
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
|
613
|
+
|
614
|
+
# Pushes +page+ onto the history under its resolved URI, then invokes the
# history_added callback with the page when one is set.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)
  callback = history_added
  callback.call(page) if callback
end
|
618
|
+
end
|
619
|
+
end
|