diamond-mechanize 2.1 → 2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata +222 -167
- data/Rakefile +0 -49
- data/lib/mechanize/content_type_error.rb +0 -13
- data/lib/mechanize/cookie.rb +0 -232
- data/lib/mechanize/cookie_jar.rb +0 -194
- data/lib/mechanize/download.rb +0 -59
- data/lib/mechanize/element_matcher.rb +0 -36
- data/lib/mechanize/file.rb +0 -65
- data/lib/mechanize/file_connection.rb +0 -17
- data/lib/mechanize/file_request.rb +0 -26
- data/lib/mechanize/file_response.rb +0 -74
- data/lib/mechanize/file_saver.rb +0 -39
- data/lib/mechanize/form/button.rb +0 -6
- data/lib/mechanize/form/check_box.rb +0 -12
- data/lib/mechanize/form/field.rb +0 -54
- data/lib/mechanize/form/file_upload.rb +0 -21
- data/lib/mechanize/form/hidden.rb +0 -3
- data/lib/mechanize/form/image_button.rb +0 -19
- data/lib/mechanize/form/keygen.rb +0 -34
- data/lib/mechanize/form/multi_select_list.rb +0 -94
- data/lib/mechanize/form/option.rb +0 -50
- data/lib/mechanize/form/radio_button.rb +0 -55
- data/lib/mechanize/form/reset.rb +0 -3
- data/lib/mechanize/form/select_list.rb +0 -44
- data/lib/mechanize/form/submit.rb +0 -3
- data/lib/mechanize/form/text.rb +0 -3
- data/lib/mechanize/form/textarea.rb +0 -3
- data/lib/mechanize/form.rb +0 -543
- data/lib/mechanize/headers.rb +0 -23
- data/lib/mechanize/history.rb +0 -82
- data/lib/mechanize/http/agent.rb +0 -1004
- data/lib/mechanize/http/auth_challenge.rb +0 -59
- data/lib/mechanize/http/auth_realm.rb +0 -31
- data/lib/mechanize/http/content_disposition_parser.rb +0 -188
- data/lib/mechanize/http/www_authenticate_parser.rb +0 -155
- data/lib/mechanize/http.rb +0 -8
- data/lib/mechanize/monkey_patch.rb +0 -16
- data/lib/mechanize/page/base.rb +0 -7
- data/lib/mechanize/page/frame.rb +0 -27
- data/lib/mechanize/page/image.rb +0 -30
- data/lib/mechanize/page/label.rb +0 -20
- data/lib/mechanize/page/link.rb +0 -98
- data/lib/mechanize/page/meta_refresh.rb +0 -68
- data/lib/mechanize/page.rb +0 -440
- data/lib/mechanize/parser.rb +0 -173
- data/lib/mechanize/pluggable_parsers.rb +0 -144
- data/lib/mechanize/redirect_limit_reached_error.rb +0 -19
- data/lib/mechanize/redirect_not_get_or_head_error.rb +0 -21
- data/lib/mechanize/response_code_error.rb +0 -21
- data/lib/mechanize/response_read_error.rb +0 -27
- data/lib/mechanize/robots_disallowed_error.rb +0 -28
- data/lib/mechanize/test_case.rb +0 -663
- data/lib/mechanize/unauthorized_error.rb +0 -3
- data/lib/mechanize/unsupported_scheme_error.rb +0 -6
- data/lib/mechanize/util.rb +0 -101
- data/lib/mechanize.rb +0 -1079
- data/test/data/htpasswd +0 -1
- data/test/data/server.crt +0 -16
- data/test/data/server.csr +0 -12
- data/test/data/server.key +0 -15
- data/test/data/server.pem +0 -15
- data/test/htdocs/alt_text.html +0 -10
- data/test/htdocs/bad_form_test.html +0 -9
- data/test/htdocs/button.jpg +0 -0
- data/test/htdocs/canonical_uri.html +0 -9
- data/test/htdocs/dir with spaces/foo.html +0 -1
- data/test/htdocs/empty_form.html +0 -6
- data/test/htdocs/file_upload.html +0 -26
- data/test/htdocs/find_link.html +0 -41
- data/test/htdocs/form_multi_select.html +0 -16
- data/test/htdocs/form_multival.html +0 -37
- data/test/htdocs/form_no_action.html +0 -18
- data/test/htdocs/form_no_input_name.html +0 -16
- data/test/htdocs/form_order_test.html +0 -11
- data/test/htdocs/form_select.html +0 -16
- data/test/htdocs/form_set_fields.html +0 -14
- data/test/htdocs/form_test.html +0 -188
- data/test/htdocs/frame_referer_test.html +0 -10
- data/test/htdocs/frame_test.html +0 -30
- data/test/htdocs/google.html +0 -13
- data/test/htdocs/index.html +0 -6
- data/test/htdocs/link with space.html +0 -5
- data/test/htdocs/meta_cookie.html +0 -11
- data/test/htdocs/no_title_test.html +0 -6
- data/test/htdocs/noindex.html +0 -9
- data/test/htdocs/rails_3_encoding_hack_form_test.html +0 -27
- data/test/htdocs/relative/tc_relative_links.html +0 -21
- data/test/htdocs/robots.html +0 -8
- data/test/htdocs/robots.txt +0 -2
- data/test/htdocs/tc_bad_charset.html +0 -9
- data/test/htdocs/tc_bad_links.html +0 -5
- data/test/htdocs/tc_base_link.html +0 -8
- data/test/htdocs/tc_blank_form.html +0 -11
- data/test/htdocs/tc_charset.html +0 -6
- data/test/htdocs/tc_checkboxes.html +0 -19
- data/test/htdocs/tc_encoded_links.html +0 -5
- data/test/htdocs/tc_field_precedence.html +0 -11
- data/test/htdocs/tc_follow_meta.html +0 -8
- data/test/htdocs/tc_form_action.html +0 -48
- data/test/htdocs/tc_links.html +0 -19
- data/test/htdocs/tc_meta_in_body.html +0 -9
- data/test/htdocs/tc_pretty_print.html +0 -17
- data/test/htdocs/tc_referer.html +0 -16
- data/test/htdocs/tc_relative_links.html +0 -19
- data/test/htdocs/tc_textarea.html +0 -23
- data/test/htdocs/test_click.html +0 -11
- data/test/htdocs/unusual______.html +0 -5
- data/test/test_mechanize.rb +0 -1164
- data/test/test_mechanize_cookie.rb +0 -451
- data/test/test_mechanize_cookie_jar.rb +0 -483
- data/test/test_mechanize_download.rb +0 -43
- data/test/test_mechanize_file.rb +0 -61
- data/test/test_mechanize_file_connection.rb +0 -21
- data/test/test_mechanize_file_request.rb +0 -19
- data/test/test_mechanize_file_saver.rb +0 -21
- data/test/test_mechanize_form.rb +0 -875
- data/test/test_mechanize_form_check_box.rb +0 -38
- data/test/test_mechanize_form_encoding.rb +0 -114
- data/test/test_mechanize_form_field.rb +0 -63
- data/test/test_mechanize_form_file_upload.rb +0 -20
- data/test/test_mechanize_form_image_button.rb +0 -12
- data/test/test_mechanize_form_keygen.rb +0 -32
- data/test/test_mechanize_form_multi_select_list.rb +0 -84
- data/test/test_mechanize_form_option.rb +0 -55
- data/test/test_mechanize_form_radio_button.rb +0 -78
- data/test/test_mechanize_form_select_list.rb +0 -76
- data/test/test_mechanize_form_textarea.rb +0 -52
- data/test/test_mechanize_headers.rb +0 -35
- data/test/test_mechanize_history.rb +0 -103
- data/test/test_mechanize_http_agent.rb +0 -1225
- data/test/test_mechanize_http_auth_challenge.rb +0 -39
- data/test/test_mechanize_http_auth_realm.rb +0 -49
- data/test/test_mechanize_http_content_disposition_parser.rb +0 -118
- data/test/test_mechanize_http_www_authenticate_parser.rb +0 -146
- data/test/test_mechanize_link.rb +0 -80
- data/test/test_mechanize_page.rb +0 -118
- data/test/test_mechanize_page_encoding.rb +0 -182
- data/test/test_mechanize_page_frame.rb +0 -16
- data/test/test_mechanize_page_link.rb +0 -390
- data/test/test_mechanize_page_meta_refresh.rb +0 -127
- data/test/test_mechanize_parser.rb +0 -289
- data/test/test_mechanize_pluggable_parser.rb +0 -52
- data/test/test_mechanize_redirect_limit_reached_error.rb +0 -24
- data/test/test_mechanize_redirect_not_get_or_head_error.rb +0 -14
- data/test/test_mechanize_subclass.rb +0 -22
- data/test/test_mechanize_util.rb +0 -103
- data/test/test_multi_select.rb +0 -119
data/lib/mechanize/headers.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
##
# A Hash of header fields whose keys are matched case-insensitively.  Keys
# are downcased on every read, write and membership test, so "Content-Type"
# and "content-type" address the same entry.

class Mechanize::Headers < Hash

  # Returns the value stored for +key+, ignoring the case of +key+.
  def [](key)
    super(key.downcase)
  end

  # Stores +value+ under the downcased form of +key+.
  def []=(key, value)
    super(key.downcase, value)
  end

  # Returns true when a value is stored for +key+, ignoring the case of +key+.
  def key?(key)
    super(key.downcase)
  end

  # Hash#has_key?, Hash#include? and Hash#member? are separate method entries
  # rather than wrappers around #key?, so overriding #key? alone would leave
  # them case-sensitive.  Alias them so every membership test agrees.
  alias has_key? key?
  alias include? key?
  alias member? key?

  # Yields each header as a single <tt>[name, value]</tt> array with the name
  # in canonical capitalization (e.g. "content-type" becomes "Content-Type").
  # Returns an enumerator when no block is given.
  def canonical_each
    block_given? or return enum_for(__method__)

    each { |key, value|
      # Capitalize the first letter, then the letter following every hyphen.
      canonical = key.capitalize.gsub(/-([a-z])/) { "-#{$1.upcase}" }
      yield [canonical, value]
    }
  end
end
|
23
|
-
|
data/lib/mechanize/history.rb
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
##
# Tracks the pages a mechanize object has visited.  Pages are kept in visit
# order (this is an Array) and are also indexed by URI string so a page that
# was fetched earlier can be looked up by its URI.

class Mechanize::History < Array

  # Upper bound on the number of stored pages; +nil+ means no limit.
  attr_accessor :max_size

  # Creates an empty history holding at most +max_size+ pages.
  def initialize(max_size = nil)
    @history_index = {}
    @max_size      = max_size
  end

  # Gives a copy its own index hash instead of sharing the one in +orig+.
  def initialize_copy(orig)
    super
    @history_index = orig.instance_variable_get(:@history_index).dup
  end

  def inspect # :nodoc:
    "[#{map { |page| page.uri }.join(', ')}]"
  end

  # Appends +page+ and indexes it under +uri+ (or the page's own URI when
  # +uri+ is not given).  The oldest pages are dropped while the history is
  # longer than max_size.  Returns the history itself.
  def push(page, uri = nil)
    super page

    key = (uri || page.uri).to_s
    @history_index[key] = page

    if @max_size
      shift while length > @max_size
    end

    self
  end

  alias :<< :push

  # Returns the page recorded for +uri+, or nil.  When there is no exact
  # match the lookup is retried with an empty path replaced by "/".
  def visited? uri
    found = @history_index[uri.to_s]
    return found if found # HACK

    normalized = uri.dup
    normalized.path = '/' if normalized.path.empty?

    @history_index[normalized.to_s]
  end

  alias visited_page visited?

  # Empties both the URI index and the page list.
  def clear
    @history_index.clear
    super
  end

  # Removes and returns the oldest page (nil when empty), dropping every
  # index entry that refers to it.
  def shift
    return nil if empty?

    oldest = first
    self[0] = nil

    super

    remove_from_index(oldest)
    oldest
  end

  # Removes and returns the newest page (nil when empty), dropping every
  # index entry that refers to it.
  def pop
    return nil if empty?

    newest = super
    remove_from_index(newest)
    newest
  end

  private

  # Deletes every index entry whose value equals +page+.
  def remove_from_index(page)
    @history_index.delete_if { |_, indexed| indexed == page }
  end

end
|
82
|
-
|
data/lib/mechanize/http/agent.rb
DELETED
@@ -1,1004 +0,0 @@
|
|
1
|
-
require 'tempfile'
|
2
|
-
require 'net/ntlm'
|
3
|
-
require 'kconv'
|
4
|
-
require 'webrobots'
|
5
|
-
|
6
|
-
##
|
7
|
-
# An HTTP (and local disk access) user agent. This class is an implementation
|
8
|
-
# detail and is subject to change at any time.
|
9
|
-
|
10
|
-
class Mechanize::HTTP::Agent
|
11
|
-
|
12
|
-
# :section: Headers
|
13
|
-
|
14
|
-
# When true (the default) If-Modified-Since conditional requests are sent for
# pages already in the history; set to false to disable them
|
15
|
-
attr_accessor :conditional_requests
|
16
|
-
|
17
|
-
# Are gzip/deflate-compressed responses requested (via Accept-Encoding)?
|
18
|
-
attr_accessor :gzip_enabled
|
19
|
-
|
20
|
-
# A hash of request headers to be used for every request
|
21
|
-
attr_accessor :request_headers
|
22
|
-
|
23
|
-
# The User-Agent header to send
|
24
|
-
attr_reader :user_agent
|
25
|
-
|
26
|
-
# :section: History
|
27
|
-
|
28
|
-
# history of requests made
|
29
|
-
attr_accessor :history
|
30
|
-
|
31
|
-
# :section: Hooks
|
32
|
-
|
33
|
-
# A list of hooks to call after retrieving a response. Hooks are called with
|
34
|
-
# the agent and the response returned.
|
35
|
-
attr_reader :post_connect_hooks
|
36
|
-
|
37
|
-
# A list of hooks to call before making a request. Hooks are called with
|
38
|
-
# the agent and the request to be performed.
|
39
|
-
attr_reader :pre_connect_hooks
|
40
|
-
|
41
|
-
# A list of hooks to call to handle the content-encoding of a request.
|
42
|
-
attr_reader :content_encoding_hooks
|
43
|
-
|
44
|
-
# :section: HTTP Authentication
|
45
|
-
|
46
|
-
attr_reader :authenticate_methods # :nodoc:
|
47
|
-
attr_reader :digest_challenges # :nodoc:
|
48
|
-
attr_accessor :user
|
49
|
-
attr_accessor :password
|
50
|
-
|
51
|
-
# :section: Redirection
|
52
|
-
|
53
|
-
# Follow HTML meta refresh and HTTP Refresh. If set to +:anywhere+ meta
|
54
|
-
# refresh tags outside of the head element will be followed.
|
55
|
-
attr_accessor :follow_meta_refresh
|
56
|
-
|
57
|
-
# Follow an HTML meta refresh that has no "url=" in the content attribute.
|
58
|
-
#
|
59
|
-
# Defaults to false to prevent infinite refresh loops.
|
60
|
-
attr_accessor :follow_meta_refresh_self
|
61
|
-
|
62
|
-
# Controls how this agent deals with redirects. The following values are
|
63
|
-
# allowed:
|
64
|
-
#
|
65
|
-
# :all, true:: All 3xx redirects are followed (default)
|
66
|
-
# :permanent:: Only 301 Moved Permanently redirects are followed
|
67
|
-
# false:: No redirects are followed
|
68
|
-
attr_accessor :redirect_ok
|
69
|
-
|
70
|
-
# Maximum number of redirects to follow
|
71
|
-
attr_accessor :redirection_limit
|
72
|
-
|
73
|
-
# :section: Robots
|
74
|
-
|
75
|
-
# When true, this agent will consult the site's robots.txt for each access.
|
76
|
-
attr_reader :robots
|
77
|
-
|
78
|
-
# :section: SSL
|
79
|
-
|
80
|
-
# Path to an OpenSSL server certificate file
|
81
|
-
attr_accessor :ca_file
|
82
|
-
|
83
|
-
# An OpenSSL private key or the path to a private key
|
84
|
-
attr_accessor :key
|
85
|
-
|
86
|
-
# An OpenSSL client certificate or the path to a certificate file.
|
87
|
-
attr_accessor :cert
|
88
|
-
|
89
|
-
# An SSL certificate store
|
90
|
-
attr_accessor :cert_store
|
91
|
-
|
92
|
-
# OpenSSL key password
|
93
|
-
attr_accessor :pass
|
94
|
-
|
95
|
-
# A callback for additional certificate verification. See
|
96
|
-
# OpenSSL::SSL::SSLContext#verify_callback
|
97
|
-
#
|
98
|
-
# The callback can be used for debugging or to ignore errors by always
|
99
|
-
# returning +true+. Specifying nil uses the default method that was valid
|
100
|
-
# when the SSLContext was created
|
101
|
-
attr_accessor :verify_callback
|
102
|
-
|
103
|
-
# How to verify SSL connections. Defaults to VERIFY_PEER
|
104
|
-
attr_accessor :verify_mode
|
105
|
-
|
106
|
-
# :section: Timeouts
|
107
|
-
|
108
|
-
# Reset connections that have not been used in this many seconds
|
109
|
-
attr_reader :idle_timeout
|
110
|
-
|
111
|
-
# Set to false to disable HTTP/1.1 keep-alive requests
|
112
|
-
attr_accessor :keep_alive
|
113
|
-
|
114
|
-
# Length of time to wait until a connection is opened in seconds
|
115
|
-
attr_accessor :open_timeout
|
116
|
-
|
117
|
-
# Length of time to attempt to read data from the server
|
118
|
-
attr_accessor :read_timeout
|
119
|
-
|
120
|
-
# :section:
|
121
|
-
|
122
|
-
# The cookies for this agent
|
123
|
-
attr_accessor :cookie_jar
|
124
|
-
|
125
|
-
# URI for a proxy connection
|
126
|
-
attr_reader :proxy_uri
|
127
|
-
|
128
|
-
# Retry non-idempotent requests?
|
129
|
-
attr_reader :retry_change_requests
|
130
|
-
|
131
|
-
# Responses larger than this will be written to a Tempfile instead of stored
|
132
|
-
# in memory.
|
133
|
-
attr_accessor :max_file_buffer
|
134
|
-
|
135
|
-
# :section: Utility
|
136
|
-
|
137
|
-
# The context parses responses into pages
|
138
|
-
attr_accessor :context
|
139
|
-
|
140
|
-
attr_reader :http # :nodoc:
|
141
|
-
|
142
|
-
# Handlers for various URI schemes
|
143
|
-
attr_accessor :scheme_handlers
|
144
|
-
|
145
|
-
# :section:
|
146
|
-
|
147
|
-
# Creates a new Mechanize HTTP user agent. The user agent is an
|
148
|
-
# implementation detail of mechanize and its API may change at any time.
|
149
|
-
|
150
|
-
def initialize
  # Parsing context, cookies and history
  @context    = nil
  @cookie_jar = Mechanize::CookieJar.new
  @history    = Mechanize::History.new

  # Hooks
  @content_encoding_hooks = []
  @post_connect_hooks     = []
  @pre_connect_hooks      = []

  # Request behaviour
  @conditional_requests  = true
  @gzip_enabled          = true
  @request_headers       = {}
  @retry_change_requests = false
  @user_agent            = nil
  @max_file_buffer       = 10240

  # Connection handling and timeouts
  @idle_timeout    = nil
  @keep_alive      = true
  @keep_alive_time = 300
  @open_timeout    = nil
  @read_timeout    = nil
  @proxy_uri       = nil

  # Redirection and meta refresh
  @follow_meta_refresh      = false
  @follow_meta_refresh_self = false
  @redirect_ok              = true
  @redirection_limit        = 20

  # Robots
  @robots    = false
  @webrobots = nil

  # HTTP Authentication.  @authenticate_methods maps a URI to a hash of
  # auth scheme => list of realms, creating both levels on first access.
  @authenticate_parser  = Mechanize::HTTP::WWWAuthenticateParser.new
  @authenticate_methods = Hash.new do |methods, uri|
    methods[uri] = Hash.new do |realms, auth_scheme|
      realms[auth_scheme] = []
    end
  end
  @digest_auth       = Net::HTTP::DigestAuth.new
  @digest_challenges = {}
  @password          = nil # HTTP auth password
  @user              = nil # HTTP auth user

  # SSL
  @ca_file         = nil
  @cert            = nil
  @cert_store      = nil
  @key             = nil
  @pass            = nil
  @verify_callback = nil
  @verify_mode     = nil

  # Any scheme without an explicit handler gets one that raises when called.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise Mechanize::UnsupportedSchemeError, scheme
    }
  end

  # The supported schemes all share one handler that returns the link as-is.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end
end
|
208
|
-
|
209
|
-
# Retrieves +uri+ and parses it into a page or other object according to
|
210
|
-
# PluggableParser. If the URI is an HTTP or HTTPS scheme URI the given HTTP
|
211
|
-
# +method+ is used to retrieve it, along with the HTTP +headers+, request
|
212
|
-
# +params+ and HTTP +referer+.
|
213
|
-
#
|
214
|
-
# +redirects+ tracks the number of redirects experienced when retrieving the
|
215
|
-
# page. If it is over the redirection_limit an error will be raised.
|
216
|
-
|
217
|
-
def fetch uri, method = :get, headers = {}, params = [],
          referer = current_page, redirects = 0
  referer_uri = referer ? referer.uri : nil

  uri = resolve uri, referer

  uri, params = resolve_parameters uri, method, params

  request = http_request uri, method, params

  connection = connection_for uri

  # Populate the request: authentication first, then the standard headers,
  # and finally the caller-supplied headers.
  request_auth request, uri

  disable_keep_alive request
  enable_gzip request

  request_language_charset request
  request_cookies request, uri
  request_host request, uri
  request_referer request, uri, referer_uri
  request_user_agent request
  request_add_headers request, headers

  pre_connect request

  # Consult robots.txt
  if robots && uri.is_a?(URI::HTTP)
    robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
  end

  # Add If-Modified-Since if page is in history.  The history is consulted
  # once, and only when conditional requests are enabled.
  if @conditional_requests
    cached = visited_page(uri)
    last_modified = cached && cached.response['Last-Modified']
    request['If-Modified-Since'] = last_modified if last_modified
  end

  # Specify timeouts if given
  connection.open_timeout = @open_timeout if @open_timeout
  connection.read_timeout = @read_timeout if @read_timeout

  request_log request

  response_body_io = nil

  # Send the request.  The body is read inside the block and handed out
  # through response_body_io.
  response = connection.request(uri, request) { |res|
    response_log res

    response_body_io = response_read res, request

    res
  }

  hook_content_encoding response, uri, response_body_io

  response_body_io = response_content_encoding response, response_body_io

  post_connect uri, response, response_body_io

  page = response_parse response, response_body_io, uri

  response_cookies response, uri, page

  meta = response_follow_meta_refresh response, uri, page, redirects
  return meta if meta

  # Order matters: Net::HTTPNotModified must be tested before the more
  # general Net::HTTPRedirection.
  case response
  when Net::HTTPSuccess
    if robots && page.is_a?(Mechanize::Page)
      page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
    end

    page
  when Mechanize::FileResponse
    page
  when Net::HTTPNotModified
    log.debug("Got cached page") if log
    visited_page(uri) || page
  when Net::HTTPRedirection
    response_redirect response, method, page, redirects, referer
  when Net::HTTPUnauthorized
    response_authenticate(response, page, uri, request, headers, params,
                          referer)
  else
    raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
  end
end
|
306
|
-
|
307
|
-
# Retry non-idempotent requests
|
308
|
-
|
309
|
-
def retry_change_requests= retri
  @retry_change_requests = retri

  # Forward the setting to the connection only when one exists.
  return unless @http

  @http.retry_change_requests = retri
end
|
313
|
-
|
314
|
-
# :section: Headers
|
315
|
-
|
316
|
-
# Sets the User-Agent string.  The @webrobots object is discarded whenever
# the value actually changes.
def user_agent= user_agent
  @webrobots = nil unless user_agent == @user_agent
  @user_agent = user_agent
end
|
320
|
-
|
321
|
-
# :section: History
|
322
|
-
|
323
|
-
# Equivalent to the browser back button. Returns the most recent page
|
324
|
-
# visited.
|
325
|
-
def back
  # Popping removes the newest history entry and returns it.
  @history.pop()
end
|
328
|
-
|
329
|
-
##
|
330
|
-
# Returns the latest page loaded by the agent
|
331
|
-
|
332
|
-
def current_page
  # The newest history entry; nothing is removed.
  @history.last()
end
|
335
|
-
|
336
|
-
# Returns the history's max_size.
def max_history
  @history.max_size()
end
|
339
|
-
|
340
|
-
# Sets the history's max_size to +length+.
def max_history=(length)
  @history.max_size = length
end
|
343
|
-
|
344
|
-
# Returns a visited page for the url passed in, otherwise nil
|
345
|
-
def visited_page url
  # The history is queried with the resolved form of +url+.
  resolved = resolve url
  @history.visited_page resolved
end
|
348
|
-
|
349
|
-
# :section: Hooks
|
350
|
-
|
351
|
-
# Calls every content-encoding hook, in order, with this agent, the +uri+,
# the +response+ and the response body IO.
def hook_content_encoding response, uri, response_body_io
  @content_encoding_hooks.each { |encoding_hook|
    encoding_hook.call self, uri, response, response_body_io
  }
end
|
356
|
-
|
357
|
-
##
|
358
|
-
# Invokes hooks added to post_connect_hooks after a +response+ is returned
|
359
|
-
# and the response +body+ is handled.
|
360
|
-
#
|
361
|
-
# Yields the +context+, the +uri+ for the request, the +response+ and the
|
362
|
-
# response +body+.
|
363
|
-
|
364
|
-
def post_connect uri, response, body_io # :yields: agent, uri, response, body
  @post_connect_hooks.each { |connect_hook|
    begin
      body = body_io.read
      connect_hook.call self, uri, response, body
    ensure
      # Rewind even when a hook raises so the body can be read again.
      body_io.rewind
    end
  }
end
|
373
|
-
|
374
|
-
##
|
375
|
-
# Invokes hooks added to pre_connect_hooks before a +request+ is made.
|
376
|
-
# Yields the +agent+ and the +request+ that will be performed to each hook.
|
377
|
-
|
378
|
-
def pre_connect request # :yields: agent, request
  @pre_connect_hooks.each { |connect_hook| connect_hook.call self, request }
end
|
383
|
-
|
384
|
-
# :section: Request
|
385
|
-
|
386
|
-
# Returns the connection to use for +uri+: the @http connection for http and
# https, a new Mechanize::FileConnection for file, and nil for any other
# scheme.
def connection_for uri
  scheme = uri.scheme.downcase

  return @http if %w[http https].include?(scheme)
  return Mechanize::FileConnection.new if scheme == 'file'

  nil
end
|
394
|
-
|
395
|
-
# Adds "connection: close" to +request+ when keep-alive is turned off.
def disable_keep_alive request
  return if @keep_alive

  request['connection'] = 'close'
end
|
398
|
-
|
399
|
-
# Sets the accept-encoding header of +request+ according to @gzip_enabled.
def enable_gzip request
  encodings = @gzip_enabled ? 'gzip,deflate,identity' : 'identity'
  request['accept-encoding'] = encodings
end
|
406
|
-
|
407
|
-
# Builds the request object for +uri+.  For http/https this is the Net::HTTP
# request class named after +method+, with the first element of +params+ as
# its body when +params+ is given.  For file it is a Mechanize::FileRequest.
# Any other scheme yields nil.
def http_request uri, method, params = nil
  scheme = uri.scheme.downcase

  if %w[http https].include?(scheme)
    request_class = Net::HTTP.const_get(method.to_s.capitalize)

    request = request_class.new(uri.request_uri)
    request.body = params.first if params

    request
  elsif scheme == 'file'
    Mechanize::FileRequest.new uri
  end
end
|
420
|
-
|
421
|
-
# Copies the agent-wide @request_headers and then the per-request +headers+
# onto +request+.  The symbols :etag and :if_modified_since are translated
# to their header names; any other Symbol key raises ArgumentError.
def request_add_headers request, headers = {}
  @request_headers.each { |name, content| request[name] = content }

  symbolic_names = {
    :etag              => "ETag",
    :if_modified_since => "If-Modified-Since",
  }

  headers.each { |field, value|
    name = if field.is_a?(Symbol)
             symbolic_names.fetch(field) {
               raise ArgumentError, "unknown header symbol #{field}"
             }
           else
             field
           end

    request[name] = value
  }
end
|
437
|
-
|
438
|
-
# Adds credentials to +request+ when a realm is already recorded for the
# root of +uri+.  Schemes are tried in the order digest, IIS digest, basic,
# and the first one with a matching realm is used.
def request_auth request, uri
  base_uri = uri + '/'
  schemes  = @authenticate_methods[base_uri]

  # Looked up lazily so later schemes are only touched when needed.
  realm_for = lambda { |scheme|
    schemes[scheme].find { |known| known.uri == base_uri }
  }

  if (realm = realm_for.call(:digest))
    request_auth_digest request, uri, realm, base_uri, false
  elsif (realm = realm_for.call(:iis_digest))
    request_auth_digest request, uri, realm, base_uri, true
  elsif realm_for.call(:basic)
    request.basic_auth @user, @password
  end
end
|
450
|
-
|
451
|
-
# Sets the Authorization header of +request+ from the digest challenge stored
# for +realm+.  Note that the agent's user and password are written into
# +uri+ itself before the header is generated.
def request_auth_digest request, uri, realm, base_uri, iis
  uri.user     = @user
  uri.password = @password

  challenge = @digest_challenges[realm].to_s

  request['Authorization'] =
    @digest_auth.auth_header(uri, challenge, request.method, iis)
end
|
460
|
-
|
461
|
-
# Adds a Cookie header to +request+ built from the cookies the jar returns
# for +uri+.  Nothing is added when there are none.
def request_cookies request, uri
  unless @cookie_jar.empty? uri
    jar_cookies = @cookie_jar.cookies uri

    request.add_field 'Cookie', jar_cookies.join('; ') unless jar_cookies.empty?
  end
end
|
470
|
-
|
471
|
-
# Sets the Host header of +request+ from +uri+.  The port is omitted only
# when it is the default port for the URI's own scheme; e.g. port 443 on a
# plain http URI is not the default and therefore is included.
def request_host request, uri
  port = uri.port == uri.default_port ? nil : uri.port
  host = uri.host

  # compact drops the nil port so no trailing ":" is produced.
  request['Host'] = [host, port].compact.join ':'
end
|
477
|
-
|
478
|
-
# Sets the fixed accept-charset and accept-language headers on +request+.
def request_language_charset request
  charset, language = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'en-us,en;q=0.5'

  request['accept-charset']  = charset
  request['accept-language'] = language
end
|
482
|
-
|
483
|
-
# Log specified headers for the request
|
484
|
-
def request_log request
  if log
    # One info line for the request itself, then one debug line per header.
    log.info("#{request.class}: #{request.path}")

    request.each_header { |name, value|
      log.debug("request-header: #{name} => #{value}")
    }
  end
end
|
493
|
-
|
494
|
-
# Sets the Referer header of +request+ to +referer+.  Nothing is set when
# there is no referer, or when the referer is https and +uri+ is not.
def request_referer request, uri, referer
  return unless referer

  leaving_https = 'https' == referer.scheme.downcase &&
                  'https' != uri.scheme.downcase
  return if leaving_https

  request['Referer'] = referer
end
|
501
|
-
|
502
|
-
# Sets the User-Agent header of +request+ when a user agent is configured.
def request_user_agent request
  return unless @user_agent

  request['User-Agent'] = @user_agent
end
|
505
|
-
|
506
|
-
# Turns +uri+ (a URI or anything responding to to_s) into an absolute URI.
# Non-URI input is escaped and parsed first.  Relative URIs are resolved
# against +referer+: the last entry of its bases is used when that has an
# absolute URI, otherwise the referer's own URI.  An empty path becomes "/".
#
# Raises ArgumentError when a relative URI is given without a usable referer
# or when the resulting scheme is not http, https or file.
def resolve(uri, referer = current_page)
  # Work on a copy so the caller's URI object is not modified.
  uri = uri.dup if uri.is_a?(URI)

  unless uri.is_a?(URI)
    # Percent-escape every character outside the 0..126 range.
    uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
      if RUBY_VERSION >= "1.9.0"
        Mechanize::Util.uri_escape(match)
      else
        sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
      end
    }

    # Split into already-escaped runs (and "#") and the text between them so
    # that only the text between them is escaped again.
    unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
    escaped   = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)

    escaped_uri = Mechanize::Util.html_unescape(
      unescaped.zip(escaped).map { |x,y|
        "#{WEBrick::HTTPUtils.escape(x)}#{y}"
      }.join('')
    )

    begin
      uri = URI.parse(escaped_uri)
    rescue
      # Second attempt with the whole string escaped once more.
      uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
    end
  end

  scheme = uri.relative? ? 'relative' : uri.scheme.downcase
  uri = @scheme_handlers[scheme].call(uri, referer)

  if referer && referer.uri
    if uri.path.length == 0 && uri.relative?
      uri.path = referer.uri.path
    end
  end

  uri.path = '/' if uri.path.length == 0

  if uri.relative?
    raise ArgumentError, "absolute URL needed (not #{uri})" unless
      referer && referer.uri

    base = nil
    if referer.respond_to?(:bases) && referer.parser
      base = referer.bases.last
    end

    # A single merge is enough: the result is absolute, so merging it with
    # the referer URI a second time would just return it unchanged.
    uri = ((base && base.uri && base.uri.absolute?) ?
           base.uri :
           referer.uri) + uri

    # Strip initial "/.." bits from the path
    uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
    raise ArgumentError, "unsupported scheme: #{uri.scheme}"
  end

  uri
end
|
568
|
-
|
569
|
-
# For head, get, delete and trace requests the +parameters+ are appended to
# the query string of +uri+ and nil is returned in their place.  For every
# other +method+ both values are returned untouched.
def resolve_parameters uri, method, parameters
  query_methods = [:head, :get, :delete, :trace]
  return uri, parameters unless query_methods.include?(method)

  if parameters and parameters.length > 0
    uri.query ||= ''
    # An existing query string is extended, separated by "&".
    uri.query << '&' if uri.query.length > 0
    uri.query << Mechanize::Util.build_query_string(parameters)
  end

  return uri, nil
end
|
583
|
-
|
584
|
-
# :section: Response
|
585
|
-
|
586
|
-
# Returns <tt>[delay, href]</tt> for a refresh that should be followed, or
# nil.  The page's first meta refresh takes precedence over the HTTP
# "refresh" header.  A refresh that points back at the page itself is only
# returned when @follow_meta_refresh_self is set.
#
# Raises Mechanize::Error when the refresh header yields no delay.
def get_meta_refresh response, uri, page
  return nil unless @follow_meta_refresh

  if page.respond_to?(:meta_refresh) &&
     (redirect = page.meta_refresh.first)
    suppressed = !@follow_meta_refresh_self && redirect.link_self

    suppressed ? nil : [redirect.delay, redirect.href]
  elsif (refresh = response['refresh'])
    delay, href, link_self = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay

    suppressed = !@follow_meta_refresh_self && link_self

    suppressed ? nil : [delay.to_f, href]
  end
end
|
600
|
-
|
601
|
-
# Handles a 401 response: picks a challenge from the www-authenticate header
# (Digest, then NTLM, then Basic), records the realm so request_auth can
# send credentials, and re-issues the request through fetch.
#
# Raises Mechanize::UnauthorizedError when no user or password is set, when
# no supported challenge is offered, or when the same realm was already
# tried.
def response_authenticate(response, page, uri, request, headers, params,
                          referer)
  raise Mechanize::UnauthorizedError, page unless @user || @password

  challenges = @authenticate_parser.parse response['www-authenticate']

  # Authentication scheme names are case-insensitive, so "basic", "Basic"
  # and "BASIC" must all select the same handler.
  find_challenge = lambda { |name|
    challenges.find { |c| c.scheme.to_s.casecmp(name) == 0 }
  }

  if (challenge = find_challenge.call('Digest'))
    realm = challenge.realm uri

    auth_scheme = if response['server'] =~ /Microsoft-IIS/ then
                    :iis_digest
                  else
                    :digest
                  end

    existing_realms = @authenticate_methods[realm.uri][auth_scheme]

    raise Mechanize::UnauthorizedError, page if
      existing_realms.include? realm

    existing_realms << realm
    @digest_challenges[realm] = challenge
  elsif (challenge = find_challenge.call('NTLM'))
    existing_realms = @authenticate_methods[uri + '/'][:ntlm]

    # No realm is computed for NTLM.  nil is stored as the marker that a
    # handshake was started for this URI, so a later challenge without
    # params (a rejected handshake) raises below.
    realm = nil

    raise Mechanize::UnauthorizedError, page if
      existing_realms.include?(realm) and not challenge.params

    existing_realms << realm

    if challenge.params then
      # The server sent its type 2 message; answer with type 3.
      type_2 = Net::NTLM::Message.decode64 challenge.params

      type_3 = type_2.response({ :user => @user, :password => @password, },
                               { :ntlmv2 => true }).encode64

      headers['Authorization'] = "NTLM #{type_3}"
    else
      # First round: start the handshake with a type 1 message.
      type_1 = Net::NTLM::Message::Type1.new.encode64
      headers['Authorization'] = "NTLM #{type_1}"
    end
  elsif (challenge = find_challenge.call('Basic'))
    realm = challenge.realm uri

    existing_realms = @authenticate_methods[realm.uri][:basic]

    raise Mechanize::UnauthorizedError, page if
      existing_realms.include? realm

    existing_realms << realm
  else
    raise Mechanize::UnauthorizedError, page
  end

  fetch uri, request.method.downcase.to_sym, headers, params, referer
end
|
657
|
-
|
658
|
-
##
# Decodes +body_io+ according to the Content-Encoding header of +response+
# and returns an IO rewound to the start of the decoded body.
#
# A missing encoding, 'none' and '7bit' return +body_io+ itself.  'deflate'
# bodies are inflated with the default window bits first and as a raw
# deflate stream if that fails.  'gzip' and 'x-gzip' bodies are read through
# Zlib::GzipReader, falling back to a raw inflate of the data that follows
# the first 10 bytes.
#
# Returns nil when a 'deflate' or 'gzip' body has zero length.
#
# Raises Mechanize::Error for an unsupported Content-Encoding, or when a
# 'deflate' body cannot be inflated by either method.

def response_content_encoding response, body_io
  length = response.content_length

  # No Content-Length available: measure the body that was actually read.
  length = case body_io
           when IO, Tempfile then
             body_io.stat.size
           else
             body_io.length
           end unless length

  out_io = nil

  case response['Content-Encoding']
  when nil, 'none', '7bit' then
    out_io = body_io
  when 'deflate' then
    log.debug('deflate body') if log

    return if length.zero?

    begin
      out_io = inflate body_io
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      body_io.rewind
      begin
        out_io = inflate body_io, -Zlib::MAX_WBITS
      rescue Zlib::BufError, Zlib::DataError => e
        log.error("unable to inflate page: #{e}") if log
        # There is no decoded body to flush/rewind below, so fail with a
        # descriptive error instead of a NoMethodError on nil.
        raise Mechanize::Error, "unable to inflate page: #{e}"
      end
    end
  when 'gzip', 'x-gzip' then
    log.debug('gzip body') if log

    return if length.zero?

    begin
      zio = Zlib::GzipReader.new body_io
      out_io = Tempfile.new 'mechanize-decode', :encoding => 'ascii-8bit'
      out_io.binmode

      until zio.eof? do
        out_io.write zio.read 16384
      end
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log
      body_io.rewind
      # Skip 10 bytes (the size of a minimal gzip header) and treat the rest
      # as a raw deflate stream.
      body_io.read 10

      out_io = inflate body_io, -Zlib::MAX_WBITS
    rescue Zlib::DataError
      # Whatever was written to out_io before the failure is still returned.
      log.error("unable to gunzip page: #{$!}") if log
    ensure
      zio.close if zio and not zio.closed?
    end
  else
    raise Mechanize::Error,
          "Unsupported Content-Encoding: #{response['Content-Encoding']}"
  end

  out_io.flush
  out_io.rewind

  out_io
end
|
725
|
-
|
726
|
-
##
# Saves the cookies that came with +response+ for +uri+.
#
# When +page+ is a Mechanize::Page whose body mentions Set-Cookie, the
# content of every <meta http-equiv="Set-Cookie"> element in the document
# head is saved first.  Each Set-Cookie response header is saved afterwards.
#
# Returns nil when the response has no Set-Cookie header.

def response_cookies response, uri, page
  html_page = Mechanize::Page === page

  # The regexp test skips the document search for pages that cannot contain
  # a meta cookie.
  if html_page and page.body =~ /Set-Cookie/n
    meta_tags = page.search('//head/meta[@http-equiv="Set-Cookie"]')
    meta_tags.each { |tag| save_cookies(uri, tag['content']) }
  end

  cookie_headers = response.get_fields 'Set-Cookie'

  cookie_headers.each { |value| save_cookies(uri, value) } if cookie_headers
end
|
741
|
-
|
742
|
-
##
# Parses the +set_cookie+ string for +uri+ and offers every resulting cookie
# to the cookie jar, logging at debug level whether the jar accepted it.

def save_cookies(uri, set_cookie)
  logger = log # fetched once and reused for every cookie

  Mechanize::Cookie.parse(uri, set_cookie, logger) do |cookie|
    accepted = @cookie_jar.add(uri, cookie)
    next unless logger

    logger.debug(accepted ? "saved cookie: #{cookie}" : "rejected cookie: #{cookie}")
  end
end
|
752
|
-
|
753
|
-
##
# Follows the meta refresh reported by get_meta_refresh for +page+.
#
# Returns nil when there is no refresh URL.  Otherwise sleeps for the
# refresh delay, records +page+ in the history and returns the result of
# fetching the new URL with GET.
#
# Raises Mechanize::RedirectLimitReachedError when following the refresh
# would exceed the redirection limit.

def response_follow_meta_refresh response, uri, page, redirects
  delay, new_url = get_meta_refresh(response, uri, page)
  return nil unless new_url

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    redirects + 1 > @redirection_limit

  sleep delay
  @history.push(page, page.uri)

  # Count the refresh as a redirect (as response_redirect does) so a chain
  # of pages that keep refreshing eventually trips the limit check above
  # instead of looping forever.
  fetch new_url, :get, {}, [],
        Mechanize::Page.new(nil, {'content-type'=>'text/html'}), redirects + 1
end
|
765
|
-
|
766
|
-
##
# Logs the status line of +response+ at info level and each of its headers
# at debug level.  Does nothing when no logger is configured.

def response_log response
  logger = log
  return unless logger

  status_line = "status: #{response.class} #{response.http_version} " \
                "#{response.code} #{response.message}"
  logger.info(status_line)

  response.each_header { |name, value|
    logger.debug("response-header: #{name} => #{value}")
  }
end
|
776
|
-
|
777
|
-
##
# Delegates parsing to the context, passing +uri+, +response+ and +body_io+
# (in that order), and returns whatever the context's parse returns.

def response_parse(response, body_io, uri)
  @context.parse(uri, response, body_io)
end
|
780
|
-
|
781
|
-
##
# Reads the body of +response+ and returns it as an IO rewound to the start.
#
# The body is buffered in a StringIO unless the announced Content-Length, or
# the number of bytes actually read, exceeds @max_file_buffer; in that case
# it is stored in a binary Tempfile instead.
#
# Raises Mechanize::ResponseReadError when reading fails with a
# Net::HTTP::Persistent::Error, Mechanize::ResponseCodeError for a
# Net::HTTPUnknownResponse, and EOFError when the body length differs from
# Content-Length (not checked for HEAD requests or redirect responses).

def response_read response, request
  content_length = response.content_length

  if content_length and content_length > @max_file_buffer then
    body_io = Tempfile.new 'mechanize-raw'
    body_io.binmode if defined? body_io.binmode
  else
    body_io = StringIO.new
  end

  body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
  total = 0

  begin
    response.read_body { |part|
      total += part.length

      # The in-memory buffer grew past the limit: move what has been read so
      # far into a Tempfile and keep writing there.
      if StringIO === body_io and total > @max_file_buffer then
        new_io = Tempfile.new 'mechanize-raw'
        # Check binmode on the new Tempfile itself so it is switched to
        # binary mode just like the Tempfile created up front.
        new_io.binmode if defined? new_io.binmode
        new_io.set_encoding(body_io.external_encoding)
        new_io.write body_io.string

        body_io = new_io
      end

      body_io.write(part)
      log.debug("Read #{part.length} bytes (#{total} total)") if log
    }
  rescue Net::HTTP::Persistent::Error => e
    # Hand the partial body to the caller through the error.
    body_io.rewind
    raise Mechanize::ResponseReadError.new(e, response, body_io)
  end

  body_io.flush
  body_io.rewind

  raise Mechanize::ResponseCodeError, response if
    Net::HTTPUnknownResponse === response

  content_length = response.content_length

  unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
    raise EOFError, "Content-Length (#{content_length}) does not match " \
                    "response body length (#{body_io.length})" if
      content_length and content_length != body_io.length
  end

  body_io
end
|
831
|
-
|
832
|
-
##
# Follows the redirect in +response+ according to @redirect_ok.
#
# +page+ is returned unchanged when redirects are disabled (false or nil),
# or when only permanent redirects are allowed and +response+ is not a
# Net::HTTPMovedPermanently.  Otherwise +page+ is pushed onto the history
# and the Location header, resolved against the page's URI, is fetched with
# HEAD if the original request was HEAD and with GET in every other case.
#
# Raises Mechanize::RedirectLimitReachedError when following the redirect
# would exceed the redirection limit.

def response_redirect response, method, page, redirects, referer = current_page
  follow = case @redirect_ok
           when false, nil then false
           when :permanent then Net::HTTPMovedPermanently === response
           else true
           end

  return page unless follow

  log.info("follow redirect to: #{response['Location']}") if log

  next_count = redirects + 1

  raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
    next_count > @redirection_limit

  redirect_method = if method == :head then :head else :get end

  origin = page.uri
  @history.push(page, origin)
  target = origin + response['Location'].to_s

  fetch target, redirect_method, {}, [], referer, next_count
end
|
855
|
-
|
856
|
-
# :section: Robots
|
857
|
-
|
858
|
-
# Fetches +uri+ and returns the body of the result.  A
# Mechanize::ResponseCodeError with response code '404' yields an empty
# string; any other response code error is re-raised.
def get_robots(uri) # :nodoc:
  begin
    robots_page = fetch(uri)
    robots_page.body
  rescue Mechanize::ResponseCodeError => error
    raise error unless error.response_code == '404'

    ''
  end
end
|
864
|
-
|
865
|
-
##
# Stores +value+ as the robots setting.  The webrobots library is required
# only when +value+ is true-ish, and the memoized WebRobots instance is
# dropped whenever the setting changes.

def robots=(value)
  require 'webrobots' if value

  @webrobots = nil unless value == @robots
  @robots = value
end
|
870
|
-
|
871
|
-
##
# Tests if this agent is allowed to access +uri+, consulting the site's
# robots.txt.  A request for /robots.txt itself is always allowed and does
# not consult webrobots.

def robots_allowed?(uri)
  robots_txt = uri.request_uri == '/robots.txt'

  robots_txt || webrobots.allowed?(uri)
end
|
880
|
-
|
881
|
-
# Opposite of robots_allowed?: true when the agent may not access +url+.

def robots_disallowed?(url)
  permitted = robots_allowed?(url)

  !permitted
end
|
886
|
-
|
887
|
-
# Returns an error object if there is an error in fetching or parsing
# robots.txt of the site +url+.  Delegates to the webrobots instance.
def robots_error(url)
  robots = webrobots
  robots.error(url)
end
|
892
|
-
|
893
|
-
# Raises the error if there is an error in fetching or parsing robots.txt of
# the site +url+.  Delegates to the webrobots instance.
def robots_error!(url)
  robots = webrobots
  robots.error!(url)
end
|
898
|
-
|
899
|
-
# Removes robots.txt cache for the site +url+.  Delegates to the webrobots
# instance.
def robots_reset(url)
  robots = webrobots
  robots.reset(url)
end
|
903
|
-
|
904
|
-
##
# Returns the memoized WebRobots instance, creating it on first use with the
# agent's user agent string and get_robots as its :http_get callback.

def webrobots
  return @webrobots if @webrobots

  fetcher = method(:get_robots)
  @webrobots = WebRobots.new(@user_agent, :http_get => fetcher)
end
|
907
|
-
|
908
|
-
# :section: SSL
|
909
|
-
|
910
|
-
##
# Returns the certificate held by the underlying HTTP connection object.

def certificate
  connection = @http
  connection.certificate
end
|
913
|
-
|
914
|
-
# :section: Timeouts
|
915
|
-
|
916
|
-
# Sets the connection idle timeout for persistent connections.  The value is
# remembered and also applied to the HTTP connection object when one exists.
def idle_timeout=(timeout)
  @idle_timeout = timeout
  return unless @http

  @http.idle_timeout = timeout
end
|
921
|
-
|
922
|
-
# :section: Utility
|
923
|
-
|
924
|
-
##
# Inflates the data read from +compressed+ (an IO-like object responding to
# eof? and read) into a Tempfile and returns that Tempfile.  The Tempfile is
# in binary mode and is not rewound.
#
# +window_bits+ is passed to Zlib::Inflate.new; use -Zlib::MAX_WBITS for a
# raw deflate stream.
#
# Errors raised while inflating (such as Zlib::DataError) propagate to the
# caller after the partially written Tempfile has been discarded.

def inflate compressed, window_bits = nil
  inflater = Zlib::Inflate.new window_bits
  out_io = Tempfile.new 'mechanize-decode'
  # Inflated bodies are arbitrary bytes; keep the Tempfile binary like the
  # one used for gzip bodies.
  out_io.binmode

  until compressed.eof? do
    out_io.write inflater.inflate compressed.read 1024
  end

  out_io.write inflater.finish

  out_io
rescue StandardError
  # The caller never sees this Tempfile, so close and unlink it right away.
  out_io.close! if out_io
  raise
ensure
  # Release the zlib stream whether or not inflation succeeded.
  inflater.close if inflater
end
|
936
|
-
|
937
|
-
##
# Returns the logger of the context this agent belongs to.

def log
  context = @context
  context.log
end
|
940
|
-
|
941
|
-
##
# Creates the Net::HTTP::Persistent connection (named 'mechanize', using
# @proxy_uri) and copies the agent's keep-alive, timeout, retry and SSL
# settings onto it.
#
# The verify mode and certificate store are read back from the connection
# afterwards so the agent's cached values match what the connection uses.
# When both @cert and @key are set, the client certificate and private key
# are installed; values that are not already OpenSSL objects are treated as
# file paths and loaded (the key as an RSA key, decrypted with @pass).

def set_http
  http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri
  @http = http

  http.keep_alive            = @keep_alive_time
  http.idle_timeout          = @idle_timeout if @idle_timeout
  http.retry_change_requests = @retry_change_requests

  http.ca_file         = @ca_file
  http.cert_store      = @cert_store if @cert_store
  http.verify_callback = @verify_callback
  http.verify_mode     = @verify_mode if @verify_mode

  # update our cached value
  @verify_mode = http.verify_mode
  @cert_store  = http.cert_store

  return unless @cert and @key

  certificate = @cert
  certificate = OpenSSL::X509::Certificate.new(::File.read(@cert)) unless
    OpenSSL::X509::Certificate === certificate

  private_key = @key
  private_key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass) unless
    OpenSSL::PKey::PKey === private_key

  http.certificate = certificate
  http.private_key = private_key
end
|
974
|
-
|
975
|
-
##
# Sets the proxy address, port, user, and password.  +addr+ should be a
# host, with no "http://".  +port+ may be a port number, service name or
# port number string.
#
# Returns the proxy URI, or nil (leaving the current proxy untouched) when
# +addr+ or +port+ is missing.  Raises ArgumentError when +port+ is neither a
# known service name nor convertible to an Integer.

def set_proxy(addr, port, user = nil, pass = nil)
  return unless addr and port

  port_number = port

  unless Integer === port_number then
    port_number = begin
                    Socket.getservbyname port
                  rescue SocketError
                    # Not a service name; try it as a port number string.
                    begin
                      Integer port
                    rescue ArgumentError
                      raise ArgumentError, "invalid value for port: #{port.inspect}"
                    end
                  end
  end

  proxy = @proxy_uri = URI "http://#{addr}"
  proxy.port     = port_number
  proxy.user     = user if user
  proxy.password = pass if pass

  proxy
end
|
1002
|
-
|
1003
|
-
end
|
1004
|
-
|