mechanize 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/README +15 -0
- data/examples/rubyforge.rb +13 -0
- data/lib/mechanize.rb +447 -0
- data/lib/mechanize/net-overrides/net/http.rb +2107 -0
- data/lib/mechanize/net-overrides/net/https.rb +171 -0
- data/lib/mechanize/net-overrides/net/protocol.rb +380 -0
- data/lib/mechanize/parsing.rb +200 -0
- data/mechanize.gemspec +22 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
== Dependencies
|
2
|
+
|
3
|
+
* ruby 1.8.2
|
4
|
+
|
5
|
+
Note that the files in the net-overrides/ directory are taken from Ruby 1.9.0.
|
6
|
+
|
7
|
+
* narf 0.6.3 (www.narf-lib.org)
|
8
|
+
|
9
|
+
After unpacking, install it with:
|
10
|
+
|
11
|
+
ruby install.rb config
|
12
|
+
ruby install.rb setup
|
13
|
+
ruby install.rb install
|
14
|
+
|
15
|
+
Note that only the ruby-htmltools package bundled with narf works.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Example: log in to RubyForge with WWW::Mechanize and print the resulting page.
#
# Usage: ruby rubyforge.rb LOGIN_NAME PASSWORD
$LOAD_PATH.unshift '../lib'
require 'mechanize'

login_name, password = ARGV[0], ARGV[1]

# Send the agent's log output to STDERR so the HTTP traffic can be followed.
agent = WWW::Mechanize.new do |mech|
  mech.log = Logger.new(STDERR)
end

# Open the start page and follow the "Log In" link.
home = agent.get('http://rubyforge.org/')
login_link = home.links.detect { |candidate| candidate.node.text =~ /Log In/ }
login_page = agent.click(login_link)

# The login form is the second form on the page.
login_form = login_page.forms[1]
login_form.fields.detect { |field| field.name == 'form_loginname' }.value = login_name
login_form.fields.detect { |field| field.name == 'form_pw' }.value = password

# Submit through the form's first submit button and dump the response body.
result = agent.submit(login_form, login_form.buttons.first)
puts result.body
|
data/lib/mechanize.rb
ADDED
@@ -0,0 +1,447 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
|
3
|
+
#
|
4
|
+
# This is a quick hack, to get something like Perl's WWW::Mechanize. Sure, we
|
5
|
+
# have Web::Unit, but, that does not work for me as expected, as it does not
|
6
|
+
# set cookies (I might be wrong), does not automatically redirect and has
|
7
|
+
# problems with some html documents.
|
8
|
+
|
9
|
+
# Version string of this mechanize release. Note that this constant is defined
# at the top level, not inside the WWW module.
Version = "0.1.0"
|
10
|
+
|
11
|
+
# required due to the missing get_fields method in Ruby 1.8.2
|
12
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
|
13
|
+
require 'net/http'
|
14
|
+
require 'net/https'
|
15
|
+
|
16
|
+
require 'web/htmltools/xmltree' # narf
|
17
|
+
require 'mechanize/parsing'
|
18
|
+
require 'uri'
|
19
|
+
require 'logger'
|
20
|
+
require 'webrick'
|
21
|
+
|
22
|
+
module WWW
|
23
|
+
|
24
|
+
# A named form input together with its current value.
class Field
  # <input> types that are represented as a Field.
  INPUT_TYPES = ['text', 'password', 'hidden', 'checkbox']

  attr_accessor :name, :value

  # name::  content of the input's "name" attribute
  # value:: content of the input's "value" attribute (may be nil)
  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Field objects, one for every <input> element yielded
  # by root_node.each_recursive whose type is one of INPUT_TYPES.
  #
  # An <input> without a "type" attribute is treated as a text input (the
  # HTML default) instead of failing with a NoMethodError on nil.
  def self.extract_all_from(root_node)
    fields = []
    root_node.each_recursive {|node|
      next unless node.name.downcase == 'input'

      type = (node.attributes['type'] || 'text').downcase
      if INPUT_TYPES.include?(type)
        fields << new(node.attributes['name'], node.attributes['value'])
      end
    }
    fields
  end
end
|
43
|
+
|
44
|
+
# A file-upload form input. It carries the input's name, the name of the file
# to upload and, separately, the file's content.
class FileUpload
  # file_name is the name of the file, file_data its content.
  attr_accessor :name, :file_name, :file_data

  # name::      name of the input
  # file_name:: name of the file (not its content)
  #
  # The file content starts out unset (nil).
  def initialize(name, file_name)
    @name = name
    @file_name = file_name
    @file_data = nil
  end
end
|
55
|
+
|
56
|
+
# A submit button of a form, with its name and value.
class Button
  attr_accessor :name, :value

  # name::  content of the button's "name" attribute
  # value:: content of the button's "value" attribute (may be nil)
  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Button objects, one for every <input type="submit">
  # element yielded by root_node.each_recursive.
  #
  # An <input> without a "type" attribute is skipped; previously it raised a
  # NoMethodError because nil was downcased.
  def self.extract_all_from(root_node)
    buttons = []
    root_node.each_recursive {|node|
      next unless node.name.downcase == 'input'

      type = node.attributes['type']
      if type && type.downcase == 'submit'
        buttons << new(node.attributes['name'], node.attributes['value'])
      end
    }
    buttons
  end
end
|
75
|
+
|
76
|
+
# Class Form does not work in the case there is some invalid (unbalanced) html
|
77
|
+
# involved, such as:
|
78
|
+
#
|
79
|
+
# <td>
|
80
|
+
# <form>
|
81
|
+
# </td>
|
82
|
+
# <td>
|
83
|
+
# <input .../>
|
84
|
+
# </form>
|
85
|
+
# </td>
|
86
|
+
#
|
87
|
+
# GlobalForm takes two nodes, the node where the form tag is located
|
88
|
+
# (form_node), and another node, from which to start looking for form elements
|
89
|
+
# (elements_node) like buttons and the like. For class Form both fall together
|
90
|
+
# into one and the same node.
|
91
|
+
|
92
|
+
# A form whose <form> tag (form_node) and whose input elements
# (elements_node) may live in different subtrees of the document.
class GlobalForm
  attr_reader :form_node, :elements_node
  attr_reader :method, :action, :name

  attr_reader :fields, :buttons, :file_uploads

  # form_node::     node of the <form> tag; its "method", "action" and "name"
  #                 attributes are read
  # elements_node:: node below which the form's inputs are searched
  def initialize(form_node, elements_node)
    @form_node, @elements_node = form_node, elements_node

    # NOTE(review): a missing "method" attribute falls back to POST here,
    # whereas HTML defines GET as the default -- confirm whether this is
    # intended before changing it.
    @method = (@form_node.attributes['method'] || 'POST').upcase
    @action = @form_node.attributes['action']
    @name = @form_node.attributes['name']

    parse
  end

  # (Re)builds #fields, #buttons and #file_uploads from the <input> elements
  # yielded by elements_node.each_recursive:
  #
  # * text, password, hidden, checkbox -> Field
  # * file                             -> FileUpload
  # * submit                           -> Button
  #
  # An <input> without a "type" attribute is treated as a text input (the
  # HTML default) instead of failing with a NoMethodError on nil. Inputs of
  # any other type are ignored.
  def parse
    @fields = []
    @buttons = []
    @file_uploads = []

    @elements_node.each_recursive {|node|
      next unless node.name.downcase == 'input'

      input_name = node.attributes['name']
      input_value = node.attributes['value']

      case (node.attributes['type'] || 'text').downcase
      when 'text', 'password', 'hidden', 'checkbox'
        @fields << Field.new(input_name, input_value)
      when 'file'
        @file_uploads << FileUpload.new(input_name, input_value)
      when 'submit'
        @buttons << Button.new(input_name, input_value)
      end
    }
  end

end
|
128
|
+
|
129
|
+
# A GlobalForm for which a single node serves both as the <form> node and as
# the node below which the form's elements are searched.
class Form < GlobalForm
  attr_reader :node

  # node:: the <form> node; it is handed to GlobalForm as both form_node and
  #        elements_node
  def initialize(node)
    @node = node
    super(node, node)
  end
end
|
137
|
+
|
138
|
+
# A hyperlink: the underlying node plus the value of its "href" attribute.
class Link
  attr_reader :node, :href

  # node:: the link's node; its "href" attribute is read once, here
  def initialize(node)
    @href = node.attributes['href']
    @node = node
  end
end
|
147
|
+
|
148
|
+
# A fetched page: its URI, the HTTP response, body and status code, plus the
# forms, links and "watched" elements that are lazily parsed out of the body.
class Page
  attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set

  def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil)
    @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code
  end

  # The header object of the HTTP response.
  def header
    @response.header
  end

  # The raw Content-Type header value (including any parameters such as the
  # charset).
  def content_type
    header['Content-Type']
  end

  # Array of Form objects, one per <form> element. Parses the body on first
  # use; raises "no html" if the page is not text/html.
  def forms
    parse_html() unless @forms
    @forms
  end

  # Array of Link objects, one per <a> element. Parses the body on first use;
  # raises "no html" if the page is not text/html.
  def links
    parse_html() unless @links
    @links
  end

  # The parsed document. Parses the body on first use; raises "no html" if
  # the page is not text/html.
  def root
    parse_html() unless @root
    @root
  end

  # Hash mapping each element name listed in watch_for_set to the array of
  # matching elements found in the document. If watch_for_set maps the name
  # to a class, each node is wrapped with klass.new(node); otherwise the node
  # itself is stored.
  def watches
    parse_html() unless @watches
    @watches
  end

  private

  # True if the media type of the Content-Type header is text/html. Header
  # parameters (e.g. "; charset=UTF-8") and letter case are ignored, so that
  # "text/html; charset=UTF-8" is accepted as HTML.
  def html_content?
    media_type = content_type().to_s.split(';').first
    media_type.to_s.strip.downcase == 'text/html'
  end

  # Feeds the body to a fresh HTMLTree::XMLParser and returns the resulting
  # document.
  def parse_document
    parser = HTMLTree::XMLParser.new
    begin
      parser.feed(@body)
    rescue => ex
      raise unless ex.message =~ /attempted adding second root element to document/

      # Put the whole document inside a single root element, which I simply name
      # <root>, just to make the parser happy. It's no longer valid HTML, but
      # without a single root element, it's not valid HTML as well.

      # TODO: leave a possible doctype definition outside this element.
      parser = HTMLTree::XMLParser.new
      parser.feed("<root>" + @body + "</root>")
    end
    parser.document
  end

  # Parses the body and fills @root, @forms, @links and @watches.
  # Raises "no html" unless the page's media type is text/html.
  def parse_html
    raise "no html" unless html_content?

    @root = parse_document

    @forms = []
    @links = []
    @watches = {}

    @root.each_recursive {|node|
      name = node.name.downcase

      case name
      when 'form'
        @forms << Form.new(node)
      when 'a'
        @links << Link.new(node)
      else
        if @watch_for_set and @watch_for_set.key?(name)
          @watches[name] ||= []
          klass = @watch_for_set[name]
          @watches[name] << (klass ? klass.new(node) : node)
        end
      end
    }
  end
end
|
230
|
+
|
231
|
+
# A simple stateful web agent: it fetches pages via GET/POST, follows
# redirects, remembers received cookies and keeps a history of visited pages
# that is used to resolve relative URLs and to send a Referer header.
class Mechanize

  # Predefined User-Agent strings, selectable via #user_agent_alias=.
  AGENT_ALIASES = {
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
  }

  attr_accessor :log
  attr_accessor :user_agent
  attr_accessor :cookies
  attr_accessor :open_timeout, :read_timeout
  attr_accessor :watch_for_set
  attr_accessor :max_history

  # Creates an agent with empty history and cookies and a logger that
  # discards everything. If a block is given, the new agent is yielded to it
  # for configuration.
  def initialize
    @history = []
    @cookies = []
    @log = Logger.new(nil)
    yield self if block_given?
  end

  # Sets the User-Agent header to one of the strings in AGENT_ALIASES.
  # Raises a RuntimeError ("unknown agent alias") for an unknown alias.
  def user_agent_alias=(al)
    self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
  end

  # Fetches +url+ (String or URI, absolute or relative to the current page)
  # with a GET request, appends the resulting Page to the history and
  # returns it.
  def get(url)
    cur_page = current_page() || Page.new

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :get, cur_page)
    add_to_history(page)
    page
  end

  # Sends +query+ (a Hash of name => value) as an url-encoded form to +url+
  # with a POST request, appends the resulting Page to the history and
  # returns it.
  def post(url, query={})
    cur_page = current_page() || Page.new

    request_data = [build_query_string(query)]

    # this is called before the request is sent
    pre_request_hook = proc {|request|
      log.debug("query: #{ query.inspect }")
      request.add_header('Content-Type', 'application/x-www-form-urlencoded')
      request.add_header('Content-Length', request_data[0].size.to_s)
    }

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
    add_to_history(page)
    page
  end

  # Follows +link+ (an object responding to #href) with a GET request.
  def click(link)
    uri = to_absolute_uri(link.href)
    get(uri)
  end

  # Submits +form+ using the form's method (POST or GET). Every field is
  # sent with its value (nil values as ""); if +button+ is given, its
  # name/value pair is sent as well. Raises 'unsupported method' for any
  # other form method.
  def submit(form, button=nil)
    query = {}
    form.fields.each do |f|
      query[f.name] = f.value || ""
    end
    query[button.name] = button.value || "" if button

    uri = to_absolute_uri(form.action)
    case form.method.upcase
    when 'POST'
      post(uri, query)
    when 'GET'
      get(uri + "?" + build_query_string(query))
    else
      raise 'unsupported method'
    end
  end

  # The most recently fetched page, or nil if nothing was fetched yet.
  def current_page
    @history.last
  end

  alias page current_page

  private

  # Converts +url+ (String or URI) into an absolute URI. A relative URL is
  # resolved against the URI of +cur_page+.
  #
  # Raises 'no history. please specify an absolute URL' if the URL is
  # relative and there is no page, or only a page without a URI, to resolve
  # it against. (#get and #post pass an empty Page when the history is
  # empty; without the check on cur_page.uri this ended in a NoMethodError
  # on nil instead of the intended message.)
  def to_absolute_uri(url, cur_page=current_page())
    if url.is_a?(URI)
      uri = url
    else
      uri = URI.parse(url)
    end

    # construct an absolute uri
    if uri.relative?
      if cur_page and cur_page.uri
        uri = cur_page.uri + url
      else
        raise 'no history. please specify an absolute URL'
      end
    end

    return uri
  end

  # Performs the HTTP request and returns the resulting Page.
  #
  # uri::              an absolute http or https URI
  # method::           :get or :post (anything else raises ArgumentError)
  # cur_page::         page whose URI (if any) is sent as Referer
  # pre_request_hook:: optional proc called with the request object just
  #                    before it is sent (to add headers or content)
  # request_data::     extra arguments passed on to Net::HTTP#request
  #                    (the request body for POST)
  #
  # Redirects (301, 302, 303) are followed with a GET request. Any status
  # other than 200 or a redirect raises a RuntimeError naming the status.
  #
  # NOTE: cookies are kept as plain "name=value" strings and are sent with
  # every request, whatever the host.
  def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[])
    raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

    log.info("#{ method.to_s.upcase }: #{ uri.to_s }")

    page = Page.new(uri)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    # Specify timeouts if given. This has to happen before the connection is
    # started, otherwise open_timeout cannot take effect.

    http.open_timeout = @open_timeout if @open_timeout
    http.read_timeout = @read_timeout if @read_timeout

    http.start {

      case method
      when :get
        request = Net::HTTP::Get.new(uri.request_uri)
      when :post
        request = Net::HTTP::Post.new(uri.request_uri)
      else
        raise ArgumentError
      end

      unless @cookies.empty?
        cookie = @cookies.uniq.join("; ")
        log.debug("use cookie: #{ cookie }")
        request.add_header('Cookie', cookie)
      end

      # Add Referer header to request

      if cur_page and cur_page.uri
        request.add_header('Referer', cur_page.uri.to_s)
      end

      # Add User-Agent header to request

      request.add_header('User-Agent', @user_agent) if @user_agent

      # Invoke pre-request-hook (use it to add custom headers or content)

      pre_request_hook.call(request) if pre_request_hook

      # Log specified headers for the request

      request.each_header do |k, v|
        log.debug("request-header: #{ k } => #{ v }")
      end

      # Send the request

      http.request(request, *request_data) {|response|

        # TODO: expire/validate cookies
        (response.get_fields('Set-Cookie')||[]).each do |cookie|
          log.debug("cookie received: #{ cookie }")
          # keep only the leading "name=value" part of the cookie
          @cookies << cookie.split(";").first.strip
        end

        response.each_header {|k,v|
          log.debug("header: #{ k } : #{ v }")
        }

        page.response = response
        page.code = response.code

        response.read_body
        page.body = response.body

        log.info("status: #{ page.code }")

        page.watch_for_set = @watch_for_set

        case page.code
        when "200"
          return page
        when "301", "302", "303"
          log.info("follow redirect to: #{ response.header['Location'] }")
          return fetch_page(to_absolute_uri(response.header['Location'], page), :get, page)
        else
          # A bare `raise` here (outside of any rescue) only produced an
          # uninformative "unhandled exception" RuntimeError.
          raise "unexpected response code: #{ page.code }"
        end
      }
    }
  end

  # Encodes +hash+ as an application/x-www-form-urlencoded string
  # ("k1=v1&k2=v2"), escaping keys and values.
  def build_query_string(hash)
    vals = []
    hash.each_pair {|k,v|
      vals <<
      [WEBrick::HTTPUtils.escape_form(k),
       WEBrick::HTTPUtils.escape_form(v)].join("=")
    }

    vals.join("&")
  end

  # Appends +page+ to the history. If max_history is set and the history has
  # grown beyond it, only the newest max_history pages are kept.
  def add_to_history(page)
    @history.push(page)
    if @max_history and @history.size > @max_history
      # keep only the last @max_history entries
      @history = @history[@history.size - @max_history, @max_history]
    end
  end

end
|
446
|
+
|
447
|
+
end # module WWW
|