mechanize 0.1.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

data/README ADDED
@@ -0,0 +1,15 @@
1
+ == Dependencies
2
+
3
+ * ruby 1.8.2
4
+
5
+ Note that the files in the net-overrides/ directory are taken from Ruby 1.9.0.
6
+
7
+ * narf 0.6.3 (www.narf-lib.org)
8
+
9
+ After unpacking, install it with:
10
+
11
+ ruby install.rb config
12
+ ruby install.rb setup
13
+ ruby install.rb install
14
+
15
+ Note that only the ruby-htmltools package bundled with narf works.
@@ -0,0 +1,13 @@
1
# Example: log in to rubyforge.org and print the resulting page.
#
# Usage: ruby <this script> LOGIN PASSWORD

# Resolve lib/ relative to this file so the example works from any directory.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'mechanize'

login, password = ARGV
abort "usage: #{ $0 } LOGIN PASSWORD" unless login and password

agent = WWW::Mechanize.new {|a| a.log = Logger.new(STDERR) }
page = agent.get('http://rubyforge.org/')

link = page.links.find {|l| l.node.text =~ /Log In/ }
abort "no 'Log In' link found on #{ page.uri }" unless link
page = agent.click(link)

# The login form is expected to be the second form on the page.
form = page.forms[1]
abort "login form not found on #{ page.uri }" unless form

# Fill in the credentials, failing loudly if the page layout has changed.
{ 'form_loginname' => login, 'form_pw' => password }.each do |field_name, value|
  field = form.fields.find {|f| f.name == field_name }
  abort "field '#{ field_name }' not found in login form" unless field
  field.value = value
end

page = agent.submit(form, form.buttons.first)

puts page.body
data/lib/mechanize.rb ADDED
@@ -0,0 +1,447 @@
1
+ #
2
+ # Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
3
+ #
4
+ # This is a quick hack, to get something like Perl's WWW::Mechanize. Sure, we
5
+ # have Web::Unit, but, that does not work for me as expected, as it does not
6
+ # set cookies (I might be wrong), does not automatically redirect and has
7
+ # problems with some html documents.
8
+
9
+ Version = "0.1.0"
10
+
11
+ # required due to the missing get_fields method in Ruby 1.8.2
12
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
13
+ require 'net/http'
14
+ require 'net/https'
15
+
16
+ require 'web/htmltools/xmltree' # narf
17
+ require 'mechanize/parsing'
18
+ require 'uri'
19
+ require 'logger'
20
+ require 'webrick'
21
+
22
+ module WWW
23
+
24
# A named form input (text, password, hidden or checkbox) together with the
# value that will be submitted for it.
class Field
  # <input> types that are represented as a Field.
  INPUT_TYPES = ['text', 'password', 'hidden', 'checkbox'].freeze

  attr_accessor :name, :value

  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Field objects, one for every matching <input>
  # element found below +root_node+.
  #
  # An <input> without a type attribute is treated as a text field, which is
  # the HTML default; previously such an element raised NoMethodError on nil.
  def self.extract_all_from(root_node)
    fields = []
    root_node.each_recursive do |node|
      next unless node.name.downcase == 'input'

      type = (node.attributes['type'] || 'text').downcase
      next unless INPUT_TYPES.include?(type)

      fields << new(node.attributes['name'], node.attributes['value'])
    end
    fields
  end
end
43
+
44
# A file-upload control of a form.
#
# +name+ is the control name, +file_name+ is the name of the file (not its
# content) and +file_data+ holds the content; it starts out as nil.
class FileUpload
  attr_accessor :name, :file_name, :file_data

  def initialize(name, file_name)
    @name = name
    @file_name = file_name
    @file_data = nil
  end
end
55
+
56
# A submit button of a form, identified by its name and value.
class Button
  attr_accessor :name, :value

  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Button objects, one for every <input type="submit">
  # element found below +root_node+.
  #
  # An <input> without a type attribute is not a submit button and is skipped;
  # previously such an element raised NoMethodError on nil.
  def self.extract_all_from(root_node)
    buttons = []
    root_node.each_recursive do |node|
      next unless node.name.downcase == 'input'
      next unless (node.attributes['type'] || '').downcase == 'submit'

      buttons << new(node.attributes['name'], node.attributes['value'])
    end
    buttons
  end
end
75
+
76
+ # Class Form does not work in the case there is some invalid (unbalanced) html
77
+ # involved, such as:
78
+ #
79
+ # <td>
80
+ # <form>
81
+ # </td>
82
+ # <td>
83
+ # <input .../>
84
+ # </form>
85
+ # </td>
86
+ #
87
+ # GlobalForm takes two nodes, the node where the form tag is located
88
+ # (form_node), and another node, from which to start looking for form elements
89
+ # (elements_node) like buttons and the like. For class Form both fall together
90
+ # into one and the same node.
91
+
92
# A form whose <form> tag (form_node) and whose controls (searched below
# elements_node) may live in different parts of the document tree.
class GlobalForm
  attr_reader :form_node, :elements_node
  attr_reader :method, :action, :name

  attr_reader :fields, :buttons, :file_uploads

  # form_node::     node carrying the method/action/name attributes
  # elements_node:: node below which input elements are collected
  def initialize(form_node, elements_node)
    @form_node, @elements_node = form_node, elements_node

    # NOTE(review): a missing method attribute falls back to POST here, while
    # HTML itself defaults to GET -- confirm whether this is intended.
    @method = (@form_node.attributes['method'] || 'POST').upcase
    @action = @form_node.attributes['action']
    @name   = @form_node.attributes['name']

    parse
  end

  # (Re)builds #fields, #buttons and #file_uploads from the <input> elements
  # found below elements_node.
  #
  # An <input> without a type attribute is treated as a text field (the HTML
  # default); previously it raised NoMethodError on nil.
  def parse
    @fields = []
    @buttons = []
    @file_uploads = []

    @elements_node.each_recursive do |node|
      next unless node.name.downcase == 'input'

      attrs = node.attributes
      case (attrs['type'] || 'text').downcase
      when 'text', 'password', 'hidden', 'checkbox'
        @fields << Field.new(attrs['name'], attrs['value'])
      when 'file'
        @file_uploads << FileUpload.new(attrs['name'], attrs['value'])
      when 'submit'
        @buttons << Button.new(attrs['name'], attrs['value'])
      end
    end
  end

end
128
+
129
# A well-formed form: the <form> node itself is also the node below which
# the form's controls are looked up.
class Form < GlobalForm
  attr_reader :node

  def initialize(node)
    super(node, node)
    @node = node
  end
end
137
+
138
# A hyperlink: keeps the originating node and the value of its href
# attribute.
class Link
  attr_reader :node, :href

  def initialize(node)
    @href = node.attributes['href']
    @node = node
  end
end
147
+
148
# A fetched document: its URI, the HTTP response, body and status code, plus
# forms, links and watched elements that are parsed from the body on first
# access.
class Page
  attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set

  def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil)
    @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code
  end

  # The header object of the underlying response.
  def header
    @response.header
  end

  # The raw Content-Type header value (may include parameters such as
  # "; charset=...").
  def content_type
    header['Content-Type']
  end

  # Array of Form objects, one per <form> element.
  def forms
    parse_html() unless @forms
    @forms
  end

  # Array of Link objects, one per <a> element.
  def links
    parse_html() unless @links
    @links
  end

  # The parsed document.
  def root
    parse_html() unless @root
    @root
  end

  # Hash mapping each watched (lower-case) tag name to the elements collected
  # for it; see #watch_for_set.
  def watches
    parse_html() unless @watches
    @watches
  end

  private

  # True if the media type of the response is text/html. Parameters after
  # ";" and letter case are ignored, so "text/html; charset=utf-8" counts as
  # HTML too.
  def html?
    content_type().to_s.split(';').first.to_s.strip.downcase == 'text/html'
  end

  # Parses the body and fills @root, @forms, @links and @watches.
  #
  # Raises RuntimeError ("no html") if the response is not text/html, and
  # re-raises any parser error other than the "second root element" one.
  def parse_html
    raise "no html" unless html?

    # construct parser and feed with HTML
    parser = HTMLTree::XMLParser.new
    begin
      parser.feed(@body)
    rescue => ex
      raise unless ex.message =~ /attempted adding second root element to document/

      # Put the whole document inside a single root element, which is simply
      # named <root>, just to make the parser happy. It's no longer valid
      # HTML, but without a single root element it was not valid HTML either.

      # TODO: leave a possible doctype definition outside this element.
      parser = HTMLTree::XMLParser.new
      parser.feed("<root>" + @body + "</root>")
    end

    @root = parser.document

    @forms = []
    @links = []
    @watches = {}

    @root.each_recursive do |node|
      name = node.name.downcase

      case name
      when 'form'
        @forms << Form.new(node)
      when 'a'
        @links << Link.new(node)
      else
        if @watch_for_set and @watch_for_set.has_key?(name)
          # A nil class in the watch set means "collect the raw node".
          klass = @watch_for_set[name]
          (@watches[name] ||= []) << (klass ? klass.new(node) : node)
        end
      end
    end
  end
end
230
+
231
# A simple stateful HTTP agent: fetches pages, remembers cookies, follows
# 302 redirects, keeps a history of visited pages and can click links and
# submit forms.
class Mechanize

  # Human-readable aliases for common User-Agent strings; see
  # #user_agent_alias=.
  AGENT_ALIASES = {
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
  }

  attr_accessor :log
  attr_accessor :user_agent
  attr_accessor :cookies
  attr_accessor :open_timeout, :read_timeout
  attr_accessor :watch_for_set
  attr_accessor :max_history

  # Creates an agent with empty history and cookies and a logger that
  # discards everything. Yields the new agent if a block is given.
  def initialize
    @history = []
    @cookies = []
    @log = Logger.new(nil)
    yield self if block_given?
  end

  # Sets the User-Agent from one of the AGENT_ALIASES keys.
  # Raises RuntimeError for an unknown alias.
  def user_agent_alias=(al)
    self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
  end

  # Fetches +url+ (String or URI; may be relative to the current page) with
  # GET, records the resulting page in the history and returns it.
  def get(url)
    cur_page = current_page() || Page.new

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :get, cur_page)
    add_to_history(page)
    page
  end

  # Sends +query+ (a Hash of name => value) form-urlencoded to +url+ with
  # POST, records the resulting page in the history and returns it.
  def post(url, query={})
    cur_page = current_page() || Page.new

    request_data = [build_query_string(query)]

    # this is called before the request is sent
    pre_request_hook = proc {|request|
      log.debug("query: #{ query.inspect }")
      request.add_header('Content-Type', 'application/x-www-form-urlencoded')
      request.add_header('Content-Length', request_data[0].size.to_s)
    }

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
    add_to_history(page)
    page
  end

  # Follows +link+ (anything responding to #href) with a GET request.
  def click(link)
    uri = to_absolute_uri(link.href)
    get(uri)
  end

  # Submits +form+, optionally including +button+ in the submitted data.
  #
  # Controls without a name are left out, as they carry no key to submit
  # under. For GET forms the encoded data becomes the query component of the
  # action URI. (It used to be appended with two successive URI merges, the
  # second of which replaced the last path segment instead of filling in the
  # query.)
  #
  # Raises RuntimeError ('unsupported method') for anything but GET or POST.
  def submit(form, button=nil)
    query = {}
    form.fields.each do |f|
      next unless f.name
      query[f.name] = f.value || ""
    end
    query[button.name] = button.value || "" if button and button.name

    uri = to_absolute_uri(form.action)
    case form.method.upcase
    when 'POST'
      post(uri, query)
    when 'GET'
      target = uri.dup
      target.query = build_query_string(query)
      get(target)
    else
      raise 'unsupported method'
    end
  end

  # The most recently fetched page, or nil if nothing was fetched yet.
  def current_page
    @history.last
  end

  alias page current_page

  private

  # Turns +url+ (String or URI) into an absolute URI, resolving a relative
  # one against the uri of +cur_page+.
  #
  # Raises RuntimeError if +url+ is relative and there is no page, or the
  # page has no uri, to resolve it against.
  def to_absolute_uri(url, cur_page=current_page())
    uri = url.is_a?(URI) ? url : URI.parse(url)

    # construct an absolute uri
    if uri.relative?
      unless cur_page and cur_page.uri
        raise 'no history. please specify an absolute URL'
      end
      uri = cur_page.uri + url
    end

    return uri
  end

  # Performs the request and returns the resulting Page.
  #
  # uri::              an absolute http or https URI
  # method::           :get or :post
  # cur_page::         page whose uri is sent as Referer (if it has one)
  # pre_request_hook:: called with the request object before it is sent
  # request_data::     extra arguments handed to Net::HTTP#request (the body)
  #
  # A 302 response is followed with a GET request. Raises RuntimeError for an
  # unsupported scheme or any status other than 200/302, and ArgumentError
  # for an unknown +method+.
  def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[])
    raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

    log.info("#{ method.to_s.upcase }: #{ uri.to_s }")

    page = Page.new(uri)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    # Specify timeouts if given. This has to happen before the connection is
    # opened, otherwise open_timeout is never applied.

    http.open_timeout = @open_timeout if @open_timeout
    http.read_timeout = @read_timeout if @read_timeout

    http.start {

      case method
      when :get
        request = Net::HTTP::Get.new(uri.request_uri)
      when :post
        request = Net::HTTP::Post.new(uri.request_uri)
      else
        raise ArgumentError
      end

      unless @cookies.empty?
        cookie = @cookies.uniq.join("; ")
        log.debug("use cookie: #{ cookie }")
        request.add_header('Cookie', cookie)
      end

      # Add Referer header to request

      if cur_page and cur_page.uri
        request.add_header('Referer', cur_page.uri.to_s)
      end

      # Add User-Agent header to request

      request.add_header('User-Agent', @user_agent) if @user_agent

      # Invoke pre-request-hook (use it to add custom headers or content)

      pre_request_hook.call(request) if pre_request_hook

      # Log specified headers for the request

      request.each_header do |k, v|
        log.debug("request-header: #{ k } => #{ v }")
      end

      # Send the request

      http.request(request, *request_data) {|response|

        # TODO: expire/validate cookies
        (response.get_fields('Set-Cookie')||[]).each do |cookie|
          log.debug("cookie received: #{ cookie }")
          remember_cookie(cookie)
        end

        response.each_header {|k,v|
          log.debug("header: #{ k } : #{ v }")
        }

        page.response = response
        page.code = response.code

        response.read_body
        page.body = response.body

        log.info("status: #{ page.code }")

        page.watch_for_set = @watch_for_set

        case page.code
        when "200"
          return page
        when "302"
          location = response.header['Location']
          raise "redirect without Location header" unless location
          log.info("follow redirect to: #{ location }")
          return fetch_page(to_absolute_uri(location, page), :get, page)
        else
          # A bare "raise" here had no active exception to re-raise and only
          # produced an anonymous RuntimeError; say what went wrong instead.
          raise "unexpected response status: #{ page.code }"
        end
      }
    }
  end

  # Stores the "name=value" part of a Set-Cookie header value, replacing any
  # previously stored cookie of the same name so that a stale value is not
  # sent along with the new one.
  def remember_cookie(raw)
    pair = raw.split(";").first.to_s.strip
    return if pair.empty?

    name = pair.split("=", 2).first
    @cookies.reject! {|c| c.split("=", 2).first == name }
    @cookies << pair
  end

  # Encodes +hash+ as an application/x-www-form-urlencoded string.
  def build_query_string(hash)
    vals = []
    hash.each_pair {|k,v|
      vals <<
      [WEBrick::HTTPUtils.escape_form(k.to_s),
       WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
    }

    vals.join("&")
  end

  # Appends +page+ to the history and, if max_history is set, drops the
  # oldest entries beyond that limit.
  def add_to_history(page)
    @history.push(page)
    if @max_history and @history.size > @max_history
      # keep only the last @max_history entries
      @history = @history[@history.size - @max_history, @max_history]
    end
  end

end
446
+
447
+ end # module WWW