mechanize 0.4.7 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/CHANGELOG +17 -0
- data/EXAMPLES +23 -44
- data/NOTES +49 -0
- data/lib/mechanize.rb +95 -80
- data/lib/mechanize/cookie.rb +147 -148
- data/lib/mechanize/cookie.rb.rej +16 -0
- data/lib/mechanize/errors.rb +29 -0
- data/lib/mechanize/form.rb +211 -186
- data/lib/mechanize/form_elements.rb +31 -71
- data/lib/mechanize/list.rb +34 -0
- data/lib/mechanize/mech_version.rb +3 -1
- data/lib/mechanize/module.rb +1 -1
- data/lib/mechanize/page.rb +162 -180
- data/lib/mechanize/page_elements.rb +53 -40
- data/lib/mechanize/parsing.rb +11 -3
- data/lib/mechanize/pluggable_parsers.rb +147 -0
- data/test/data/server.crt +14 -0
- data/test/data/server.csr +11 -0
- data/test/data/server.key +18 -0
- data/test/data/server.pem +15 -0
- data/test/htdocs/no_title_test.html +6 -0
- data/test/parse.rb +39 -0
- data/test/proxy.rb +30 -0
- data/test/server.rb +2 -0
- data/test/servlets.rb +8 -0
- data/test/ssl_server.rb +49 -0
- data/test/tc_authenticate.rb +8 -6
- data/test/tc_cookie_class.rb +28 -18
- data/test/tc_cookie_jar.rb +88 -27
- data/test/tc_cookies.rb +41 -44
- data/test/tc_errors.rb +9 -23
- data/test/tc_forms.rb +36 -32
- data/test/tc_frames.rb +6 -4
- data/test/tc_links.rb +7 -6
- data/test/tc_mech.rb +43 -46
- data/test/tc_page.rb +24 -0
- data/test/tc_pluggable_parser.rb +103 -0
- data/test/tc_post_form.rb +41 -0
- data/test/tc_proxy.rb +25 -0
- data/test/tc_response_code.rb +13 -10
- data/test/tc_save_file.rb +25 -0
- data/test/tc_ssl_server.rb +27 -0
- data/test/tc_upload.rb +8 -6
- data/test/tc_watches.rb +5 -2
- data/test/test_includes.rb +3 -3
- data/test/ts_mech.rb +11 -2
- metadata +100 -86
- data/test/tc_filter.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
module WWW
|
2
|
+
class Mechanize
|
2
3
|
# This class represents a field in a form. It handles the following input
|
3
4
|
# tags found in a form:
|
4
5
|
# text, password, hidden, int, textarea
|
@@ -12,22 +13,8 @@ module WWW
|
|
12
13
|
@name, @value = name, value
|
13
14
|
end
|
14
15
|
|
15
|
-
# Returns an array of Field objects
|
16
|
-
# TODO: is this correct?
|
17
|
-
def self.extract_all_from(root_node)
|
18
|
-
fields = []
|
19
|
-
root_node.each_recursive {|node|
|
20
|
-
if (node.name.downcase == 'input' and
|
21
|
-
%w(text password hidden checkbox radio int).include?(node.attributes['type'].downcase)) or
|
22
|
-
%w(textarea option).include?(node.name.downcase)
|
23
|
-
fields << Field.new(node.attributes['name'], node.attributes['value'])
|
24
|
-
end
|
25
|
-
}
|
26
|
-
return fields
|
27
|
-
end
|
28
|
-
|
29
16
|
def inspect
|
30
|
-
"#{name} = #{@value}
|
17
|
+
"#{name} = #{@value}"
|
31
18
|
end
|
32
19
|
end
|
33
20
|
|
@@ -36,45 +23,26 @@ module WWW
|
|
36
23
|
# to upload and WWW::FileUpload#mime_type= to the appropriate mime type
|
37
24
|
# of the file.
|
38
25
|
# See the example in EXAMPLES[link://files/EXAMPLES.html]
|
39
|
-
class FileUpload
|
40
|
-
|
41
|
-
attr_accessor :name
|
26
|
+
class FileUpload < Field
|
27
|
+
attr_accessor :name # Field name
|
28
|
+
attr_accessor :file_name # File name
|
29
|
+
attr_accessor :mime_type # Mime Type (Optional)
|
42
30
|
|
43
|
-
|
31
|
+
alias :file_data :value
|
32
|
+
alias :file_data= :value=
|
44
33
|
|
45
34
|
def initialize(name, file_name)
|
46
|
-
@
|
35
|
+
@file_name = file_name
|
47
36
|
@file_data = nil
|
37
|
+
super(name, @file_data)
|
48
38
|
end
|
49
39
|
end
|
50
40
|
|
51
41
|
# This class represents a Submit button in a form.
|
52
|
-
class Button
|
53
|
-
attr_accessor :name, :value
|
54
|
-
|
55
|
-
def initialize(name, value)
|
56
|
-
@name, @value = name, value
|
57
|
-
end
|
58
|
-
|
42
|
+
class Button < Field
|
59
43
|
def add_to_query(query)
|
60
44
|
query << [@name, @value || ''] if @name
|
61
45
|
end
|
62
|
-
|
63
|
-
# Returns an array of Button objects
|
64
|
-
def self.extract_all_from(root_node)
|
65
|
-
buttons = []
|
66
|
-
root_node.each_recursive {|node|
|
67
|
-
if node.name.downcase == 'input' and
|
68
|
-
['submit'].include?(node.attributes['type'].downcase)
|
69
|
-
buttons << Button.new(node.attributes['name'], node.attributes['value'])
|
70
|
-
end
|
71
|
-
}
|
72
|
-
return buttons
|
73
|
-
end
|
74
|
-
|
75
|
-
def inspect
|
76
|
-
"#{name} = #{@value}\n"
|
77
|
-
end
|
78
46
|
end
|
79
47
|
|
80
48
|
# This class represents an image button in a form. Use the x and y methods
|
@@ -82,6 +50,12 @@ module WWW
|
|
82
50
|
class ImageButton < Button
|
83
51
|
attr_accessor :x, :y
|
84
52
|
|
53
|
+
def initialize(name, value)
|
54
|
+
@x = nil
|
55
|
+
@y = nil
|
56
|
+
super(name, value)
|
57
|
+
end
|
58
|
+
|
85
59
|
def add_to_query(query)
|
86
60
|
if @name
|
87
61
|
query << [@name, @value || '']
|
@@ -93,30 +67,18 @@ module WWW
|
|
93
67
|
|
94
68
|
# This class represents a radio button found in a Form. To activate the
|
95
69
|
# RadioButton in the Form, set the checked method to true.
|
96
|
-
class RadioButton
|
97
|
-
attr_accessor :
|
70
|
+
class RadioButton < Field
|
71
|
+
attr_accessor :checked
|
98
72
|
|
99
73
|
def initialize(name, value, checked)
|
100
|
-
@
|
101
|
-
|
102
|
-
|
103
|
-
def inspect
|
104
|
-
"#{name} = #{@value}\n"
|
74
|
+
@checked = checked
|
75
|
+
super(name, value)
|
105
76
|
end
|
106
77
|
end
|
107
78
|
|
108
79
|
# This class represents a check box found in a Form. To activate the
|
109
80
|
# CheckBox in the Form, set the checked method to true.
|
110
|
-
class CheckBox
|
111
|
-
attr_accessor :name, :value, :checked
|
112
|
-
|
113
|
-
def initialize(name, value, checked)
|
114
|
-
@name, @value, @checked = name, value, checked
|
115
|
-
end
|
116
|
-
|
117
|
-
def inspect
|
118
|
-
"#{name} = #{@value}\n"
|
119
|
-
end
|
81
|
+
class CheckBox < RadioButton
|
120
82
|
end
|
121
83
|
|
122
84
|
# This class represents a select list or drop down box in a Form. Set the
|
@@ -124,13 +86,11 @@ module WWW
|
|
124
86
|
# list of Option that were found. After finding the correct option, set
|
125
87
|
# the select lists value to the option value:
|
126
88
|
# selectlist.value = selectlist.options.first.value
|
127
|
-
class SelectList
|
128
|
-
attr_accessor :
|
129
|
-
attr_reader :value
|
89
|
+
class SelectList < Field
|
90
|
+
attr_accessor :options
|
130
91
|
|
131
92
|
def initialize(name, node)
|
132
|
-
|
133
|
-
@value = nil
|
93
|
+
value = nil
|
134
94
|
@options = WWW::Mechanize::List.new
|
135
95
|
|
136
96
|
# parse
|
@@ -138,19 +98,18 @@ module WWW
|
|
138
98
|
if n.name.downcase == 'option'
|
139
99
|
option = Option.new(n)
|
140
100
|
@options << option
|
141
|
-
|
101
|
+
value = option.value if option.selected
|
142
102
|
end
|
143
103
|
}
|
144
|
-
|
104
|
+
value = @options.first.value if (value == nil && @options.first)
|
105
|
+
super(name, value)
|
145
106
|
end
|
146
107
|
|
108
|
+
alias :old_value= :value=
|
109
|
+
|
147
110
|
def value=(value)
|
148
111
|
@value = value.to_s
|
149
112
|
end
|
150
|
-
|
151
|
-
def inspect
|
152
|
-
"#{name} = #{@value}\n"
|
153
|
-
end
|
154
113
|
end
|
155
114
|
|
156
115
|
# This class represents an option found within a SelectList. A
|
@@ -166,4 +125,5 @@ module WWW
|
|
166
125
|
@selected = node.attributes['selected'] ? true : false
|
167
126
|
end
|
168
127
|
end
|
128
|
+
end
|
169
129
|
end
|
data/lib/mechanize/list.rb
CHANGED
@@ -1,10 +1,44 @@
|
|
1
1
|
module WWW
|
2
2
|
class Mechanize
|
3
|
+
# = Synopsis
|
4
|
+
# This class provides syntax sugar to help find things within Mechanize.
|
5
|
+
# Most calls in Mechanize that return arrays, like the 'links' method
|
6
|
+
# of WWW::Mechanize::Page, return a Mechanize::List. This class lets you
|
7
|
+
# find things with a particular attribute on the found class.
|
8
|
+
#
|
9
|
+
# If you have an array with objects that respond to the method "name",
|
10
|
+
# and you want to find all objects where name equals 'foo', your code
|
11
|
+
# would look like this:
|
12
|
+
#
|
13
|
+
# list.name('foo') # => Mechanize::List
|
14
|
+
#
|
15
|
+
# == A bit more information
|
16
|
+
# Mechanize::List will iterate through all of the objects it contains,
|
17
|
+
# testing to see if the object will respond to the "name" method. If it
|
18
|
+
# does, it will test to see if calling the name method returns a value
|
19
|
+
# equal to the value passed in.
|
20
|
+
#
|
21
|
+
# Finding the list will return another list, so it is possible to chain
|
22
|
+
# calls with Mechanize::List. For example:
|
23
|
+
#
|
24
|
+
# list.name('foo').href('bar.html')
|
25
|
+
#
|
26
|
+
# This code will find all elements with name 'foo' and href 'bar.html'.
|
3
27
|
class List < Array
|
28
|
+
# This method provides syntax sugar so that you can write expressions
|
29
|
+
# like this:
|
30
|
+
# form.fields.with.name('foo').and.href('bar.html')
|
31
|
+
#
|
4
32
|
def with
|
5
33
|
self
|
6
34
|
end
|
7
35
|
|
36
|
+
# This method will allow you to set the value of the first element
|
37
|
+
# in the list. For example, finding an input field with name 'foo'
|
38
|
+
# and setting the value to 'bar'.
|
39
|
+
#
|
40
|
+
# form.fields.name('foo').value = 'bar'
|
41
|
+
#
|
8
42
|
def value=(arg)
|
9
43
|
first().value=(arg)
|
10
44
|
end
|
data/lib/mechanize/module.rb
CHANGED
data/lib/mechanize/page.rb
CHANGED
@@ -1,192 +1,174 @@
|
|
1
|
-
|
2
|
-
# = Synopsis
|
3
|
-
# This class encapsulates a page.
|
4
|
-
#
|
5
|
-
# == Example
|
6
|
-
# require 'rubygems'
|
7
|
-
# require 'mechanize'
|
8
|
-
# require 'logger'
|
9
|
-
#
|
10
|
-
# class Body
|
11
|
-
# def initialize(node)
|
12
|
-
# puts node.attributes['bgcolor']
|
13
|
-
# end
|
14
|
-
# end
|
15
|
-
#
|
16
|
-
# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
|
17
|
-
# agent.user_agent_alias = 'Mac Safari'
|
18
|
-
# page = agent.get("http://www.google.com/")
|
19
|
-
# page.watch_for_set = { 'body' => Body }
|
20
|
-
#
|
21
|
-
# body = page.watches
|
22
|
-
class Page
|
23
|
-
attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set
|
24
|
-
attr_finder :frames, :iframes, :links, :forms, :meta, :watches
|
25
|
-
attr_reader :body_filter
|
1
|
+
require 'fileutils'
|
26
2
|
|
27
|
-
|
3
|
+
module WWW
|
4
|
+
class Mechanize
|
5
|
+
# = Synopsis
|
6
|
+
# This class encapsulates an HTML page. If Mechanize finds a content
|
7
|
+
# type of 'text/html', this class will be instantiated and returned.
|
8
|
+
#
|
9
|
+
# == Example
|
10
|
+
# require 'rubygems'
|
11
|
+
# require 'mechanize'
|
12
|
+
#
|
13
|
+
# agent = WWW::Mechanize.new
|
14
|
+
# agent.get('http://google.com/').class #=> WWW::Mechanize::Page
|
15
|
+
#
|
16
|
+
class Page < File
|
17
|
+
attr_accessor :watch_for_set
|
18
|
+
attr_finder :frames, :iframes, :links, :forms, :meta, :watches
|
28
19
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
20
|
+
# Alias our finders so that we can lazily parse the html
|
21
|
+
alias :find_frames :frames
|
22
|
+
alias :find_iframes :iframes
|
23
|
+
alias :find_links :links
|
24
|
+
alias :find_forms :forms
|
25
|
+
alias :find_meta :meta
|
26
|
+
alias :find_watches :watches
|
27
|
+
|
28
|
+
def initialize(uri=nil, response=nil, body=nil, code=nil)
|
29
|
+
super(uri, response, body, code)
|
30
|
+
@frames = nil
|
31
|
+
@iframes = nil
|
32
|
+
@links = nil
|
33
|
+
@forms = nil
|
34
|
+
@meta = nil
|
35
|
+
@watches = nil
|
36
|
+
@root = nil
|
37
|
+
@title = nil
|
38
|
+
end
|
39
|
+
|
40
|
+
# Get the response header
|
41
|
+
def header
|
42
|
+
@response
|
43
|
+
end
|
44
|
+
|
45
|
+
# Get the content type
|
46
|
+
def content_type
|
47
|
+
@response['Content-Type']
|
48
|
+
end
|
49
|
+
|
50
|
+
# Get a list of Form associated with this page.
|
51
|
+
def forms(*args)
|
52
|
+
parse_html() unless @forms
|
53
|
+
find_forms(*args)
|
54
|
+
end
|
55
|
+
|
56
|
+
# Get a list of Link associated with this page.
|
57
|
+
def links(*args)
|
58
|
+
parse_html() unless @links
|
59
|
+
find_links(*args)
|
60
|
+
end
|
61
|
+
|
62
|
+
# Get the root XML parse tree for this page.
|
63
|
+
def root
|
64
|
+
parse_html() unless @root
|
65
|
+
@root
|
66
|
+
end
|
67
|
+
|
68
|
+
# This method watches out for a particular tag, and will call back to the
|
69
|
+
# class specified for the tag in the watch_for_set method. See the example
|
70
|
+
# in this class.
|
71
|
+
def watches(*args)
|
72
|
+
parse_html() unless @watches
|
73
|
+
find_watches(*args)
|
74
|
+
end
|
75
|
+
|
76
|
+
# Get a list of Meta links, usually used for refreshing the page.
|
77
|
+
def meta(*args)
|
78
|
+
parse_html() unless @meta
|
79
|
+
find_meta(*args)
|
80
|
+
end
|
57
81
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
# Get the content type
|
64
|
-
def content_type
|
65
|
-
@response['Content-Type']
|
66
|
-
end
|
67
|
-
|
68
|
-
# Get a list of Form associated with this page.
|
69
|
-
def forms(*args)
|
70
|
-
parse_html() unless @forms
|
71
|
-
find_forms(*args)
|
72
|
-
end
|
73
|
-
|
74
|
-
# Get a list of Link associated with this page.
|
75
|
-
def links(*args)
|
76
|
-
parse_html() unless @links
|
77
|
-
find_links(*args)
|
78
|
-
end
|
79
|
-
|
80
|
-
# Get the root XML parse tree for this page.
|
81
|
-
def root
|
82
|
-
parse_html() unless @root
|
83
|
-
@root
|
84
|
-
end
|
85
|
-
|
86
|
-
# This method watches out for a particular tag, and will call back to the
|
87
|
-
# class specified for the tag in the watch_for_set method. See the example
|
88
|
-
# in this class.
|
89
|
-
def watches(*args)
|
90
|
-
parse_html() unless @watches
|
91
|
-
find_watches(*args)
|
92
|
-
end
|
93
|
-
|
94
|
-
# Get a list of Meta links, usually used for refreshing the page.
|
95
|
-
def meta(*args)
|
96
|
-
parse_html() unless @meta
|
97
|
-
find_meta(*args)
|
98
|
-
end
|
82
|
+
# Get a list of Frame from the page
|
83
|
+
def frames(*args)
|
84
|
+
parse_html() unless @frames
|
85
|
+
find_frames(*args)
|
86
|
+
end
|
99
87
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
88
|
+
# Get a list of IFrame from the page
|
89
|
+
def iframes(*args)
|
90
|
+
parse_html() unless @iframes
|
91
|
+
find_iframes(*args)
|
92
|
+
end
|
105
93
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
string << "[frames]\n"
|
116
|
-
frames.each { |l| string << l.inspect }
|
117
|
-
string << "[iframes]\n"
|
118
|
-
iframes.each { |l| string << l.inspect }
|
119
|
-
string << "[links]\n"
|
120
|
-
links.each { |l| string << l.inspect }
|
121
|
-
string << "[forms]\n"
|
122
|
-
forms.each { |l| string << l.inspect }
|
123
|
-
string
|
124
|
-
end
|
94
|
+
# Fetch the title of the page
|
95
|
+
def title
|
96
|
+
parse_html() unless @title
|
97
|
+
@title
|
98
|
+
end
|
99
|
+
|
100
|
+
def inspect
|
101
|
+
"Page: [#{title} '#{uri.to_s}']"
|
102
|
+
end
|
125
103
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
104
|
+
private
|
105
|
+
|
106
|
+
def parse_html
|
107
|
+
raise Mechanize::ContentTypeError.new(content_type()) unless
|
108
|
+
content_type() =~ /^text\/html/
|
109
|
+
|
110
|
+
# construct parser and feed with HTML
|
111
|
+
parser = HTMLTree::XMLParser.new
|
112
|
+
begin
|
113
|
+
parser.feed(@body)
|
114
|
+
rescue => ex
|
115
|
+
if ex.message =~ /attempted adding second root element to document/ and
|
116
|
+
# Put the whole document inside a single root element, which I
|
117
|
+
# simply name <root>, just to make the parser happy. It's no
|
118
|
+
# longer valid HTML, but without a single root element, it's not
|
119
|
+
# valid HTML either.
|
120
|
+
|
121
|
+
# TODO: leave a possible doctype definition outside this element.
|
122
|
+
parser = HTMLTree::XMLParser.new
|
123
|
+
parser.feed("<root>" + @body + "</root>")
|
124
|
+
else
|
125
|
+
raise
|
126
|
+
end
|
147
127
|
end
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
if
|
174
|
-
|
175
|
-
|
128
|
+
|
129
|
+
@root = parser.document
|
130
|
+
|
131
|
+
@forms = WWW::Mechanize::List.new
|
132
|
+
@links = WWW::Mechanize::List.new
|
133
|
+
@meta = WWW::Mechanize::List.new
|
134
|
+
@frames = WWW::Mechanize::List.new
|
135
|
+
@iframes = WWW::Mechanize::List.new
|
136
|
+
@watches = {}
|
137
|
+
|
138
|
+
@root.each_recursive {|node|
|
139
|
+
name = node.name.downcase
|
140
|
+
|
141
|
+
case name
|
142
|
+
when 'form'
|
143
|
+
form = Form.new(node)
|
144
|
+
form.action ||= @uri
|
145
|
+
@forms << form
|
146
|
+
when 'title'
|
147
|
+
@title = node.text
|
148
|
+
when 'a'
|
149
|
+
@links << Link.new(node)
|
150
|
+
when 'meta'
|
151
|
+
equiv = node.attributes['http-equiv']
|
152
|
+
content = node.attributes['content']
|
153
|
+
if equiv != nil && equiv.downcase == 'refresh'
|
154
|
+
if content != nil && content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i
|
155
|
+
node.attributes['href'] = $1
|
156
|
+
@meta << Meta.new(node)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
when 'frame'
|
160
|
+
@frames << Frame.new(node)
|
161
|
+
when 'iframe'
|
162
|
+
@iframes << Frame.new(node)
|
163
|
+
else
|
164
|
+
if @watch_for_set and @watch_for_set.keys.include?( name )
|
165
|
+
@watches[name] = [] unless @watches[name]
|
166
|
+
klass = @watch_for_set[name]
|
167
|
+
@watches[name] << (klass ? klass.new(node) : node)
|
176
168
|
end
|
177
169
|
end
|
178
|
-
|
179
|
-
|
180
|
-
when 'iframe'
|
181
|
-
@iframes << Frame.new(node)
|
182
|
-
else
|
183
|
-
if @watch_for_set and @watch_for_set.keys.include?( name )
|
184
|
-
@watches[name] = [] unless @watches[name]
|
185
|
-
klass = @watch_for_set[name]
|
186
|
-
@watches[name] << (klass ? klass.new(node) : node)
|
187
|
-
end
|
188
|
-
end
|
189
|
-
}
|
170
|
+
}
|
171
|
+
end
|
190
172
|
end
|
191
173
|
end
|
192
174
|
end
|