aai10-mechanize 2.0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +6 -0
- data/.gitignore +9 -0
- data/CHANGELOG.rdoc +652 -0
- data/EXAMPLES.rdoc +187 -0
- data/FAQ.rdoc +11 -0
- data/GUIDE.rdoc +163 -0
- data/LICENSE.rdoc +20 -0
- data/Manifest.txt +172 -0
- data/README.rdoc +63 -0
- data/Rakefile +36 -0
- data/aai10-mechanize.gemspec +20 -0
- data/examples/flickr_upload.rb +22 -0
- data/examples/mech-dump.rb +5 -0
- data/examples/proxy_req.rb +7 -0
- data/examples/rubyforge.rb +20 -0
- data/examples/spider.rb +21 -0
- data/lib/mechanize.rb +664 -0
- data/lib/mechanize/content_type_error.rb +14 -0
- data/lib/mechanize/cookie.rb +116 -0
- data/lib/mechanize/cookie_jar.rb +202 -0
- data/lib/mechanize/element_matcher.rb +35 -0
- data/lib/mechanize/file.rb +80 -0
- data/lib/mechanize/file_connection.rb +17 -0
- data/lib/mechanize/file_request.rb +26 -0
- data/lib/mechanize/file_response.rb +74 -0
- data/lib/mechanize/file_saver.rb +37 -0
- data/lib/mechanize/form.rb +478 -0
- data/lib/mechanize/form/button.rb +9 -0
- data/lib/mechanize/form/check_box.rb +11 -0
- data/lib/mechanize/form/field.rb +44 -0
- data/lib/mechanize/form/file_upload.rb +23 -0
- data/lib/mechanize/form/image_button.rb +20 -0
- data/lib/mechanize/form/multi_select_list.rb +83 -0
- data/lib/mechanize/form/option.rb +49 -0
- data/lib/mechanize/form/radio_button.rb +48 -0
- data/lib/mechanize/form/select_list.rb +40 -0
- data/lib/mechanize/headers.rb +25 -0
- data/lib/mechanize/history.rb +83 -0
- data/lib/mechanize/http.rb +3 -0
- data/lib/mechanize/http/agent.rb +738 -0
- data/lib/mechanize/inspect.rb +88 -0
- data/lib/mechanize/monkey_patch.rb +37 -0
- data/lib/mechanize/page.rb +408 -0
- data/lib/mechanize/page/base.rb +8 -0
- data/lib/mechanize/page/frame.rb +27 -0
- data/lib/mechanize/page/image.rb +30 -0
- data/lib/mechanize/page/label.rb +20 -0
- data/lib/mechanize/page/link.rb +82 -0
- data/lib/mechanize/page/meta_refresh.rb +56 -0
- data/lib/mechanize/pluggable_parsers.rb +101 -0
- data/lib/mechanize/redirect_limit_reached_error.rb +16 -0
- data/lib/mechanize/redirect_not_get_or_head_error.rb +19 -0
- data/lib/mechanize/response_code_error.rb +22 -0
- data/lib/mechanize/response_read_error.rb +27 -0
- data/lib/mechanize/robots_disallowed_error.rb +29 -0
- data/lib/mechanize/unsupported_scheme_error.rb +8 -0
- data/lib/mechanize/util.rb +113 -0
- data/test/data/htpasswd +1 -0
- data/test/data/server.crt +16 -0
- data/test/data/server.csr +12 -0
- data/test/data/server.key +15 -0
- data/test/data/server.pem +15 -0
- data/test/helper.rb +175 -0
- data/test/htdocs/alt_text.html +10 -0
- data/test/htdocs/bad_form_test.html +9 -0
- data/test/htdocs/button.jpg +0 -0
- data/test/htdocs/canonical_uri.html +9 -0
- data/test/htdocs/dir with spaces/foo.html +1 -0
- data/test/htdocs/empty_form.html +6 -0
- data/test/htdocs/file_upload.html +26 -0
- data/test/htdocs/find_link.html +41 -0
- data/test/htdocs/form_multi_select.html +16 -0
- data/test/htdocs/form_multival.html +37 -0
- data/test/htdocs/form_no_action.html +18 -0
- data/test/htdocs/form_no_input_name.html +16 -0
- data/test/htdocs/form_select.html +16 -0
- data/test/htdocs/form_select_all.html +16 -0
- data/test/htdocs/form_select_none.html +17 -0
- data/test/htdocs/form_select_noopts.html +10 -0
- data/test/htdocs/form_set_fields.html +14 -0
- data/test/htdocs/form_test.html +188 -0
- data/test/htdocs/frame_referer_test.html +10 -0
- data/test/htdocs/frame_test.html +30 -0
- data/test/htdocs/google.html +13 -0
- data/test/htdocs/iframe_test.html +16 -0
- data/test/htdocs/index.html +6 -0
- data/test/htdocs/link with space.html +5 -0
- data/test/htdocs/meta_cookie.html +11 -0
- data/test/htdocs/no_title_test.html +6 -0
- data/test/htdocs/nofollow.html +9 -0
- data/test/htdocs/noindex.html +9 -0
- data/test/htdocs/norobots.html +8 -0
- data/test/htdocs/rails_3_encoding_hack_form_test.html +27 -0
- data/test/htdocs/rel_nofollow.html +8 -0
- data/test/htdocs/relative/tc_relative_links.html +21 -0
- data/test/htdocs/robots.html +8 -0
- data/test/htdocs/robots.txt +2 -0
- data/test/htdocs/tc_bad_charset.html +9 -0
- data/test/htdocs/tc_bad_links.html +5 -0
- data/test/htdocs/tc_base_images.html +10 -0
- data/test/htdocs/tc_base_link.html +8 -0
- data/test/htdocs/tc_blank_form.html +11 -0
- data/test/htdocs/tc_charset.html +6 -0
- data/test/htdocs/tc_checkboxes.html +19 -0
- data/test/htdocs/tc_encoded_links.html +5 -0
- data/test/htdocs/tc_field_precedence.html +11 -0
- data/test/htdocs/tc_follow_meta.html +8 -0
- data/test/htdocs/tc_form_action.html +48 -0
- data/test/htdocs/tc_images.html +8 -0
- data/test/htdocs/tc_links.html +18 -0
- data/test/htdocs/tc_meta_in_body.html +9 -0
- data/test/htdocs/tc_no_attributes.html +16 -0
- data/test/htdocs/tc_pretty_print.html +17 -0
- data/test/htdocs/tc_radiobuttons.html +17 -0
- data/test/htdocs/tc_referer.html +16 -0
- data/test/htdocs/tc_relative_links.html +19 -0
- data/test/htdocs/tc_textarea.html +23 -0
- data/test/htdocs/test_bad_encoding.html +52 -0
- data/test/htdocs/test_click.html +11 -0
- data/test/htdocs/unusual______.html +5 -0
- data/test/servlets.rb +402 -0
- data/test/ssl_server.rb +48 -0
- data/test/test_cookies.rb +129 -0
- data/test/test_form_action.rb +52 -0
- data/test/test_form_as_hash.rb +59 -0
- data/test/test_form_button.rb +46 -0
- data/test/test_frames.rb +34 -0
- data/test/test_headers.rb +33 -0
- data/test/test_history.rb +118 -0
- data/test/test_history_added.rb +16 -0
- data/test/test_html_unscape_forms.rb +46 -0
- data/test/test_if_modified_since.rb +20 -0
- data/test/test_images.rb +19 -0
- data/test/test_mechanize.rb +852 -0
- data/test/test_mechanize_cookie.rb +345 -0
- data/test/test_mechanize_cookie_jar.rb +433 -0
- data/test/test_mechanize_file.rb +53 -0
- data/test/test_mechanize_file_request.rb +19 -0
- data/test/test_mechanize_file_response.rb +21 -0
- data/test/test_mechanize_form.rb +576 -0
- data/test/test_mechanize_form_check_box.rb +37 -0
- data/test/test_mechanize_form_encoding.rb +120 -0
- data/test/test_mechanize_form_field.rb +21 -0
- data/test/test_mechanize_form_image_button.rb +12 -0
- data/test/test_mechanize_form_textarea.rb +51 -0
- data/test/test_mechanize_http_agent.rb +697 -0
- data/test/test_mechanize_link.rb +84 -0
- data/test/test_mechanize_page_encoding.rb +147 -0
- data/test/test_mechanize_page_link.rb +382 -0
- data/test/test_mechanize_page_meta_refresh.rb +115 -0
- data/test/test_mechanize_redirect_not_get_or_head_error.rb +18 -0
- data/test/test_mechanize_subclass.rb +22 -0
- data/test/test_mechanize_util.rb +92 -0
- data/test/test_multi_select.rb +118 -0
- data/test/test_no_attributes.rb +13 -0
- data/test/test_option.rb +18 -0
- data/test/test_pluggable_parser.rb +136 -0
- data/test/test_post_form.rb +37 -0
- data/test/test_pretty_print.rb +22 -0
- data/test/test_radiobutton.rb +75 -0
- data/test/test_redirect_limit_reached.rb +39 -0
- data/test/test_redirect_ok.rb +25 -0
- data/test/test_referer.rb +81 -0
- data/test/test_relative_links.rb +40 -0
- data/test/test_request.rb +13 -0
- data/test/test_response_code.rb +53 -0
- data/test/test_robots.rb +72 -0
- data/test/test_save_file.rb +48 -0
- data/test/test_scheme.rb +48 -0
- data/test/test_select.rb +119 -0
- data/test/test_select_all.rb +15 -0
- data/test/test_select_none.rb +15 -0
- data/test/test_select_noopts.rb +18 -0
- data/test/test_set_fields.rb +44 -0
- data/test/test_ssl_server.rb +20 -0
- metadata +360 -0
data/EXAMPLES.rdoc
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
= Mechanize examples
|
2
|
+
|
3
|
+
Note: Several examples show methods chained to the end of do/end blocks.
|
4
|
+
Do...end is the same as curly braces ({...}). For example, do ... end.submit
|
5
|
+
is the same as { ... }.submit.
|
6
|
+
|
7
|
+
== Google
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'mechanize'
|
11
|
+
|
12
|
+
a = Mechanize.new { |agent|
|
13
|
+
agent.user_agent_alias = 'Mac Safari'
|
14
|
+
}
|
15
|
+
|
16
|
+
a.get('http://google.com/') do |page|
|
17
|
+
search_result = page.form_with(:name => 'f') do |search|
|
18
|
+
search.q = 'Hello world'
|
19
|
+
end.submit
|
20
|
+
|
21
|
+
search_result.links.each do |link|
|
22
|
+
puts link.text
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
== Rubyforge
|
27
|
+
|
28
|
+
require 'rubygems'
|
29
|
+
require 'mechanize'
|
30
|
+
|
31
|
+
a = Mechanize.new
|
32
|
+
a.get('http://rubyforge.org/') do |page|
|
33
|
+
# Click the login link
|
34
|
+
login_page = a.click(page.link_with(:text => /Log In/))
|
35
|
+
|
36
|
+
# Submit the login form
|
37
|
+
my_page = login_page.form_with(:action => '/account/login.php') do |f|
|
38
|
+
f.form_loginname = ARGV[0]
|
39
|
+
f.form_pw = ARGV[1]
|
40
|
+
end.click_button
|
41
|
+
|
42
|
+
my_page.links.each do |link|
|
43
|
+
text = link.text.strip
|
44
|
+
next unless text.length > 0
|
45
|
+
puts text
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
== File Upload
|
50
|
+
|
51
|
+
Upload a file to flickr.
|
52
|
+
|
53
|
+
require 'rubygems'
|
54
|
+
require 'mechanize'
|
55
|
+
|
56
|
+
abort "#{$0} login passwd filename" if (ARGV.size != 3)
|
57
|
+
|
58
|
+
a = Mechanize.new { |agent|
|
59
|
+
# Flickr refreshes after login
|
60
|
+
agent.follow_meta_refresh = true
|
61
|
+
}
|
62
|
+
|
63
|
+
a.get('http://flickr.com/') do |home_page|
|
64
|
+
signin_page = a.click(home_page.link_with(:text => /Sign In/))
|
65
|
+
|
66
|
+
my_page = signin_page.form_with(:name => 'login_form') do |form|
|
67
|
+
form.login = ARGV[0]
|
68
|
+
form.passwd = ARGV[1]
|
69
|
+
end.submit
|
70
|
+
|
71
|
+
# Click the upload link
|
72
|
+
upload_page = a.click(my_page.link_with(:text => /Upload/))
|
73
|
+
|
74
|
+
# We want the basic upload page.
|
75
|
+
upload_page = a.click(upload_page.link_with(:text => /basic Uploader/))
|
76
|
+
|
77
|
+
# Upload the file
|
78
|
+
upload_page.form_with(:method => 'POST') do |upload_form|
|
79
|
+
upload_form.file_uploads.first.file_name = ARGV[2]
|
80
|
+
end.submit
|
81
|
+
end
|
82
|
+
|
83
|
+
== Pluggable Parsers
|
84
|
+
Let's say you want HTML pages to automatically be parsed with Rubyful Soup.
|
85
|
+
This example shows you how:
|
86
|
+
|
87
|
+
require 'rubygems'
|
88
|
+
require 'mechanize'
|
89
|
+
require 'rubyful_soup'
|
90
|
+
|
91
|
+
class SoupParser < Mechanize::Page
|
92
|
+
attr_reader :soup
|
93
|
+
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
94
|
+
@soup = BeautifulSoup.new(body)
|
95
|
+
super(uri, response, body, code)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
agent = Mechanize.new
|
100
|
+
agent.pluggable_parser.html = SoupParser
|
101
|
+
|
102
|
+
Now all HTML pages will be parsed with the SoupParser class, and automatically
|
103
|
+
give you access to a method called 'soup' where you can get access to the
|
104
|
+
Beautiful Soup for that page.
|
105
|
+
|
106
|
+
== Using a proxy
|
107
|
+
|
108
|
+
require 'rubygems'
|
109
|
+
require 'mechanize'
|
110
|
+
|
111
|
+
agent = Mechanize.new
|
112
|
+
agent.set_proxy('localhost', '8000')
|
113
|
+
page = agent.get(ARGV[0])
|
114
|
+
puts page.body
|
115
|
+
|
116
|
+
== The transact method
|
117
|
+
|
118
|
+
transact runs the given block and then resets the page history. I.e. after the
|
119
|
+
block has been executed, you're back at the original page; no need to count how
|
120
|
+
many times to call the back method at the end of a loop (while accounting for
|
121
|
+
possible exceptions).
|
122
|
+
|
123
|
+
This example also demonstrates subclassing Mechanize.
|
124
|
+
|
125
|
+
require 'rubygems'
|
126
|
+
require 'mechanize'
|
127
|
+
|
128
|
+
class TestMech < Mechanize
|
129
|
+
def process
|
130
|
+
get 'http://rubyforge.org/'
|
131
|
+
search_form = page.forms.first
|
132
|
+
search_form.words = 'WWW'
|
133
|
+
submit search_form
|
134
|
+
|
135
|
+
page.links_with(:href => %r{/projects/} ).each do |link|
|
136
|
+
next if link.href =~ %r{/projects/support/}
|
137
|
+
|
138
|
+
puts 'Loading %-30s %s' % [link.href, link.text]
|
139
|
+
begin
|
140
|
+
transact do
|
141
|
+
click link
|
142
|
+
# Do stuff, maybe click more links.
|
143
|
+
end
|
144
|
+
# Now we're back at the original page.
|
145
|
+
|
146
|
+
rescue => e
|
147
|
+
$stderr.puts "#{e.class}: #{e.message}"
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
TestMech.new.process
|
154
|
+
|
155
|
+
== Client Certificate Authentication (Mutual Auth)
|
156
|
+
|
157
|
+
In most cases a client certificate is created as an additional layer of security
|
158
|
+
for certain websites. The specific case that this was initially tested on was
|
159
|
+
for automating the download of archived images from a bank's (Wachovia) lockbox
|
160
|
+
system. Once the certificate is installed into your browser you will have to
|
161
|
+
export it and split the certificate and private key into separate files.
|
162
|
+
Exported files are usually in .p12 format (IE 7 & Firefox 2.0) which stands for
|
163
|
+
PKCS #12. You can convert them from p12 to pem format by using the following
|
164
|
+
commands:
|
165
|
+
|
166
|
+
openssl.exe pkcs12 -in input_file.p12 -clcerts -out example.key -nocerts -nodes
|
167
|
+
openssl.exe pkcs12 -in input_file.p12 -clcerts -out example.cer -nokeys
|
168
|
+
|
169
|
+
require 'rubygems'
|
170
|
+
require 'mechanize'
|
171
|
+
|
172
|
+
# create Mechanize instance
|
173
|
+
agent = Mechanize.new
|
174
|
+
|
175
|
+
# set the path of the certificate file
|
176
|
+
agent.cert = 'example.cer'
|
177
|
+
|
178
|
+
# set the path of the private key file
|
179
|
+
agent.key = 'example.key'
|
180
|
+
|
181
|
+
# get the login form & fill it out with the username/password
|
182
|
+
login_form = agent.get("http://example.com/login_page").form('Login')
|
183
|
+
login_form.Userid = 'TestUser'
|
184
|
+
login_form.Password = 'TestPassword'
|
185
|
+
|
186
|
+
# submit login form
|
187
|
+
agent.submit(login_form, login_form.buttons.first)
|
data/FAQ.rdoc
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Q:: Why do I keep getting an EOFError?
|
2
|
+
|
3
|
+
A:: For older versions of mechanize turning off keep_alive could help with the
|
4
|
+
problem, but mechanize now has more robust handling of persistent
|
5
|
+
connections.
|
6
|
+
|
7
|
+
Older versions of mechanize would raise an EOFError when a chunked body was
|
8
|
+
not terminated properly, a common bug of IIS servers. Since 2.0
|
9
|
+
Mechanize::ResponseReadError is raised containing the original response and
|
10
|
+
body read so far so if the server is broken you can still retrieve the
|
11
|
+
entire content.
|
data/GUIDE.rdoc
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
= Getting Started With Mechanize
|
2
|
+
|
3
|
+
This guide is meant to get you started using Mechanize. By the end of this
|
4
|
+
guide, you should be able to fetch pages, click links, fill out and submit
|
5
|
+
forms, scrape data, and many other hopefully useful things. This guide
|
6
|
+
really just scratches the surface of what is available, but should be enough
|
7
|
+
information to get you really going!
|
8
|
+
|
9
|
+
== Let's Fetch a Page!
|
10
|
+
|
11
|
+
First things first. Make sure that you've required mechanize and that you
|
12
|
+
instantiate a new mechanize object:
|
13
|
+
|
14
|
+
require 'rubygems'
|
15
|
+
require 'mechanize'
|
16
|
+
|
17
|
+
agent = Mechanize.new
|
18
|
+
|
19
|
+
Now we'll use the agent we've created to fetch a page. Let's fetch google
|
20
|
+
with our mechanize agent:
|
21
|
+
|
22
|
+
page = agent.get('http://google.com/')
|
23
|
+
|
24
|
+
What just happened? We told mechanize to go pick up google's main page.
|
25
|
+
Mechanize stored any cookies that were set, and followed any redirects that
|
26
|
+
google may have sent. The agent gave us back a page that we can use to
|
27
|
+
scrape data, find links to click, or find forms to fill out.
|
28
|
+
|
29
|
+
Next, let's try finding some links to click.
|
30
|
+
|
31
|
+
== Finding Links
|
32
|
+
|
33
|
+
Mechanize returns a page object whenever you get a page, post, or submit a
|
34
|
+
form. When a page is fetched, the agent will parse the page and put a list
|
35
|
+
of links on the page object.
|
36
|
+
|
37
|
+
Now that we've fetched google's homepage, let's try listing all of the links:
|
38
|
+
|
39
|
+
page.links.each do |link|
|
40
|
+
puts link.text
|
41
|
+
end
|
42
|
+
|
43
|
+
We can list the links, but Mechanize gives a few shortcuts to help us find a
|
44
|
+
link to click on. Let's say we wanted to click the link whose text is 'News'.
|
45
|
+
Normally, we would have to do this:
|
46
|
+
|
47
|
+
page = agent.page.links.find { |l| l.text == 'News' }.click
|
48
|
+
|
49
|
+
But Mechanize gives us a shortcut. Instead we can say this:
|
50
|
+
|
51
|
+
page = agent.page.link_with(:text => 'News').click
|
52
|
+
|
53
|
+
That shortcut says "find the first link with the text 'News'". You're probably
|
54
|
+
thinking "there could be multiple links with that text!", and you would be
|
55
|
+
correct! If you use the plural form, you can access the list.
|
56
|
+
If you wanted to click on the second news link, you could do this:
|
57
|
+
|
58
|
+
agent.page.links_with(:text => 'News')[1].click
|
59
|
+
|
60
|
+
We can even find a link with a certain href like so:
|
61
|
+
|
62
|
+
page.link_with(:href => '/something')
|
63
|
+
|
64
|
+
Or chain them together to find a link with certain text and certain href:
|
65
|
+
|
66
|
+
page.link_with(:text => 'News', :href => '/something')
|
67
|
+
|
68
|
+
These shortcuts that mechanize provides are available on any list that you
|
69
|
+
can fetch like frames, iframes, or forms. Now that we know how to find and
|
70
|
+
click links, let's try something more complicated like filling out a form.
|
71
|
+
|
72
|
+
== Filling Out Forms
|
73
|
+
|
74
|
+
Let's continue with our google example. Here's the code we have so far:
|
75
|
+
require 'rubygems'
|
76
|
+
require 'mechanize'
|
77
|
+
|
78
|
+
agent = Mechanize.new
|
79
|
+
page = agent.get('http://google.com/')
|
80
|
+
|
81
|
+
If we pretty print the page, we can see that there is one form named 'f',
|
82
|
+
that has a couple buttons and a few fields:
|
83
|
+
|
84
|
+
pp page
|
85
|
+
|
86
|
+
Now that we know the name of the form, let's fetch it off the page:
|
87
|
+
|
88
|
+
google_form = page.form('f')
|
89
|
+
|
90
|
+
Mechanize lets you access form input fields in a few different ways, but the
|
91
|
+
most convenient is that you can access input fields as accessors on the
|
92
|
+
object. So let's set the form field named 'q' on the form to 'ruby mechanize':
|
93
|
+
|
94
|
+
google_form.q = 'ruby mechanize'
|
95
|
+
|
96
|
+
To make sure that we set the value, let's pretty print the form, and you should
|
97
|
+
see a line similar to this:
|
98
|
+
|
99
|
+
#<Mechanize::Field:0x1403488 @name="q", @value="ruby mechanize">
|
100
|
+
|
101
|
+
If you saw that the value of 'q' changed, you're on the right track! Now we
|
102
|
+
can submit the form and 'press' the submit button and print the results:
|
103
|
+
|
104
|
+
page = agent.submit(google_form, google_form.buttons.first)
|
105
|
+
pp page
|
106
|
+
|
107
|
+
What we just did was equivalent to putting text in the search field and
|
108
|
+
clicking the 'Google Search' button. If we had submitted the form without
|
109
|
+
a button, it would be like typing in the text field and hitting the return
|
110
|
+
key.
|
111
|
+
|
112
|
+
Let's take a look at the code all together:
|
113
|
+
|
114
|
+
require 'rubygems'
|
115
|
+
require 'mechanize'
|
116
|
+
|
117
|
+
agent = Mechanize.new
|
118
|
+
page = agent.get('http://google.com/')
|
119
|
+
google_form = page.form('f')
|
120
|
+
google_form.q = 'ruby mechanize'
|
121
|
+
page = agent.submit(google_form)
|
122
|
+
pp page
|
123
|
+
|
124
|
+
Before we go on to screen scraping, let's take a look at forms a little more
|
125
|
+
in depth. Unless you want to skip ahead!
|
126
|
+
|
127
|
+
== Advanced Form Techniques
|
128
|
+
|
129
|
+
In this section, I want to touch on using the different types of input fields
|
130
|
+
possible with a form. Password and textarea fields can be treated just like
|
131
|
+
text input fields. Select fields are very similar to text fields, but they
|
132
|
+
have many options associated with them. If you select one option, mechanize
|
133
|
+
will deselect the other options (unless it is a multi select!).
|
134
|
+
|
135
|
+
For example, let's select an option on a list:
|
136
|
+
|
137
|
+
form.field_with(:name => 'list').options[0].select
|
138
|
+
|
139
|
+
Now let's take a look at checkboxes and radio buttons. To select a checkbox,
|
140
|
+
just check it like this:
|
141
|
+
|
142
|
+
form.checkbox_with(:name => 'box').check
|
143
|
+
|
144
|
+
Radio buttons are very similar to checkboxes, but they know how to uncheck
|
145
|
+
other radio buttons of the same name. Just check a radio button like you
|
146
|
+
would a checkbox:
|
147
|
+
|
148
|
+
form.radiobuttons_with(:name => 'box')[1].check
|
149
|
+
|
150
|
+
Mechanize also makes file uploads easy! Just find the file upload field, and
|
151
|
+
tell it what file name you want to upload:
|
152
|
+
|
153
|
+
form.file_uploads.first.file_name = "somefile.jpg"
|
154
|
+
|
155
|
+
== Scraping Data
|
156
|
+
|
157
|
+
Mechanize uses nokogiri[http://nokogiri.org/] to parse
|
158
|
+
html. What does this mean for you? You can treat a mechanize page like
|
159
|
+
a nokogiri object. After you have used Mechanize to navigate to the page
|
160
|
+
that you need to scrape, then scrape it using nokogiri methods:
|
161
|
+
|
162
|
+
agent.get('http://someurl.com/').search(".//p[@class='posted']")
|
163
|
+
|
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
(The MIT License)
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
'Software'), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
.autotest
|
2
|
+
CHANGELOG.rdoc
|
3
|
+
EXAMPLES.rdoc
|
4
|
+
FAQ.rdoc
|
5
|
+
GUIDE.rdoc
|
6
|
+
LICENSE.rdoc
|
7
|
+
Manifest.txt
|
8
|
+
README.rdoc
|
9
|
+
Rakefile
|
10
|
+
examples/flickr_upload.rb
|
11
|
+
examples/mech-dump.rb
|
12
|
+
examples/proxy_req.rb
|
13
|
+
examples/rubyforge.rb
|
14
|
+
examples/spider.rb
|
15
|
+
lib/mechanize.rb
|
16
|
+
lib/mechanize/content_type_error.rb
|
17
|
+
lib/mechanize/cookie.rb
|
18
|
+
lib/mechanize/cookie_jar.rb
|
19
|
+
lib/mechanize/element_matcher.rb
|
20
|
+
lib/mechanize/file.rb
|
21
|
+
lib/mechanize/file_connection.rb
|
22
|
+
lib/mechanize/file_request.rb
|
23
|
+
lib/mechanize/file_response.rb
|
24
|
+
lib/mechanize/file_saver.rb
|
25
|
+
lib/mechanize/form.rb
|
26
|
+
lib/mechanize/form/button.rb
|
27
|
+
lib/mechanize/form/check_box.rb
|
28
|
+
lib/mechanize/form/field.rb
|
29
|
+
lib/mechanize/form/file_upload.rb
|
30
|
+
lib/mechanize/form/image_button.rb
|
31
|
+
lib/mechanize/form/multi_select_list.rb
|
32
|
+
lib/mechanize/form/option.rb
|
33
|
+
lib/mechanize/form/radio_button.rb
|
34
|
+
lib/mechanize/form/select_list.rb
|
35
|
+
lib/mechanize/headers.rb
|
36
|
+
lib/mechanize/history.rb
|
37
|
+
lib/mechanize/http.rb
|
38
|
+
lib/mechanize/http/agent.rb
|
39
|
+
lib/mechanize/inspect.rb
|
40
|
+
lib/mechanize/monkey_patch.rb
|
41
|
+
lib/mechanize/page.rb
|
42
|
+
lib/mechanize/page/base.rb
|
43
|
+
lib/mechanize/page/frame.rb
|
44
|
+
lib/mechanize/page/image.rb
|
45
|
+
lib/mechanize/page/label.rb
|
46
|
+
lib/mechanize/page/link.rb
|
47
|
+
lib/mechanize/page/meta_refresh.rb
|
48
|
+
lib/mechanize/pluggable_parsers.rb
|
49
|
+
lib/mechanize/redirect_limit_reached_error.rb
|
50
|
+
lib/mechanize/redirect_not_get_or_head_error.rb
|
51
|
+
lib/mechanize/response_code_error.rb
|
52
|
+
lib/mechanize/response_read_error.rb
|
53
|
+
lib/mechanize/robots_disallowed_error.rb
|
54
|
+
lib/mechanize/unsupported_scheme_error.rb
|
55
|
+
lib/mechanize/util.rb
|
56
|
+
test/data/htpasswd
|
57
|
+
test/data/server.crt
|
58
|
+
test/data/server.csr
|
59
|
+
test/data/server.key
|
60
|
+
test/data/server.pem
|
61
|
+
test/helper.rb
|
62
|
+
test/htdocs/alt_text.html
|
63
|
+
test/htdocs/bad_form_test.html
|
64
|
+
test/htdocs/button.jpg
|
65
|
+
test/htdocs/canonical_uri.html
|
66
|
+
test/htdocs/dir with spaces/foo.html
|
67
|
+
test/htdocs/empty_form.html
|
68
|
+
test/htdocs/file_upload.html
|
69
|
+
test/htdocs/find_link.html
|
70
|
+
test/htdocs/form_multi_select.html
|
71
|
+
test/htdocs/form_multival.html
|
72
|
+
test/htdocs/form_no_action.html
|
73
|
+
test/htdocs/form_no_input_name.html
|
74
|
+
test/htdocs/form_select.html
|
75
|
+
test/htdocs/form_select_all.html
|
76
|
+
test/htdocs/form_select_none.html
|
77
|
+
test/htdocs/form_select_noopts.html
|
78
|
+
test/htdocs/form_set_fields.html
|
79
|
+
test/htdocs/form_test.html
|
80
|
+
test/htdocs/frame_referer_test.html
|
81
|
+
test/htdocs/frame_test.html
|
82
|
+
test/htdocs/google.html
|
83
|
+
test/htdocs/iframe_test.html
|
84
|
+
test/htdocs/index.html
|
85
|
+
test/htdocs/link with space.html
|
86
|
+
test/htdocs/meta_cookie.html
|
87
|
+
test/htdocs/no_title_test.html
|
88
|
+
test/htdocs/nofollow.html
|
89
|
+
test/htdocs/noindex.html
|
90
|
+
test/htdocs/norobots.html
|
91
|
+
test/htdocs/rails_3_encoding_hack_form_test.html
|
92
|
+
test/htdocs/rel_nofollow.html
|
93
|
+
test/htdocs/relative/tc_relative_links.html
|
94
|
+
test/htdocs/robots.html
|
95
|
+
test/htdocs/robots.txt
|
96
|
+
test/htdocs/tc_bad_charset.html
|
97
|
+
test/htdocs/tc_bad_links.html
|
98
|
+
test/htdocs/tc_base_images.html
|
99
|
+
test/htdocs/tc_base_link.html
|
100
|
+
test/htdocs/tc_blank_form.html
|
101
|
+
test/htdocs/tc_charset.html
|
102
|
+
test/htdocs/tc_checkboxes.html
|
103
|
+
test/htdocs/tc_encoded_links.html
|
104
|
+
test/htdocs/tc_field_precedence.html
|
105
|
+
test/htdocs/tc_follow_meta.html
|
106
|
+
test/htdocs/tc_form_action.html
|
107
|
+
test/htdocs/tc_images.html
|
108
|
+
test/htdocs/tc_links.html
|
109
|
+
test/htdocs/tc_meta_in_body.html
|
110
|
+
test/htdocs/tc_no_attributes.html
|
111
|
+
test/htdocs/tc_pretty_print.html
|
112
|
+
test/htdocs/tc_radiobuttons.html
|
113
|
+
test/htdocs/tc_referer.html
|
114
|
+
test/htdocs/tc_relative_links.html
|
115
|
+
test/htdocs/tc_textarea.html
|
116
|
+
test/htdocs/test_bad_encoding.html
|
117
|
+
test/htdocs/test_click.html
|
118
|
+
test/htdocs/unusual______.html
|
119
|
+
test/servlets.rb
|
120
|
+
test/ssl_server.rb
|
121
|
+
test/test_cookies.rb
|
122
|
+
test/test_form_action.rb
|
123
|
+
test/test_form_as_hash.rb
|
124
|
+
test/test_form_button.rb
|
125
|
+
test/test_frames.rb
|
126
|
+
test/test_headers.rb
|
127
|
+
test/test_history.rb
|
128
|
+
test/test_history_added.rb
|
129
|
+
test/test_html_unscape_forms.rb
|
130
|
+
test/test_if_modified_since.rb
|
131
|
+
test/test_images.rb
|
132
|
+
test/test_mechanize.rb
|
133
|
+
test/test_mechanize_cookie.rb
|
134
|
+
test/test_mechanize_cookie_jar.rb
|
135
|
+
test/test_mechanize_file.rb
|
136
|
+
test/test_mechanize_file_request.rb
|
137
|
+
test/test_mechanize_file_response.rb
|
138
|
+
test/test_mechanize_form.rb
|
139
|
+
test/test_mechanize_form_check_box.rb
|
140
|
+
test/test_mechanize_form_encoding.rb
|
141
|
+
test/test_mechanize_form_field.rb
|
142
|
+
test/test_mechanize_form_image_button.rb
|
143
|
+
test/test_mechanize_form_textarea.rb
|
144
|
+
test/test_mechanize_http_agent.rb
|
145
|
+
test/test_mechanize_link.rb
|
146
|
+
test/test_mechanize_page_encoding.rb
|
147
|
+
test/test_mechanize_page_link.rb
|
148
|
+
test/test_mechanize_page_meta_refresh.rb
|
149
|
+
test/test_mechanize_redirect_not_get_or_head_error.rb
|
150
|
+
test/test_mechanize_subclass.rb
|
151
|
+
test/test_mechanize_util.rb
|
152
|
+
test/test_multi_select.rb
|
153
|
+
test/test_no_attributes.rb
|
154
|
+
test/test_option.rb
|
155
|
+
test/test_pluggable_parser.rb
|
156
|
+
test/test_post_form.rb
|
157
|
+
test/test_pretty_print.rb
|
158
|
+
test/test_radiobutton.rb
|
159
|
+
test/test_redirect_limit_reached.rb
|
160
|
+
test/test_referer.rb
|
161
|
+
test/test_relative_links.rb
|
162
|
+
test/test_request.rb
|
163
|
+
test/test_response_code.rb
|
164
|
+
test/test_robots.rb
|
165
|
+
test/test_save_file.rb
|
166
|
+
test/test_scheme.rb
|
167
|
+
test/test_select.rb
|
168
|
+
test/test_select_all.rb
|
169
|
+
test/test_select_none.rb
|
170
|
+
test/test_select_noopts.rb
|
171
|
+
test/test_set_fields.rb
|
172
|
+
test/test_ssl_server.rb
|