diamond-mechanize 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +718 -0
- data/EXAMPLES.rdoc +187 -0
- data/FAQ.rdoc +11 -0
- data/GUIDE.rdoc +163 -0
- data/LICENSE.rdoc +20 -0
- data/Manifest.txt +159 -0
- data/README.rdoc +64 -0
- data/Rakefile +49 -0
- data/lib/mechanize.rb +1079 -0
- data/lib/mechanize/content_type_error.rb +13 -0
- data/lib/mechanize/cookie.rb +232 -0
- data/lib/mechanize/cookie_jar.rb +194 -0
- data/lib/mechanize/download.rb +59 -0
- data/lib/mechanize/element_matcher.rb +36 -0
- data/lib/mechanize/file.rb +65 -0
- data/lib/mechanize/file_connection.rb +17 -0
- data/lib/mechanize/file_request.rb +26 -0
- data/lib/mechanize/file_response.rb +74 -0
- data/lib/mechanize/file_saver.rb +39 -0
- data/lib/mechanize/form.rb +543 -0
- data/lib/mechanize/form/button.rb +6 -0
- data/lib/mechanize/form/check_box.rb +12 -0
- data/lib/mechanize/form/field.rb +54 -0
- data/lib/mechanize/form/file_upload.rb +21 -0
- data/lib/mechanize/form/hidden.rb +3 -0
- data/lib/mechanize/form/image_button.rb +19 -0
- data/lib/mechanize/form/keygen.rb +34 -0
- data/lib/mechanize/form/multi_select_list.rb +94 -0
- data/lib/mechanize/form/option.rb +50 -0
- data/lib/mechanize/form/radio_button.rb +55 -0
- data/lib/mechanize/form/reset.rb +3 -0
- data/lib/mechanize/form/select_list.rb +44 -0
- data/lib/mechanize/form/submit.rb +3 -0
- data/lib/mechanize/form/text.rb +3 -0
- data/lib/mechanize/form/textarea.rb +3 -0
- data/lib/mechanize/headers.rb +23 -0
- data/lib/mechanize/history.rb +82 -0
- data/lib/mechanize/http.rb +8 -0
- data/lib/mechanize/http/agent.rb +1004 -0
- data/lib/mechanize/http/auth_challenge.rb +59 -0
- data/lib/mechanize/http/auth_realm.rb +31 -0
- data/lib/mechanize/http/content_disposition_parser.rb +188 -0
- data/lib/mechanize/http/www_authenticate_parser.rb +155 -0
- data/lib/mechanize/monkey_patch.rb +16 -0
- data/lib/mechanize/page.rb +440 -0
- data/lib/mechanize/page/base.rb +7 -0
- data/lib/mechanize/page/frame.rb +27 -0
- data/lib/mechanize/page/image.rb +30 -0
- data/lib/mechanize/page/label.rb +20 -0
- data/lib/mechanize/page/link.rb +98 -0
- data/lib/mechanize/page/meta_refresh.rb +68 -0
- data/lib/mechanize/parser.rb +173 -0
- data/lib/mechanize/pluggable_parsers.rb +144 -0
- data/lib/mechanize/redirect_limit_reached_error.rb +19 -0
- data/lib/mechanize/redirect_not_get_or_head_error.rb +21 -0
- data/lib/mechanize/response_code_error.rb +21 -0
- data/lib/mechanize/response_read_error.rb +27 -0
- data/lib/mechanize/robots_disallowed_error.rb +28 -0
- data/lib/mechanize/test_case.rb +663 -0
- data/lib/mechanize/unauthorized_error.rb +3 -0
- data/lib/mechanize/unsupported_scheme_error.rb +6 -0
- data/lib/mechanize/util.rb +101 -0
- data/test/data/htpasswd +1 -0
- data/test/data/server.crt +16 -0
- data/test/data/server.csr +12 -0
- data/test/data/server.key +15 -0
- data/test/data/server.pem +15 -0
- data/test/htdocs/alt_text.html +10 -0
- data/test/htdocs/bad_form_test.html +9 -0
- data/test/htdocs/button.jpg +0 -0
- data/test/htdocs/canonical_uri.html +9 -0
- data/test/htdocs/dir with spaces/foo.html +1 -0
- data/test/htdocs/empty_form.html +6 -0
- data/test/htdocs/file_upload.html +26 -0
- data/test/htdocs/find_link.html +41 -0
- data/test/htdocs/form_multi_select.html +16 -0
- data/test/htdocs/form_multival.html +37 -0
- data/test/htdocs/form_no_action.html +18 -0
- data/test/htdocs/form_no_input_name.html +16 -0
- data/test/htdocs/form_order_test.html +11 -0
- data/test/htdocs/form_select.html +16 -0
- data/test/htdocs/form_set_fields.html +14 -0
- data/test/htdocs/form_test.html +188 -0
- data/test/htdocs/frame_referer_test.html +10 -0
- data/test/htdocs/frame_test.html +30 -0
- data/test/htdocs/google.html +13 -0
- data/test/htdocs/index.html +6 -0
- data/test/htdocs/link with space.html +5 -0
- data/test/htdocs/meta_cookie.html +11 -0
- data/test/htdocs/no_title_test.html +6 -0
- data/test/htdocs/noindex.html +9 -0
- data/test/htdocs/rails_3_encoding_hack_form_test.html +27 -0
- data/test/htdocs/relative/tc_relative_links.html +21 -0
- data/test/htdocs/robots.html +8 -0
- data/test/htdocs/robots.txt +2 -0
- data/test/htdocs/tc_bad_charset.html +9 -0
- data/test/htdocs/tc_bad_links.html +5 -0
- data/test/htdocs/tc_base_link.html +8 -0
- data/test/htdocs/tc_blank_form.html +11 -0
- data/test/htdocs/tc_charset.html +6 -0
- data/test/htdocs/tc_checkboxes.html +19 -0
- data/test/htdocs/tc_encoded_links.html +5 -0
- data/test/htdocs/tc_field_precedence.html +11 -0
- data/test/htdocs/tc_follow_meta.html +8 -0
- data/test/htdocs/tc_form_action.html +48 -0
- data/test/htdocs/tc_links.html +19 -0
- data/test/htdocs/tc_meta_in_body.html +9 -0
- data/test/htdocs/tc_pretty_print.html +17 -0
- data/test/htdocs/tc_referer.html +16 -0
- data/test/htdocs/tc_relative_links.html +19 -0
- data/test/htdocs/tc_textarea.html +23 -0
- data/test/htdocs/test_click.html +11 -0
- data/test/htdocs/unusual______.html +5 -0
- data/test/test_mechanize.rb +1164 -0
- data/test/test_mechanize_cookie.rb +451 -0
- data/test/test_mechanize_cookie_jar.rb +483 -0
- data/test/test_mechanize_download.rb +43 -0
- data/test/test_mechanize_file.rb +61 -0
- data/test/test_mechanize_file_connection.rb +21 -0
- data/test/test_mechanize_file_request.rb +19 -0
- data/test/test_mechanize_file_saver.rb +21 -0
- data/test/test_mechanize_form.rb +875 -0
- data/test/test_mechanize_form_check_box.rb +38 -0
- data/test/test_mechanize_form_encoding.rb +114 -0
- data/test/test_mechanize_form_field.rb +63 -0
- data/test/test_mechanize_form_file_upload.rb +20 -0
- data/test/test_mechanize_form_image_button.rb +12 -0
- data/test/test_mechanize_form_keygen.rb +32 -0
- data/test/test_mechanize_form_multi_select_list.rb +84 -0
- data/test/test_mechanize_form_option.rb +55 -0
- data/test/test_mechanize_form_radio_button.rb +78 -0
- data/test/test_mechanize_form_select_list.rb +76 -0
- data/test/test_mechanize_form_textarea.rb +52 -0
- data/test/test_mechanize_headers.rb +35 -0
- data/test/test_mechanize_history.rb +103 -0
- data/test/test_mechanize_http_agent.rb +1225 -0
- data/test/test_mechanize_http_auth_challenge.rb +39 -0
- data/test/test_mechanize_http_auth_realm.rb +49 -0
- data/test/test_mechanize_http_content_disposition_parser.rb +118 -0
- data/test/test_mechanize_http_www_authenticate_parser.rb +146 -0
- data/test/test_mechanize_link.rb +80 -0
- data/test/test_mechanize_page.rb +118 -0
- data/test/test_mechanize_page_encoding.rb +182 -0
- data/test/test_mechanize_page_frame.rb +16 -0
- data/test/test_mechanize_page_link.rb +390 -0
- data/test/test_mechanize_page_meta_refresh.rb +127 -0
- data/test/test_mechanize_parser.rb +289 -0
- data/test/test_mechanize_pluggable_parser.rb +52 -0
- data/test/test_mechanize_redirect_limit_reached_error.rb +24 -0
- data/test/test_mechanize_redirect_not_get_or_head_error.rb +14 -0
- data/test/test_mechanize_subclass.rb +22 -0
- data/test/test_mechanize_util.rb +103 -0
- data/test/test_multi_select.rb +119 -0
- metadata +216 -0
data/EXAMPLES.rdoc
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
= Mechanize examples
|
|
2
|
+
|
|
3
|
+
Note: Several examples show methods chained to the end of do/end blocks.
|
|
4
|
+
Do...end is the same as curly braces ({...}). For example, do ... end.submit
|
|
5
|
+
is the same as { ... }.submit.
|
|
6
|
+
|
|
7
|
+
== Google
|
|
8
|
+
|
|
9
|
+
require 'rubygems'
|
|
10
|
+
require 'mechanize'
|
|
11
|
+
|
|
12
|
+
a = Mechanize.new { |agent|
|
|
13
|
+
agent.user_agent_alias = 'Mac Safari'
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
a.get('http://google.com/') do |page|
|
|
17
|
+
search_result = page.form_with(:name => 'f') do |search|
|
|
18
|
+
search.q = 'Hello world'
|
|
19
|
+
end.submit
|
|
20
|
+
|
|
21
|
+
search_result.links.each do |link|
|
|
22
|
+
puts link.text
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
== Rubyforge
|
|
27
|
+
|
|
28
|
+
require 'rubygems'
|
|
29
|
+
require 'mechanize'
|
|
30
|
+
|
|
31
|
+
a = Mechanize.new
|
|
32
|
+
a.get('http://rubyforge.org/') do |page|
|
|
33
|
+
# Click the login link
|
|
34
|
+
login_page = a.click(page.link_with(:text => /Log In/))
|
|
35
|
+
|
|
36
|
+
# Submit the login form
|
|
37
|
+
my_page = login_page.form_with(:action => '/account/login.php') do |f|
|
|
38
|
+
f.form_loginname = ARGV[0]
|
|
39
|
+
f.form_pw = ARGV[1]
|
|
40
|
+
end.click_button
|
|
41
|
+
|
|
42
|
+
my_page.links.each do |link|
|
|
43
|
+
text = link.text.strip
|
|
44
|
+
next unless text.length > 0
|
|
45
|
+
puts text
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
== File Upload
|
|
50
|
+
|
|
51
|
+
Upload a file to flickr.
|
|
52
|
+
|
|
53
|
+
require 'rubygems'
|
|
54
|
+
require 'mechanize'
|
|
55
|
+
|
|
56
|
+
abort "#{$0} login passwd filename" if (ARGV.size != 3)
|
|
57
|
+
|
|
58
|
+
a = Mechanize.new { |agent|
|
|
59
|
+
# Flickr refreshes after login
|
|
60
|
+
agent.follow_meta_refresh = true
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
a.get('http://flickr.com/') do |home_page|
|
|
64
|
+
signin_page = a.click(home_page.link_with(:text => /Sign In/))
|
|
65
|
+
|
|
66
|
+
my_page = signin_page.form_with(:name => 'login_form') do |form|
|
|
67
|
+
form.login = ARGV[0]
|
|
68
|
+
form.passwd = ARGV[1]
|
|
69
|
+
end.submit
|
|
70
|
+
|
|
71
|
+
# Click the upload link
|
|
72
|
+
upload_page = a.click(my_page.link_with(:text => /Upload/))
|
|
73
|
+
|
|
74
|
+
# We want the basic upload page.
|
|
75
|
+
upload_page = a.click(upload_page.link_with(:text => /basic Uploader/))
|
|
76
|
+
|
|
77
|
+
# Upload the file
|
|
78
|
+
upload_page.form_with(:method => 'POST') do |upload_form|
|
|
79
|
+
upload_form.file_uploads.first.file_name = ARGV[2]
|
|
80
|
+
end.submit
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
== Pluggable Parsers
|
|
84
|
+
Let's say you want HTML pages to be parsed automatically with Rubyful Soup.
|
|
85
|
+
This example shows you how:
|
|
86
|
+
|
|
87
|
+
require 'rubygems'
|
|
88
|
+
require 'mechanize'
|
|
89
|
+
require 'rubyful_soup'
|
|
90
|
+
|
|
91
|
+
class SoupParser < Mechanize::Page
|
|
92
|
+
attr_reader :soup
|
|
93
|
+
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
|
94
|
+
@soup = BeautifulSoup.new(body)
|
|
95
|
+
super(uri, response, body, code)
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
agent = Mechanize.new
|
|
100
|
+
agent.pluggable_parser.html = SoupParser
|
|
101
|
+
|
|
102
|
+
Now all HTML pages will be parsed with the SoupParser class, and automatically
|
|
103
|
+
give you access to a method called 'soup' where you can get access to the
|
|
104
|
+
Beautiful Soup for that page.
|
|
105
|
+
|
|
106
|
+
== Using a proxy
|
|
107
|
+
|
|
108
|
+
require 'rubygems'
|
|
109
|
+
require 'mechanize'
|
|
110
|
+
|
|
111
|
+
agent = Mechanize.new
|
|
112
|
+
agent.set_proxy 'localhost', 8000
|
|
113
|
+
page = agent.get(ARGV[0])
|
|
114
|
+
puts page.body
|
|
115
|
+
|
|
116
|
+
== The transact method
|
|
117
|
+
|
|
118
|
+
transact runs the given block and then resets the page history. I.e. after the
|
|
119
|
+
block has been executed, you're back at the original page; no need to count how
|
|
120
|
+
many times to call the back method at the end of a loop (while accounting for
|
|
121
|
+
possible exceptions).
|
|
122
|
+
|
|
123
|
+
This example also demonstrates subclassing Mechanize.
|
|
124
|
+
|
|
125
|
+
require 'rubygems'
|
|
126
|
+
require 'mechanize'
|
|
127
|
+
|
|
128
|
+
class TestMech < Mechanize
|
|
129
|
+
def process
|
|
130
|
+
get 'http://rubyforge.org/'
|
|
131
|
+
search_form = page.forms.first
|
|
132
|
+
search_form.words = 'WWW'
|
|
133
|
+
submit search_form
|
|
134
|
+
|
|
135
|
+
page.links_with(:href => %r{/projects/} ).each do |link|
|
|
136
|
+
next if link.href =~ %r{/projects/support/}
|
|
137
|
+
|
|
138
|
+
puts 'Loading %-30s %s' % [link.href, link.text]
|
|
139
|
+
begin
|
|
140
|
+
transact do
|
|
141
|
+
click link
|
|
142
|
+
# Do stuff, maybe click more links.
|
|
143
|
+
end
|
|
144
|
+
# Now we're back at the original page.
|
|
145
|
+
|
|
146
|
+
rescue => e
|
|
147
|
+
$stderr.puts "#{e.class}: #{e.message}"
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
TestMech.new.process
|
|
154
|
+
|
|
155
|
+
== Client Certificate Authentication (Mutual Auth)
|
|
156
|
+
|
|
157
|
+
In most cases a client certificate is created as an additional layer of security
|
|
158
|
+
for certain websites. The specific case that this was initially tested on was
|
|
159
|
+
for automating the download of archived images from a bank's (Wachovia) lockbox
|
|
160
|
+
system. Once the certificate is installed into your browser you will have to
|
|
161
|
+
export it and split the certificate and private key into separate files.
|
|
162
|
+
Exported files are usually in .p12 format (IE 7 & Firefox 2.0) which stands for
|
|
163
|
+
PKCS #12. You can convert them from p12 to pem format by using the following
|
|
164
|
+
commands:
|
|
165
|
+
|
|
166
|
+
openssl.exe pkcs12 -in input_file.p12 -clcerts -out example.key -nocerts -nodes
|
|
167
|
+
openssl.exe pkcs12 -in input_file.p12 -clcerts -out example.cer -nokeys
|
|
168
|
+
|
|
169
|
+
require 'rubygems'
|
|
170
|
+
require 'mechanize'
|
|
171
|
+
|
|
172
|
+
# create Mechanize instance
|
|
173
|
+
agent = Mechanize.new
|
|
174
|
+
|
|
175
|
+
# set the path of the certificate file
|
|
176
|
+
agent.cert = 'example.cer'
|
|
177
|
+
|
|
178
|
+
# set the path of the private key file
|
|
179
|
+
agent.key = 'example.key'
|
|
180
|
+
|
|
181
|
+
# get the login form & fill it out with the username/password
|
|
182
|
+
login_form = agent.get("http://example.com/login_page").form('Login')
|
|
183
|
+
login_form.Userid = 'TestUser'
|
|
184
|
+
login_form.Password = 'TestPassword'
|
|
185
|
+
|
|
186
|
+
# submit login form
|
|
187
|
+
agent.submit(login_form, login_form.buttons.first)
|
data/FAQ.rdoc
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Q:: Why do I keep getting an EOFError?
|
|
2
|
+
|
|
3
|
+
A:: For older versions of mechanize, turning off keep_alive could help with the
|
|
4
|
+
problem, but mechanize now has more robust handling of persistent
|
|
5
|
+
connections.
|
|
6
|
+
|
|
7
|
+
Older versions of mechanize would raise an EOFError when a chunked body was
|
|
8
|
+
not terminated properly, a common bug of IIS servers. Since 2.0
|
|
9
|
+
Mechanize::ResponseReadError is raised containing the original response and
|
|
10
|
+
body read so far, so if the server is broken you can still retrieve the
|
|
11
|
+
entire content.
|
data/GUIDE.rdoc
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
= Getting Started With Mechanize
|
|
2
|
+
|
|
3
|
+
This guide is meant to get you started using Mechanize. By the end of this
|
|
4
|
+
guide, you should be able to fetch pages, click links, fill out and submit
|
|
5
|
+
forms, scrape data, and many other hopefully useful things. This guide
|
|
6
|
+
really just scratches the surface of what is available, but should be enough
|
|
7
|
+
information to get you really going!
|
|
8
|
+
|
|
9
|
+
== Let's Fetch a Page!
|
|
10
|
+
|
|
11
|
+
First things first. Make sure that you've required mechanize and that you
|
|
12
|
+
instantiate a new mechanize object:
|
|
13
|
+
|
|
14
|
+
require 'rubygems'
|
|
15
|
+
require 'mechanize'
|
|
16
|
+
|
|
17
|
+
agent = Mechanize.new
|
|
18
|
+
|
|
19
|
+
Now we'll use the agent we've created to fetch a page. Let's fetch google
|
|
20
|
+
with our mechanize agent:
|
|
21
|
+
|
|
22
|
+
page = agent.get('http://google.com/')
|
|
23
|
+
|
|
24
|
+
What just happened? We told mechanize to go pick up google's main page.
|
|
25
|
+
Mechanize stored any cookies that were set, and followed any redirects that
|
|
26
|
+
google may have sent. The agent gave us back a page that we can use to
|
|
27
|
+
scrape data, find links to click, or find forms to fill out.
|
|
28
|
+
|
|
29
|
+
Next, let's try finding some links to click.
|
|
30
|
+
|
|
31
|
+
== Finding Links
|
|
32
|
+
|
|
33
|
+
Mechanize returns a page object whenever you get a page, post, or submit a
|
|
34
|
+
form. When a page is fetched, the agent will parse the page and put a list
|
|
35
|
+
of links on the page object.
|
|
36
|
+
|
|
37
|
+
Now that we've fetched google's homepage, let's try listing all of the links:
|
|
38
|
+
|
|
39
|
+
page.links.each do |link|
|
|
40
|
+
puts link.text
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
We can list the links, but Mechanize gives a few shortcuts to help us find a
|
|
44
|
+
link to click on. Let's say we wanted to click the link whose text is 'News'.
|
|
45
|
+
Normally, we would have to do this:
|
|
46
|
+
|
|
47
|
+
page = agent.page.links.find { |l| l.text == 'News' }.click
|
|
48
|
+
|
|
49
|
+
But Mechanize gives us a shortcut. Instead we can say this:
|
|
50
|
+
|
|
51
|
+
page = agent.page.link_with(:text => 'News').click
|
|
52
|
+
|
|
53
|
+
That shortcut says "find the first link with the text 'News'". You're probably
|
|
54
|
+
thinking "there could be multiple links with that text!", and you would be
|
|
55
|
+
correct! If you use the plural form, you can access the list.
|
|
56
|
+
If you wanted to click on the second news link, you could do this:
|
|
57
|
+
|
|
58
|
+
agent.page.links_with(:text => 'News')[1].click
|
|
59
|
+
|
|
60
|
+
We can even find a link with a certain href like so:
|
|
61
|
+
|
|
62
|
+
page.link_with(:href => '/something')
|
|
63
|
+
|
|
64
|
+
Or chain them together to find a link with certain text and certain href:
|
|
65
|
+
|
|
66
|
+
page.link_with(:text => 'News', :href => '/something')
|
|
67
|
+
|
|
68
|
+
These shortcuts that mechanize provides are available on any list that you
|
|
69
|
+
can fetch like frames, iframes, or forms. Now that we know how to find and
|
|
70
|
+
click links, let's try something more complicated like filling out a form.
|
|
71
|
+
|
|
72
|
+
== Filling Out Forms
|
|
73
|
+
|
|
74
|
+
Let's continue with our google example. Here's the code we have so far:
|
|
75
|
+
require 'rubygems'
|
|
76
|
+
require 'mechanize'
|
|
77
|
+
|
|
78
|
+
agent = Mechanize.new
|
|
79
|
+
page = agent.get('http://google.com/')
|
|
80
|
+
|
|
81
|
+
If we pretty print the page, we can see that there is one form named 'f',
|
|
82
|
+
that has a couple buttons and a few fields:
|
|
83
|
+
|
|
84
|
+
pp page
|
|
85
|
+
|
|
86
|
+
Now that we know the name of the form, let's fetch it off the page:
|
|
87
|
+
|
|
88
|
+
google_form = page.form('f')
|
|
89
|
+
|
|
90
|
+
Mechanize lets you access form input fields in a few different ways, but the
|
|
91
|
+
most convenient is that you can access input fields as accessors on the
|
|
92
|
+
object. So let's set the form field named 'q' on the form to 'ruby mechanize':
|
|
93
|
+
|
|
94
|
+
google_form.q = 'ruby mechanize'
|
|
95
|
+
|
|
96
|
+
To make sure that we set the value, let's pretty print the form, and you should
|
|
97
|
+
see a line similar to this:
|
|
98
|
+
|
|
99
|
+
#<Mechanize::Field:0x1403488 @name="q", @value="ruby mechanize">
|
|
100
|
+
|
|
101
|
+
If you saw that the value of 'q' changed, you're on the right track! Now we
|
|
102
|
+
can submit the form and 'press' the submit button and print the results:
|
|
103
|
+
|
|
104
|
+
page = agent.submit(google_form, google_form.buttons.first)
|
|
105
|
+
pp page
|
|
106
|
+
|
|
107
|
+
What we just did was equivalent to putting text in the search field and
|
|
108
|
+
clicking the 'Google Search' button. If we had submitted the form without
|
|
109
|
+
a button, it would be like typing in the text field and hitting the return
|
|
110
|
+
key.
|
|
111
|
+
|
|
112
|
+
Let's take a look at the code all together:
|
|
113
|
+
|
|
114
|
+
require 'rubygems'
|
|
115
|
+
require 'mechanize'
|
|
116
|
+
|
|
117
|
+
agent = Mechanize.new
|
|
118
|
+
page = agent.get('http://google.com/')
|
|
119
|
+
google_form = page.form('f')
|
|
120
|
+
google_form.q = 'ruby mechanize'
|
|
121
|
+
page = agent.submit(google_form)
|
|
122
|
+
pp page
|
|
123
|
+
|
|
124
|
+
Before we go on to screen scraping, let's take a look at forms a little more
|
|
125
|
+
in depth. Unless you want to skip ahead!
|
|
126
|
+
|
|
127
|
+
== Advanced Form Techniques
|
|
128
|
+
|
|
129
|
+
In this section, I want to touch on using the different types of input fields
|
|
130
|
+
possible with a form. Password and textarea fields can be treated just like
|
|
131
|
+
text input fields. Select fields are very similar to text fields, but they
|
|
132
|
+
have many options associated with them. If you select one option, mechanize
|
|
133
|
+
will deselect the other options (unless it is a multi select!).
|
|
134
|
+
|
|
135
|
+
For example, let's select an option on a list:
|
|
136
|
+
|
|
137
|
+
form.field_with(:name => 'list').options[0].select
|
|
138
|
+
|
|
139
|
+
Now let's take a look at checkboxes and radio buttons. To select a checkbox,
|
|
140
|
+
just check it like this:
|
|
141
|
+
|
|
142
|
+
form.checkbox_with(:name => 'box').check
|
|
143
|
+
|
|
144
|
+
Radio buttons are very similar to checkboxes, but they know how to uncheck
|
|
145
|
+
other radio buttons of the same name. Just check a radio button like you
|
|
146
|
+
would a checkbox:
|
|
147
|
+
|
|
148
|
+
form.radiobuttons_with(:name => 'box')[1].check
|
|
149
|
+
|
|
150
|
+
Mechanize also makes file uploads easy! Just find the file upload field, and
|
|
151
|
+
tell it what file name you want to upload:
|
|
152
|
+
|
|
153
|
+
form.file_uploads.first.file_name = "somefile.jpg"
|
|
154
|
+
|
|
155
|
+
== Scraping Data
|
|
156
|
+
|
|
157
|
+
Mechanize uses nokogiri[http://nokogiri.org/] to parse
|
|
158
|
+
html. What does this mean for you? You can treat a mechanize page like
|
|
159
|
+
a nokogiri object. After you have used Mechanize to navigate to the page
|
|
160
|
+
that you need to scrape, then scrape it using nokogiri methods:
|
|
161
|
+
|
|
162
|
+
agent.get('http://someurl.com/').search(".//p[@class='posted']")
|
|
163
|
+
|
data/LICENSE.rdoc
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
(The MIT License)
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
'Software'), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
.autotest
|
|
2
|
+
CHANGELOG.rdoc
|
|
3
|
+
EXAMPLES.rdoc
|
|
4
|
+
FAQ.rdoc
|
|
5
|
+
GUIDE.rdoc
|
|
6
|
+
LICENSE.rdoc
|
|
7
|
+
Manifest.txt
|
|
8
|
+
README.rdoc
|
|
9
|
+
Rakefile
|
|
10
|
+
examples/flickr_upload.rb
|
|
11
|
+
examples/mech-dump.rb
|
|
12
|
+
examples/proxy_req.rb
|
|
13
|
+
examples/rubyforge.rb
|
|
14
|
+
examples/spider.rb
|
|
15
|
+
lib/mechanize.rb
|
|
16
|
+
lib/mechanize/content_type_error.rb
|
|
17
|
+
lib/mechanize/cookie.rb
|
|
18
|
+
lib/mechanize/cookie_jar.rb
|
|
19
|
+
lib/mechanize/download.rb
|
|
20
|
+
lib/mechanize/element_matcher.rb
|
|
21
|
+
lib/mechanize/file.rb
|
|
22
|
+
lib/mechanize/file_connection.rb
|
|
23
|
+
lib/mechanize/file_request.rb
|
|
24
|
+
lib/mechanize/file_response.rb
|
|
25
|
+
lib/mechanize/file_saver.rb
|
|
26
|
+
lib/mechanize/form.rb
|
|
27
|
+
lib/mechanize/form/button.rb
|
|
28
|
+
lib/mechanize/form/check_box.rb
|
|
29
|
+
lib/mechanize/form/field.rb
|
|
30
|
+
lib/mechanize/form/file_upload.rb
|
|
31
|
+
lib/mechanize/form/hidden.rb
|
|
32
|
+
lib/mechanize/form/image_button.rb
|
|
33
|
+
lib/mechanize/form/keygen.rb
|
|
34
|
+
lib/mechanize/form/multi_select_list.rb
|
|
35
|
+
lib/mechanize/form/option.rb
|
|
36
|
+
lib/mechanize/form/radio_button.rb
|
|
37
|
+
lib/mechanize/form/reset.rb
|
|
38
|
+
lib/mechanize/form/select_list.rb
|
|
39
|
+
lib/mechanize/form/submit.rb
|
|
40
|
+
lib/mechanize/form/text.rb
|
|
41
|
+
lib/mechanize/form/textarea.rb
|
|
42
|
+
lib/mechanize/headers.rb
|
|
43
|
+
lib/mechanize/history.rb
|
|
44
|
+
lib/mechanize/http.rb
|
|
45
|
+
lib/mechanize/http/agent.rb
|
|
46
|
+
lib/mechanize/http/auth_challenge.rb
|
|
47
|
+
lib/mechanize/http/auth_realm.rb
|
|
48
|
+
lib/mechanize/http/content_disposition_parser.rb
|
|
49
|
+
lib/mechanize/http/www_authenticate_parser.rb
|
|
50
|
+
lib/mechanize/monkey_patch.rb
|
|
51
|
+
lib/mechanize/page.rb
|
|
52
|
+
lib/mechanize/page/base.rb
|
|
53
|
+
lib/mechanize/page/frame.rb
|
|
54
|
+
lib/mechanize/page/image.rb
|
|
55
|
+
lib/mechanize/page/label.rb
|
|
56
|
+
lib/mechanize/page/link.rb
|
|
57
|
+
lib/mechanize/page/meta_refresh.rb
|
|
58
|
+
lib/mechanize/parser.rb
|
|
59
|
+
lib/mechanize/pluggable_parsers.rb
|
|
60
|
+
lib/mechanize/redirect_limit_reached_error.rb
|
|
61
|
+
lib/mechanize/redirect_not_get_or_head_error.rb
|
|
62
|
+
lib/mechanize/response_code_error.rb
|
|
63
|
+
lib/mechanize/response_read_error.rb
|
|
64
|
+
lib/mechanize/robots_disallowed_error.rb
|
|
65
|
+
lib/mechanize/test_case.rb
|
|
66
|
+
lib/mechanize/unauthorized_error.rb
|
|
67
|
+
lib/mechanize/unsupported_scheme_error.rb
|
|
68
|
+
lib/mechanize/util.rb
|
|
69
|
+
test/data/htpasswd
|
|
70
|
+
test/data/server.crt
|
|
71
|
+
test/data/server.csr
|
|
72
|
+
test/data/server.key
|
|
73
|
+
test/data/server.pem
|
|
74
|
+
test/htdocs/alt_text.html
|
|
75
|
+
test/htdocs/bad_form_test.html
|
|
76
|
+
test/htdocs/button.jpg
|
|
77
|
+
test/htdocs/canonical_uri.html
|
|
78
|
+
test/htdocs/dir with spaces/foo.html
|
|
79
|
+
test/htdocs/empty_form.html
|
|
80
|
+
test/htdocs/file_upload.html
|
|
81
|
+
test/htdocs/find_link.html
|
|
82
|
+
test/htdocs/form_multi_select.html
|
|
83
|
+
test/htdocs/form_multival.html
|
|
84
|
+
test/htdocs/form_no_action.html
|
|
85
|
+
test/htdocs/form_no_input_name.html
|
|
86
|
+
test/htdocs/form_order_test.html
|
|
87
|
+
test/htdocs/form_select.html
|
|
88
|
+
test/htdocs/form_set_fields.html
|
|
89
|
+
test/htdocs/form_test.html
|
|
90
|
+
test/htdocs/frame_referer_test.html
|
|
91
|
+
test/htdocs/frame_test.html
|
|
92
|
+
test/htdocs/google.html
|
|
93
|
+
test/htdocs/index.html
|
|
94
|
+
test/htdocs/link with space.html
|
|
95
|
+
test/htdocs/meta_cookie.html
|
|
96
|
+
test/htdocs/no_title_test.html
|
|
97
|
+
test/htdocs/noindex.html
|
|
98
|
+
test/htdocs/rails_3_encoding_hack_form_test.html
|
|
99
|
+
test/htdocs/relative/tc_relative_links.html
|
|
100
|
+
test/htdocs/robots.html
|
|
101
|
+
test/htdocs/robots.txt
|
|
102
|
+
test/htdocs/tc_bad_charset.html
|
|
103
|
+
test/htdocs/tc_bad_links.html
|
|
104
|
+
test/htdocs/tc_base_link.html
|
|
105
|
+
test/htdocs/tc_blank_form.html
|
|
106
|
+
test/htdocs/tc_charset.html
|
|
107
|
+
test/htdocs/tc_checkboxes.html
|
|
108
|
+
test/htdocs/tc_encoded_links.html
|
|
109
|
+
test/htdocs/tc_field_precedence.html
|
|
110
|
+
test/htdocs/tc_follow_meta.html
|
|
111
|
+
test/htdocs/tc_form_action.html
|
|
112
|
+
test/htdocs/tc_links.html
|
|
113
|
+
test/htdocs/tc_meta_in_body.html
|
|
114
|
+
test/htdocs/tc_pretty_print.html
|
|
115
|
+
test/htdocs/tc_referer.html
|
|
116
|
+
test/htdocs/tc_relative_links.html
|
|
117
|
+
test/htdocs/tc_textarea.html
|
|
118
|
+
test/htdocs/test_click.html
|
|
119
|
+
test/htdocs/unusual______.html
|
|
120
|
+
test/test_mechanize.rb
|
|
121
|
+
test/test_mechanize_cookie.rb
|
|
122
|
+
test/test_mechanize_cookie_jar.rb
|
|
123
|
+
test/test_mechanize_download.rb
|
|
124
|
+
test/test_mechanize_file.rb
|
|
125
|
+
test/test_mechanize_file_connection.rb
|
|
126
|
+
test/test_mechanize_file_request.rb
|
|
127
|
+
test/test_mechanize_file_saver.rb
|
|
128
|
+
test/test_mechanize_form.rb
|
|
129
|
+
test/test_mechanize_form_check_box.rb
|
|
130
|
+
test/test_mechanize_form_encoding.rb
|
|
131
|
+
test/test_mechanize_form_field.rb
|
|
132
|
+
test/test_mechanize_form_file_upload.rb
|
|
133
|
+
test/test_mechanize_form_image_button.rb
|
|
134
|
+
test/test_mechanize_form_keygen.rb
|
|
135
|
+
test/test_mechanize_form_multi_select_list.rb
|
|
136
|
+
test/test_mechanize_form_option.rb
|
|
137
|
+
test/test_mechanize_form_radio_button.rb
|
|
138
|
+
test/test_mechanize_form_select_list.rb
|
|
139
|
+
test/test_mechanize_form_textarea.rb
|
|
140
|
+
test/test_mechanize_headers.rb
|
|
141
|
+
test/test_mechanize_history.rb
|
|
142
|
+
test/test_mechanize_http_agent.rb
|
|
143
|
+
test/test_mechanize_http_auth_challenge.rb
|
|
144
|
+
test/test_mechanize_http_auth_realm.rb
|
|
145
|
+
test/test_mechanize_http_content_disposition_parser.rb
|
|
146
|
+
test/test_mechanize_http_www_authenticate_parser.rb
|
|
147
|
+
test/test_mechanize_link.rb
|
|
148
|
+
test/test_mechanize_page.rb
|
|
149
|
+
test/test_mechanize_page_encoding.rb
|
|
150
|
+
test/test_mechanize_page_frame.rb
|
|
151
|
+
test/test_mechanize_page_link.rb
|
|
152
|
+
test/test_mechanize_page_meta_refresh.rb
|
|
153
|
+
test/test_mechanize_parser.rb
|
|
154
|
+
test/test_mechanize_pluggable_parser.rb
|
|
155
|
+
test/test_mechanize_redirect_limit_reached_error.rb
|
|
156
|
+
test/test_mechanize_redirect_not_get_or_head_error.rb
|
|
157
|
+
test/test_mechanize_subclass.rb
|
|
158
|
+
test/test_mechanize_util.rb
|
|
159
|
+
test/test_multi_select.rb
|