diamond-mechanize 2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (154) hide show
  1. data/CHANGELOG.rdoc +718 -0
  2. data/EXAMPLES.rdoc +187 -0
  3. data/FAQ.rdoc +11 -0
  4. data/GUIDE.rdoc +163 -0
  5. data/LICENSE.rdoc +20 -0
  6. data/Manifest.txt +159 -0
  7. data/README.rdoc +64 -0
  8. data/Rakefile +49 -0
  9. data/lib/mechanize.rb +1079 -0
  10. data/lib/mechanize/content_type_error.rb +13 -0
  11. data/lib/mechanize/cookie.rb +232 -0
  12. data/lib/mechanize/cookie_jar.rb +194 -0
  13. data/lib/mechanize/download.rb +59 -0
  14. data/lib/mechanize/element_matcher.rb +36 -0
  15. data/lib/mechanize/file.rb +65 -0
  16. data/lib/mechanize/file_connection.rb +17 -0
  17. data/lib/mechanize/file_request.rb +26 -0
  18. data/lib/mechanize/file_response.rb +74 -0
  19. data/lib/mechanize/file_saver.rb +39 -0
  20. data/lib/mechanize/form.rb +543 -0
  21. data/lib/mechanize/form/button.rb +6 -0
  22. data/lib/mechanize/form/check_box.rb +12 -0
  23. data/lib/mechanize/form/field.rb +54 -0
  24. data/lib/mechanize/form/file_upload.rb +21 -0
  25. data/lib/mechanize/form/hidden.rb +3 -0
  26. data/lib/mechanize/form/image_button.rb +19 -0
  27. data/lib/mechanize/form/keygen.rb +34 -0
  28. data/lib/mechanize/form/multi_select_list.rb +94 -0
  29. data/lib/mechanize/form/option.rb +50 -0
  30. data/lib/mechanize/form/radio_button.rb +55 -0
  31. data/lib/mechanize/form/reset.rb +3 -0
  32. data/lib/mechanize/form/select_list.rb +44 -0
  33. data/lib/mechanize/form/submit.rb +3 -0
  34. data/lib/mechanize/form/text.rb +3 -0
  35. data/lib/mechanize/form/textarea.rb +3 -0
  36. data/lib/mechanize/headers.rb +23 -0
  37. data/lib/mechanize/history.rb +82 -0
  38. data/lib/mechanize/http.rb +8 -0
  39. data/lib/mechanize/http/agent.rb +1004 -0
  40. data/lib/mechanize/http/auth_challenge.rb +59 -0
  41. data/lib/mechanize/http/auth_realm.rb +31 -0
  42. data/lib/mechanize/http/content_disposition_parser.rb +188 -0
  43. data/lib/mechanize/http/www_authenticate_parser.rb +155 -0
  44. data/lib/mechanize/monkey_patch.rb +16 -0
  45. data/lib/mechanize/page.rb +440 -0
  46. data/lib/mechanize/page/base.rb +7 -0
  47. data/lib/mechanize/page/frame.rb +27 -0
  48. data/lib/mechanize/page/image.rb +30 -0
  49. data/lib/mechanize/page/label.rb +20 -0
  50. data/lib/mechanize/page/link.rb +98 -0
  51. data/lib/mechanize/page/meta_refresh.rb +68 -0
  52. data/lib/mechanize/parser.rb +173 -0
  53. data/lib/mechanize/pluggable_parsers.rb +144 -0
  54. data/lib/mechanize/redirect_limit_reached_error.rb +19 -0
  55. data/lib/mechanize/redirect_not_get_or_head_error.rb +21 -0
  56. data/lib/mechanize/response_code_error.rb +21 -0
  57. data/lib/mechanize/response_read_error.rb +27 -0
  58. data/lib/mechanize/robots_disallowed_error.rb +28 -0
  59. data/lib/mechanize/test_case.rb +663 -0
  60. data/lib/mechanize/unauthorized_error.rb +3 -0
  61. data/lib/mechanize/unsupported_scheme_error.rb +6 -0
  62. data/lib/mechanize/util.rb +101 -0
  63. data/test/data/htpasswd +1 -0
  64. data/test/data/server.crt +16 -0
  65. data/test/data/server.csr +12 -0
  66. data/test/data/server.key +15 -0
  67. data/test/data/server.pem +15 -0
  68. data/test/htdocs/alt_text.html +10 -0
  69. data/test/htdocs/bad_form_test.html +9 -0
  70. data/test/htdocs/button.jpg +0 -0
  71. data/test/htdocs/canonical_uri.html +9 -0
  72. data/test/htdocs/dir with spaces/foo.html +1 -0
  73. data/test/htdocs/empty_form.html +6 -0
  74. data/test/htdocs/file_upload.html +26 -0
  75. data/test/htdocs/find_link.html +41 -0
  76. data/test/htdocs/form_multi_select.html +16 -0
  77. data/test/htdocs/form_multival.html +37 -0
  78. data/test/htdocs/form_no_action.html +18 -0
  79. data/test/htdocs/form_no_input_name.html +16 -0
  80. data/test/htdocs/form_order_test.html +11 -0
  81. data/test/htdocs/form_select.html +16 -0
  82. data/test/htdocs/form_set_fields.html +14 -0
  83. data/test/htdocs/form_test.html +188 -0
  84. data/test/htdocs/frame_referer_test.html +10 -0
  85. data/test/htdocs/frame_test.html +30 -0
  86. data/test/htdocs/google.html +13 -0
  87. data/test/htdocs/index.html +6 -0
  88. data/test/htdocs/link with space.html +5 -0
  89. data/test/htdocs/meta_cookie.html +11 -0
  90. data/test/htdocs/no_title_test.html +6 -0
  91. data/test/htdocs/noindex.html +9 -0
  92. data/test/htdocs/rails_3_encoding_hack_form_test.html +27 -0
  93. data/test/htdocs/relative/tc_relative_links.html +21 -0
  94. data/test/htdocs/robots.html +8 -0
  95. data/test/htdocs/robots.txt +2 -0
  96. data/test/htdocs/tc_bad_charset.html +9 -0
  97. data/test/htdocs/tc_bad_links.html +5 -0
  98. data/test/htdocs/tc_base_link.html +8 -0
  99. data/test/htdocs/tc_blank_form.html +11 -0
  100. data/test/htdocs/tc_charset.html +6 -0
  101. data/test/htdocs/tc_checkboxes.html +19 -0
  102. data/test/htdocs/tc_encoded_links.html +5 -0
  103. data/test/htdocs/tc_field_precedence.html +11 -0
  104. data/test/htdocs/tc_follow_meta.html +8 -0
  105. data/test/htdocs/tc_form_action.html +48 -0
  106. data/test/htdocs/tc_links.html +19 -0
  107. data/test/htdocs/tc_meta_in_body.html +9 -0
  108. data/test/htdocs/tc_pretty_print.html +17 -0
  109. data/test/htdocs/tc_referer.html +16 -0
  110. data/test/htdocs/tc_relative_links.html +19 -0
  111. data/test/htdocs/tc_textarea.html +23 -0
  112. data/test/htdocs/test_click.html +11 -0
  113. data/test/htdocs/unusual______.html +5 -0
  114. data/test/test_mechanize.rb +1164 -0
  115. data/test/test_mechanize_cookie.rb +451 -0
  116. data/test/test_mechanize_cookie_jar.rb +483 -0
  117. data/test/test_mechanize_download.rb +43 -0
  118. data/test/test_mechanize_file.rb +61 -0
  119. data/test/test_mechanize_file_connection.rb +21 -0
  120. data/test/test_mechanize_file_request.rb +19 -0
  121. data/test/test_mechanize_file_saver.rb +21 -0
  122. data/test/test_mechanize_form.rb +875 -0
  123. data/test/test_mechanize_form_check_box.rb +38 -0
  124. data/test/test_mechanize_form_encoding.rb +114 -0
  125. data/test/test_mechanize_form_field.rb +63 -0
  126. data/test/test_mechanize_form_file_upload.rb +20 -0
  127. data/test/test_mechanize_form_image_button.rb +12 -0
  128. data/test/test_mechanize_form_keygen.rb +32 -0
  129. data/test/test_mechanize_form_multi_select_list.rb +84 -0
  130. data/test/test_mechanize_form_option.rb +55 -0
  131. data/test/test_mechanize_form_radio_button.rb +78 -0
  132. data/test/test_mechanize_form_select_list.rb +76 -0
  133. data/test/test_mechanize_form_textarea.rb +52 -0
  134. data/test/test_mechanize_headers.rb +35 -0
  135. data/test/test_mechanize_history.rb +103 -0
  136. data/test/test_mechanize_http_agent.rb +1225 -0
  137. data/test/test_mechanize_http_auth_challenge.rb +39 -0
  138. data/test/test_mechanize_http_auth_realm.rb +49 -0
  139. data/test/test_mechanize_http_content_disposition_parser.rb +118 -0
  140. data/test/test_mechanize_http_www_authenticate_parser.rb +146 -0
  141. data/test/test_mechanize_link.rb +80 -0
  142. data/test/test_mechanize_page.rb +118 -0
  143. data/test/test_mechanize_page_encoding.rb +182 -0
  144. data/test/test_mechanize_page_frame.rb +16 -0
  145. data/test/test_mechanize_page_link.rb +390 -0
  146. data/test/test_mechanize_page_meta_refresh.rb +127 -0
  147. data/test/test_mechanize_parser.rb +289 -0
  148. data/test/test_mechanize_pluggable_parser.rb +52 -0
  149. data/test/test_mechanize_redirect_limit_reached_error.rb +24 -0
  150. data/test/test_mechanize_redirect_not_get_or_head_error.rb +14 -0
  151. data/test/test_mechanize_subclass.rb +22 -0
  152. data/test/test_mechanize_util.rb +103 -0
  153. data/test/test_multi_select.rb +119 -0
  154. metadata +216 -0
@@ -0,0 +1,440 @@
##
# This class encapsulates an HTML page.  If Mechanize finds a content
# type of 'text/html', this class will be instantiated and returned.
#
# Example:
#
#   require 'mechanize'
#
#   agent = Mechanize.new
#   agent.get('http://google.com/').class # => Mechanize::Page

class Mechanize::Page < Mechanize::File
  extend Forwardable
  extend Mechanize::ElementMatcher

  ##
  # The Mechanize instance this page belongs to (may be nil).

  attr_accessor :mech

  ##
  # Possible encodings for this page based on HTTP headers and meta elements

  attr_reader :encodings

  ##
  # Builds a page from +uri+, +response+, +body+ and +code+, which are
  # forwarded unchanged to the superclass initializer.
  #
  # Raises Mechanize::ContentTypeError unless the response's content-type
  # header starts with <tt>text/html</tt> or <tt>application/xhtml+xml</tt>.
  # Raises a RuntimeError when +mech+ is given but is not a Mechanize.
  #
  # Candidate encodings are collected into #encodings in this order: nil,
  # the charset detected from the body, charsets from the response headers,
  # charsets from meta tags, and finally the agent's default encoding.
  # #parser tries them from last to first.
  #
  # Note that +body+ is re-tagged as ASCII-8BIT in place when it supports
  # +force_encoding+.

  def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
    # Both alternatives are grouped so that the anchor applies to each of
    # them; "^(a)|(b)" would only anchor the first alternative.
    raise Mechanize::ContentTypeError, response['content-type'] unless
      response['content-type'] =~ /\A(?:text\/html|application\/xhtml\+xml)/i

    @meta_content_type = nil
    @encoding = nil
    @encodings = [nil]
    raise 'no' if mech && !(Mechanize === mech)
    @mech = mech

    reset

    @encodings << Mechanize::Util.detect_charset(body) if body

    @encodings.concat self.class.response_header_charset(response)

    if body
      # Force the encoding to be 8BIT so we can perform regular expressions.
      # We'll set it to the detected encoding later
      body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding

      @encodings.concat self.class.meta_charset(body)

      meta_content_type = self.class.meta_content_type body
      @meta_content_type = meta_content_type if meta_content_type
    end

    @encodings << mech.default_encoding if mech && mech.default_encoding

    super uri, response, body, code
  end

  ##
  # Returns the inner text of the page's +title+ elements, or nil when that
  # text is empty or there is no parser.

  def title
    @title ||=
      if doc = parser
        title = doc.search('title').inner_text
        title.empty? ? nil : title
      end
  end

  ##
  # Charsets declared in this page's response headers.

  def response_header_charset
    self.class.response_header_charset(response)
  end

  ##
  # Charsets declared by meta tags in this page's body.

  def meta_charset
    self.class.meta_charset(body)
  end

  ##
  # The charset Mechanize::Util.detect_charset reports for the body.

  def detected_encoding
    Mechanize::Util.detect_charset(body)
  end

  ##
  # Forces the page to be parsed with +encoding+.  All memoized elements and
  # the parser are discarded; the document is re-parsed lazily on the next
  # call to #parser.

  def encoding=(encoding)
    # reset already clears @parser, so no separate comparison against the
    # old parser's encoding is needed (it could never be reached).
    reset

    @encoding = encoding
  end

  ##
  # The encoding reported by the parser, or nil when the parser does not
  # respond to +encoding+ (for example when there is no parser).

  def encoding
    parser.respond_to?(:encoding) ? parser.encoding : nil
  end

  ##
  # Return whether parser result has errors related to encoding or not.
  # false indicates just that the parser has no encoding errors, not that
  # the encoding is valid.
  #
  # +parser+ defaults to this page's own parser.

  def encoding_error?(parser=nil)
    parser ||= self.parser

    # any? is already false for an empty error list.
    parser.errors.any? do |error|
      error.message =~ /(indicate\ encoding)|
                        (Invalid\ char)|
                        (input\ conversion\ failed)/x
    end
  end

  ##
  # Returns the parsed document, parsing the body on first use.  Returns nil
  # when the page has no body.
  #
  # An encoding set through #encoding= wins; otherwise the agent's default
  # encoding is used when it is forced; otherwise the candidate #encodings
  # are tried from last to first until a parse reports no encoding errors.
  # If every candidate reports errors, the result of the final attempt is
  # kept.

  def parser
    return @parser if @parser
    return nil unless @body

    if @encoding then
      @parser = @mech.html_parser.parse html_body, nil, @encoding
    elsif @mech.force_default_encoding then
      @parser = @mech.html_parser.parse html_body, nil, @mech.default_encoding
    else
      @encodings.reverse_each do |encoding|
        @parser = @mech.html_parser.parse html_body, nil, encoding

        break unless encoding_error? @parser
      end
    end

    @parser
  end

  alias :root :parser

  def pretty_print(q) # :nodoc:
    q.object_group(self) {
      q.breakable
      q.group(1, '{url', '}') {q.breakable; q.pp uri }
      q.breakable
      q.group(1, '{meta_refresh', '}') {
        meta_refresh.each { |link| q.breakable; q.pp link }
      }
      q.breakable
      q.group(1, '{title', '}') { q.breakable; q.pp title }
      q.breakable
      q.group(1, '{iframes', '}') {
        iframes.each { |link| q.breakable; q.pp link }
      }
      q.breakable
      q.group(1, '{frames', '}') {
        frames.each { |link| q.breakable; q.pp link }
      }
      q.breakable
      q.group(1, '{links', '}') {
        links.each { |link| q.breakable; q.pp link }
      }
      q.breakable
      q.group(1, '{forms', '}') {
        forms.each { |form| q.breakable; q.pp form }
      }
    }
  end

  alias inspect pretty_inspect # :nodoc:

  ##
  # Discards the parser and every memoized element list so that they are
  # rebuilt from a fresh parse on next access.

  def reset
    @bases = nil
    @forms = nil
    @frames = nil
    @iframes = nil
    @images = nil       # built from parser nodes, so stale after a re-parse
    @image_urls = nil   # derived from @images
    @links = nil
    @labels = nil
    @labels_hash = nil
    @meta_refresh = nil
    @parser = nil
    @title = nil
  end

  ##
  # Return the canonical URI for the page if there is a link tag with
  # rel="canonical" and an href attribute.  Returns nil when there is no
  # such tag.  An href that URI rejects is escaped with
  # Mechanize::Util.uri_escape and parsed again.

  def canonical_uri
    link = at('link[@rel="canonical"][@href]')
    return unless link
    href = link['href']

    URI href
  rescue URI::InvalidURIError
    URI Mechanize::Util.uri_escape href
  end

  ##
  # Get the content type.  A content type declared by a meta tag takes
  # precedence over the response header.

  def content_type
    @meta_content_type || response['content-type']
  end

  # Search through the page like HPricot
  def_delegators :parser, :search, :/, :at

  ##
  # :method: form_with(criteria)
  #
  # Find a single form matching +criteria+.
  # Example:
  #   page.form_with(:action => '/post/login.php') do |f|
  #     ...
  #   end

  ##
  # :method: forms_with(criteria)
  #
  # Find all forms matching +criteria+.
  # Example:
  #   page.forms_with(:action => '/post/login.php').each do |f|
  #     ...
  #   end

  elements_with :form

  ##
  # :method: link_with(criteria)
  #
  # Find a single link matching +criteria+.
  # Example:
  #   page.link_with(:href => /foo/).click

  ##
  # :method: links_with(criteria)
  #
  # Find all links matching +criteria+.
  # Example:
  #   page.links_with(:href => /foo/).each do |link|
  #     puts link.href
  #   end

  elements_with :link

  ##
  # :method: base_with(criteria)
  #
  # Find a single base tag matching +criteria+.
  # Example:
  #   page.base_with(:href => /foo/).click

  ##
  # :method: bases_with(criteria)
  #
  # Find all base tags matching +criteria+.
  # Example:
  #   page.bases_with(:href => /foo/).each do |base|
  #     puts base.href
  #   end

  elements_with :base

  ##
  # :method: frame_with(criteria)
  #
  # Find a single frame tag matching +criteria+.
  # Example:
  #   page.frame_with(:src => /foo/).click

  ##
  # :method: frames_with(criteria)
  #
  # Find all frame tags matching +criteria+.
  # Example:
  #   page.frames_with(:src => /foo/).each do |frame|
  #     p frame.src
  #   end

  elements_with :frame

  ##
  # :method: iframe_with(criteria)
  #
  # Find a single iframe tag matching +criteria+.
  # Example:
  #   page.iframe_with(:src => /foo/).click

  ##
  # :method: iframes_with(criteria)
  #
  # Find all iframe tags matching +criteria+.
  # Example:
  #   page.iframes_with(:src => /foo/).each do |iframe|
  #     p iframe.src
  #   end

  elements_with :iframe

  ##
  # Return a list of all link and area tags

  def links
    @links ||= %w{ a area }.map do |tag|
      search(tag).map do |node|
        Link.new(node, @mech, self)
      end
    end.flatten
  end

  ##
  # Return a list of all form tags.  A form without an action is given this
  # page's URI as its action.

  def forms
    @forms ||= search('form').map do |html_form|
      form = Mechanize::Form.new(html_form, @mech, self)
      form.action ||= @uri.to_s
      form
    end
  end

  ##
  # Return a list of all meta refresh elements.  Only meta tags directly
  # inside +head+ are considered unless the agent's follow_meta_refresh is
  # :anywhere.

  def meta_refresh
    query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'

    @meta_refresh ||= search(query).map do |node|
      MetaRefresh.from_node node, self, uri
    end.compact
  end

  ##
  # Return a list of all base tags

  def bases
    @bases ||=
      search('base').map { |node| Base.new(node, @mech, self) }
  end

  ##
  # Return a list of all frame tags

  def frames
    @frames ||=
      search('frame').map { |node| Frame.new(node, @mech, self) }
  end

  ##
  # Return a list of all iframe tags

  def iframes
    @iframes ||=
      search('iframe').map { |node| Frame.new(node, @mech, self) }
  end

  ##
  # Return a list of all img tags

  def images
    @images ||=
      search('img').map { |node| Image.new(node, self) }
  end

  ##
  # Return the unique URLs of all images on the page

  def image_urls
    @image_urls ||= images.map(&:url).uniq
  end

  ##
  # Return a list of all label tags

  def labels
    @labels ||=
      search('label').map { |node| Label.new(node, self) }
  end

  ##
  # Return a hash mapping each label's +for+ attribute to its Label.  Labels
  # whose #for is nil or false are left out.

  def labels_hash
    unless @labels_hash
      hash = {}
      labels.each do |label|
        hash[label.node['for']] = label if label.for
      end
      @labels_hash = hash
    end

    @labels_hash
  end

  ##
  # Extracts the charset parameter from a +content_type+ string.  Returns
  # nil when there is no charset parameter or when it is 'none'.

  def self.charset content_type
    charset = content_type[/charset=([^; ]+)/i, 1]
    return nil if charset == 'none'
    charset
  end

  ##
  # Same as ::charset; kept so existing callers of this name keep working.
  # It was originally defined below +private+, which does not affect
  # methods defined with <tt>def self.</tt>, so it has always been callable
  # from outside the class.

  def self.charset_from_content_type content_type
    charset content_type
  end

  ##
  # Collects the charset of every <tt>content-type</tt> header in
  # +response+ whose value mentions a charset.  Entries may be nil when
  # ::charset cannot extract a value.

  def self.response_header_charset response
    charsets = []
    response.each do |header, value|
      next unless header == 'content-type'
      next unless value =~ /charset/i
      charsets << charset(value)
    end
    charsets
  end

  ##
  # Retrieves all charsets from +meta+ tags in +body+

  def self.meta_charset body
    body.scan(/<meta .*?>/i).map do |meta|
      # The quote is required here: a back-reference to an optional group
      # that did not take part in the match cannot succeed, so unquoted
      # values were never handled by this branch anyway.  The value is
      # matched lazily so it stops at the first closing quote instead of
      # swallowing later attributes or trailing whitespace.
      if meta =~ /charset\s*=\s*(["'])\s*(.+?)\s*\1/i then
        $2
      elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
        meta =~ /content\s*=\s*(["'])?(.*?)\1/i

        charset $2 if $2
      end
    end.compact
  end

  ##
  # Retrieves the last <tt>content-type</tt> set by a +meta+ tag in +body+.
  # Returns nil when no such tag exists, or when the last one has no
  # content attribute this method can read.

  def self.meta_content_type body
    body.scan(/<meta .*?>/i).reverse_each do |meta|
      if meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
        meta =~ /content=(["'])?(.*?)\1/i

        return $2
      end
    end

    nil
  end

  private

  ##
  # The markup handed to the HTML parser: the body itself, a minimal
  # document when the body is empty, or an empty string when there is no
  # body.

  def html_body
    if @body
      @body.empty? ? '<html></html>' : @body
    else
      ''
    end
  end
end
432
+
433
+ require 'mechanize/headers'
434
+ require 'mechanize/page/image'
435
+ require 'mechanize/page/label'
436
+ require 'mechanize/page/link'
437
+ require 'mechanize/page/base'
438
+ require 'mechanize/page/frame'
439
+ require 'mechanize/page/meta_refresh'
440
+
@@ -0,0 +1,7 @@
##
# Represents a +base+ element found on an HTML page.  It adds nothing to
# Mechanize::Page::Link, so a base tag is handled exactly like an 'a' tag.
# Base objects will contain links, but most likely will have no text.

class Mechanize::Page::Base < Mechanize::Page::Link; end
7
+
@@ -0,0 +1,27 @@
# A Frame object wraps a frame HTML element and can be used wherever a Link
# object is expected.  It exposes #src (an alias of #href) and #name (an
# alias of #text); the values come from the element's +src+ and +name+
# attributes.

class Mechanize::Page::Frame < Mechanize::Page::Link

  alias src href

  attr_reader :text
  alias name text

  # Lets the superclass initialize itself with the same three arguments,
  # then takes the text and href from the node's +name+ and +src+
  # attributes.
  def initialize(node, mech, referer)
    super

    @node    = node
    @text    = node['name']
    @href    = node['src']
    @content = nil
  end

  # Fetches the frame's target through the agent and memoizes the result.
  # NOTE(review): +page+ is not defined in this class; presumably it is
  # provided by Mechanize::Page::Link -- confirm.
  def content
    @content ||= @mech.get(@href, [], page)
  end

end
27
+