mechanize 2.0.1 → 2.1.pre.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (148) hide show
  1. data.tar.gz.sig +0 -0
  2. data/CHANGELOG.rdoc +82 -0
  3. data/EXAMPLES.rdoc +1 -1
  4. data/FAQ.rdoc +9 -9
  5. data/Manifest.txt +35 -48
  6. data/README.rdoc +2 -1
  7. data/Rakefile +16 -3
  8. data/lib/mechanize.rb +809 -392
  9. data/lib/mechanize/content_type_error.rb +10 -11
  10. data/lib/mechanize/cookie.rb +193 -60
  11. data/lib/mechanize/cookie_jar.rb +39 -86
  12. data/lib/mechanize/download.rb +59 -0
  13. data/lib/mechanize/element_matcher.rb +1 -0
  14. data/lib/mechanize/file.rb +61 -76
  15. data/lib/mechanize/file_saver.rb +37 -35
  16. data/lib/mechanize/form.rb +475 -410
  17. data/lib/mechanize/form/button.rb +4 -7
  18. data/lib/mechanize/form/check_box.rb +10 -9
  19. data/lib/mechanize/form/field.rb +52 -42
  20. data/lib/mechanize/form/file_upload.rb +17 -19
  21. data/lib/mechanize/form/hidden.rb +3 -0
  22. data/lib/mechanize/form/image_button.rb +15 -16
  23. data/lib/mechanize/form/keygen.rb +34 -0
  24. data/lib/mechanize/form/multi_select_list.rb +20 -9
  25. data/lib/mechanize/form/option.rb +48 -47
  26. data/lib/mechanize/form/radio_button.rb +52 -45
  27. data/lib/mechanize/form/reset.rb +3 -0
  28. data/lib/mechanize/form/select_list.rb +10 -6
  29. data/lib/mechanize/form/submit.rb +3 -0
  30. data/lib/mechanize/form/text.rb +3 -0
  31. data/lib/mechanize/form/textarea.rb +3 -0
  32. data/lib/mechanize/headers.rb +17 -19
  33. data/lib/mechanize/history.rb +60 -61
  34. data/lib/mechanize/http.rb +5 -0
  35. data/lib/mechanize/http/agent.rb +485 -218
  36. data/lib/mechanize/http/auth_challenge.rb +59 -0
  37. data/lib/mechanize/http/auth_realm.rb +31 -0
  38. data/lib/mechanize/http/content_disposition_parser.rb +188 -0
  39. data/lib/mechanize/http/www_authenticate_parser.rb +155 -0
  40. data/lib/mechanize/monkey_patch.rb +14 -35
  41. data/lib/mechanize/page.rb +34 -2
  42. data/lib/mechanize/page/base.rb +6 -7
  43. data/lib/mechanize/page/frame.rb +5 -5
  44. data/lib/mechanize/page/image.rb +23 -23
  45. data/lib/mechanize/page/label.rb +16 -16
  46. data/lib/mechanize/page/link.rb +16 -0
  47. data/lib/mechanize/page/meta_refresh.rb +19 -7
  48. data/lib/mechanize/parser.rb +173 -0
  49. data/lib/mechanize/pluggable_parsers.rb +126 -83
  50. data/lib/mechanize/redirect_limit_reached_error.rb +16 -13
  51. data/lib/mechanize/redirect_not_get_or_head_error.rb +18 -16
  52. data/lib/mechanize/response_code_error.rb +16 -17
  53. data/lib/mechanize/robots_disallowed_error.rb +22 -23
  54. data/lib/mechanize/test_case.rb +659 -0
  55. data/lib/mechanize/unauthorized_error.rb +3 -0
  56. data/lib/mechanize/unsupported_scheme_error.rb +4 -6
  57. data/lib/mechanize/util.rb +0 -12
  58. data/test/htdocs/form_order_test.html +11 -0
  59. data/test/htdocs/form_test.html +2 -2
  60. data/test/htdocs/tc_links.html +1 -0
  61. data/test/test_mechanize.rb +367 -59
  62. data/test/test_mechanize_cookie.rb +69 -4
  63. data/test/test_mechanize_cookie_jar.rb +200 -124
  64. data/test/test_mechanize_download.rb +43 -0
  65. data/test/test_mechanize_file.rb +53 -45
  66. data/test/{test_mechanize_file_response.rb → test_mechanize_file_connection.rb} +2 -2
  67. data/test/test_mechanize_file_request.rb +2 -2
  68. data/test/test_mechanize_file_saver.rb +21 -0
  69. data/test/test_mechanize_form.rb +345 -46
  70. data/test/test_mechanize_form_check_box.rb +5 -4
  71. data/test/test_mechanize_form_encoding.rb +10 -16
  72. data/test/test_mechanize_form_field.rb +45 -3
  73. data/test/test_mechanize_form_file_upload.rb +20 -0
  74. data/test/test_mechanize_form_image_button.rb +2 -2
  75. data/test/test_mechanize_form_keygen.rb +32 -0
  76. data/test/test_mechanize_form_multi_select_list.rb +84 -0
  77. data/test/test_mechanize_form_option.rb +55 -0
  78. data/test/test_mechanize_form_radio_button.rb +78 -0
  79. data/test/test_mechanize_form_select_list.rb +76 -0
  80. data/test/test_mechanize_form_textarea.rb +8 -7
  81. data/test/{test_headers.rb → test_mechanize_headers.rb} +4 -2
  82. data/test/test_mechanize_history.rb +103 -0
  83. data/test/test_mechanize_http_agent.rb +525 -17
  84. data/test/test_mechanize_http_auth_challenge.rb +39 -0
  85. data/test/test_mechanize_http_auth_realm.rb +49 -0
  86. data/test/test_mechanize_http_content_disposition_parser.rb +118 -0
  87. data/test/test_mechanize_http_www_authenticate_parser.rb +146 -0
  88. data/test/test_mechanize_link.rb +10 -14
  89. data/test/test_mechanize_page.rb +118 -0
  90. data/test/test_mechanize_page_encoding.rb +48 -13
  91. data/test/test_mechanize_page_frame.rb +16 -0
  92. data/test/test_mechanize_page_link.rb +27 -19
  93. data/test/test_mechanize_page_meta_refresh.rb +26 -14
  94. data/test/test_mechanize_parser.rb +289 -0
  95. data/test/test_mechanize_pluggable_parser.rb +52 -0
  96. data/test/test_mechanize_redirect_limit_reached_error.rb +24 -0
  97. data/test/test_mechanize_redirect_not_get_or_head_error.rb +3 -7
  98. data/test/test_mechanize_subclass.rb +2 -2
  99. data/test/test_mechanize_util.rb +24 -13
  100. data/test/test_multi_select.rb +23 -22
  101. metadata +145 -114
  102. metadata.gz.sig +0 -0
  103. data/lib/mechanize/inspect.rb +0 -88
  104. data/test/helper.rb +0 -175
  105. data/test/htdocs/form_select_all.html +0 -16
  106. data/test/htdocs/form_select_none.html +0 -17
  107. data/test/htdocs/form_select_noopts.html +0 -10
  108. data/test/htdocs/iframe_test.html +0 -16
  109. data/test/htdocs/nofollow.html +0 -9
  110. data/test/htdocs/norobots.html +0 -8
  111. data/test/htdocs/rel_nofollow.html +0 -8
  112. data/test/htdocs/tc_base_images.html +0 -10
  113. data/test/htdocs/tc_images.html +0 -8
  114. data/test/htdocs/tc_no_attributes.html +0 -16
  115. data/test/htdocs/tc_radiobuttons.html +0 -17
  116. data/test/htdocs/test_bad_encoding.html +0 -52
  117. data/test/servlets.rb +0 -402
  118. data/test/ssl_server.rb +0 -48
  119. data/test/test_cookies.rb +0 -129
  120. data/test/test_form_action.rb +0 -52
  121. data/test/test_form_as_hash.rb +0 -59
  122. data/test/test_form_button.rb +0 -46
  123. data/test/test_frames.rb +0 -34
  124. data/test/test_history.rb +0 -118
  125. data/test/test_history_added.rb +0 -16
  126. data/test/test_html_unscape_forms.rb +0 -46
  127. data/test/test_if_modified_since.rb +0 -20
  128. data/test/test_images.rb +0 -19
  129. data/test/test_no_attributes.rb +0 -13
  130. data/test/test_option.rb +0 -18
  131. data/test/test_pluggable_parser.rb +0 -136
  132. data/test/test_post_form.rb +0 -37
  133. data/test/test_pretty_print.rb +0 -22
  134. data/test/test_radiobutton.rb +0 -75
  135. data/test/test_redirect_limit_reached.rb +0 -39
  136. data/test/test_referer.rb +0 -81
  137. data/test/test_relative_links.rb +0 -40
  138. data/test/test_request.rb +0 -13
  139. data/test/test_response_code.rb +0 -53
  140. data/test/test_robots.rb +0 -72
  141. data/test/test_save_file.rb +0 -48
  142. data/test/test_scheme.rb +0 -48
  143. data/test/test_select.rb +0 -119
  144. data/test/test_select_all.rb +0 -15
  145. data/test/test_select_none.rb +0 -15
  146. data/test/test_select_noopts.rb +0 -18
  147. data/test/test_set_fields.rb +0 -44
  148. data/test/test_ssl_server.rb +0 -20
@@ -125,6 +125,37 @@ class Mechanize::Page < Mechanize::File
125
125
 
126
126
  alias :root :parser
127
127
 
128
+ def pretty_print(q) # :nodoc:
129
+ q.object_group(self) {
130
+ q.breakable
131
+ q.group(1, '{url', '}') {q.breakable; q.pp uri }
132
+ q.breakable
133
+ q.group(1, '{meta_refresh', '}') {
134
+ meta_refresh.each { |link| q.breakable; q.pp link }
135
+ }
136
+ q.breakable
137
+ q.group(1, '{title', '}') { q.breakable; q.pp title }
138
+ q.breakable
139
+ q.group(1, '{iframes', '}') {
140
+ iframes.each { |link| q.breakable; q.pp link }
141
+ }
142
+ q.breakable
143
+ q.group(1, '{frames', '}') {
144
+ frames.each { |link| q.breakable; q.pp link }
145
+ }
146
+ q.breakable
147
+ q.group(1, '{links', '}') {
148
+ links.each { |link| q.breakable; q.pp link }
149
+ }
150
+ q.breakable
151
+ q.group(1, '{forms', '}') {
152
+ forms.each { |form| q.breakable; q.pp form }
153
+ }
154
+ }
155
+ end
156
+
157
+ alias inspect pretty_inspect # :nodoc:
158
+
128
159
  def reset
129
160
  @bases = nil
130
161
  @forms = nil
@@ -342,6 +373,7 @@ class Mechanize::Page < Mechanize::File
342
373
  def self.response_header_charset response
343
374
  charsets = []
344
375
  response.each do |header, value|
376
+ next unless header == 'content-type'
345
377
  next unless value =~ /charset/i
346
378
  charsets << charset(value)
347
379
  end
@@ -357,9 +389,9 @@ class Mechanize::Page < Mechanize::File
357
389
  if meta =~ /charset\s*=\s*(["'])?\s*(.+)\s*\1/i then
358
390
  $2
359
391
  elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
360
- meta =~ /content=(["'])?(.*?)\1/i
392
+ meta =~ /content\s*=\s*(["'])?(.*?)\1/i
361
393
 
362
- m_charset = charset $2
394
+ m_charset = charset $2 if $2
363
395
 
364
396
  m_charset if m_charset
365
397
  end
@@ -1,8 +1,7 @@
1
- class Mechanize
2
- class Page < Mechanize::File
3
- # This class encapsulates a Base tag. Mechanize treats base tags just
4
- # like 'a' tags. Base objects will contain links, but most likely will
5
- # have no text.
6
- class Base < Link; end
7
- end
1
+ ##
2
+ # A base element on an HTML page. Mechanize treats base tags just like 'a'
3
+ # tags. Base objects will contain links, but most likely will have no text.
4
+
5
+ class Mechanize::Page::Base < Mechanize::Page::Link
8
6
  end
7
+
@@ -1,8 +1,8 @@
1
- # This class encapsulates a 'frame' tag. Frame objects can be treated just
2
- # like Link objects. They contain #src, the #link they refer to and a #name,
3
- # the name of the frame they refer to. #src and #name are aliased to #href
4
- # and #text respectively so that a Frame object can be treated just like a
5
- # Link.
1
+ # A Frame object wraps a frame HTML element. Frame objects can be treated
2
+ # just like Link objects. They contain #src, the #link they refer to and a
3
+ # #name, the name of the frame they refer to. #src and #name are aliased to
4
+ # #href and #text respectively so that a Frame object can be treated just like
5
+ # a Link.
6
6
 
7
7
  class Mechanize::Page::Frame < Mechanize::Page::Link
8
8
 
@@ -1,30 +1,30 @@
1
- class Mechanize
2
- class Page < Mechanize::File
3
- class Image
4
- attr_reader :node
5
- attr_reader :page
1
+ ##
2
+ # An image element on an HTML page
6
3
 
7
- def initialize(node, page)
8
- @node = node
9
- @page = page
10
- end
4
+ class Mechanize::Page::Image
5
+ attr_reader :node
6
+ attr_reader :page
11
7
 
12
- def src
13
- @node['src']
14
- end
8
+ def initialize(node, page)
9
+ @node = node
10
+ @page = page
11
+ end
15
12
 
16
- def url
17
- case src
18
- when %r{^https?://}
19
- src
20
- else
21
- if page.bases[0]
22
- (page.bases[0].href + src).to_s
23
- else
24
- (page.uri + src).to_s
25
- end
26
- end
13
+ def src
14
+ @node['src']
15
+ end
16
+
17
+ def url
18
+ case src
19
+ when %r{^https?://}
20
+ src
21
+ else
22
+ if page.bases[0]
23
+ (page.bases[0].href + src).to_s
24
+ else
25
+ (page.uri + src).to_s
27
26
  end
28
27
  end
29
28
  end
30
29
  end
30
+
@@ -1,20 +1,20 @@
1
- class Mechanize
2
- class Page < Mechanize::File
3
- class Label
4
- attr_reader :node
5
- attr_reader :text
6
- attr_reader :page
7
- alias :to_s :text
1
+ ##
2
+ # A form label on an HTML page
8
3
 
9
- def initialize(node, page)
10
- @node = node
11
- @text = node.inner_text
12
- @page = page
13
- end
4
+ class Mechanize::Page::Label
5
+ attr_reader :node
6
+ attr_reader :text
7
+ attr_reader :page
8
+ alias :to_s :text
14
9
 
15
- def for
16
- (id = @node['for']) && page.search("##{id}") || nil
17
- end
18
- end
10
+ def initialize(node, page)
11
+ @node = node
12
+ @text = node.inner_text
13
+ @page = page
14
+ end
15
+
16
+ def for
17
+ (id = @node['for']) && page.search("##{id}") || nil
19
18
  end
20
19
  end
20
+
@@ -37,6 +37,22 @@ class Mechanize::Page::Link
37
37
  node['id']
38
38
  end
39
39
 
40
+ # This method is a shorthand to get a link's DOM class
41
+ # Common usage:
42
+ # page.link_with(:dom_class => "links_exact_class")
43
+ def dom_class
44
+ node['class']
45
+ end
46
+
47
+ def pretty_print(q) # :nodoc:
48
+ q.object_group(self) {
49
+ q.breakable; q.pp text
50
+ q.breakable; q.pp href
51
+ }
52
+ end
53
+
54
+ alias inspect pretty_inspect # :nodoc:
55
+
40
56
  # A list of words in the rel attribute, all lower-cased.
41
57
  def rel
42
58
  @rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
@@ -5,13 +5,23 @@
5
5
 
6
6
  class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
7
7
 
8
+ ##
9
+ # Time to wait before next refresh
10
+
8
11
  attr_reader :delay
9
12
 
13
+ ##
14
+ # This MetaRefresh link did not contain a url= in the content attribute and
15
+ # links to itself.
16
+
17
+ attr_reader :link_self
18
+
10
19
  ##
11
20
  # Matches the content attribute of a meta refresh element. After the match:
12
21
  #
13
22
  # $1:: delay
14
23
  # $3:: url
24
+
15
25
  CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
16
26
 
17
27
  ##
@@ -19,37 +29,39 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
19
29
  # element. Parse requires the uri of the current page to infer a url when
20
30
  # no url is specified.
21
31
  #
22
- # Returns a MetaRefresh instance.
32
+ # Returns an array of [delay, url, link_self]. (delay is a string)
23
33
  #
24
34
  # Returns nil if the delay and url cannot be parsed.
25
35
 
26
36
  def self.parse content, base_uri
27
37
  return unless content =~ CONTENT_REGEXP
28
38
 
39
+ link_self = $3.nil? || $3.empty?
29
40
  delay, refresh_uri = $1, $3
30
41
 
31
42
  dest = base_uri
32
43
  dest += refresh_uri if refresh_uri
33
44
 
34
- return delay, dest
45
+ return delay, dest, link_self
35
46
  end
36
47
 
37
48
  def self.from_node node, page, uri
38
49
  http_equiv = node['http-equiv']
39
50
  return unless http_equiv and http_equiv.downcase == 'refresh'
40
51
 
41
- delay, uri = parse node['content'], uri
52
+ delay, uri, link_self = parse node['content'], uri
42
53
 
43
54
  return unless delay
44
55
 
45
- new node, page, delay, uri.to_s
56
+ new node, page, delay, uri.to_s, link_self
46
57
  end
47
58
 
48
- def initialize node, page, delay, href
59
+ def initialize node, page, delay, href, link_self = false
49
60
  super node, page.mech, page
50
61
 
51
- @delay = delay.to_i
52
- @href = href
62
+ @delay = delay =~ /\./ ? delay.to_f : delay.to_i
63
+ @href = href
64
+ @link_self = link_self
53
65
  end
54
66
 
55
67
  end
@@ -0,0 +1,173 @@
1
+ ##
2
+ # The parser module provides standard methods for accessing the headers and
3
+ # content of a response that are shared across pluggable parsers.
4
+
5
+ module Mechanize::Parser
6
+
7
+ extend Forwardable
8
+
9
+ special_filenames = Regexp.union %w[
10
+ AUX
11
+ COM1
12
+ COM2
13
+ COM3
14
+ COM4
15
+ COM5
16
+ COM6
17
+ COM7
18
+ COM8
19
+ COM9
20
+ CON
21
+ LPT1
22
+ LPT2
23
+ LPT3
24
+ LPT4
25
+ LPT5
26
+ LPT6
27
+ LPT7
28
+ LPT8
29
+ LPT9
30
+ NUL
31
+ PRN
32
+ ]
33
+
34
+ ##
35
+ # Special filenames that must be escaped
36
+
37
+ SPECIAL_FILENAMES = /\A#{special_filenames}/i
38
+
39
+ ##
40
+ # The URI this file was retrieved from
41
+
42
+ attr_accessor :uri
43
+
44
+ ##
45
+ # The Mechanize::Headers for this file
46
+
47
+ attr_accessor :response
48
+
49
+ alias header response
50
+
51
+ ##
52
+ # The HTTP response code
53
+
54
+ attr_accessor :code
55
+
56
+ ##
57
+ # :method: [](header)
58
+ #
59
+ # Access HTTP +header+ by name
60
+
61
+ def_delegator :header, :[], :[]
62
+
63
+ ##
64
+ # :method: []=(header, value)
65
+ #
66
+ # Set HTTP +header+ to +value+
67
+
68
+ def_delegator :header, :[]=, :[]=
69
+
70
+ ##
71
+ # :method: key?(header)
72
+ #
73
+ # Is the named +header+ present?
74
+
75
+ def_delegator :header, :key?, :key?
76
+
77
+ ##
78
+ # :method: each
79
+ #
80
+ # Enumerate HTTP headers
81
+
82
+ def_delegator :header, :each, :each
83
+
84
+ ##
85
+ # :method: canonical_each
86
+ #
87
+ # Enumerate HTTP headers in capitalized (canonical) form
88
+
89
+ def_delegator :header, :canonical_each, :canonical_each
90
+
91
+ ##
92
+ # Extracts the filename from a Content-Disposition header in the #response
93
+ # or from the URI. If +full_path+ is true the filename will include the
94
+ # host name and path to the resource, otherwise a filename in the current
95
+ # directory is given.
96
+
97
+ def extract_filename full_path = @full_path
98
+ handled = false
99
+
100
+ if @uri then
101
+ uri = @uri
102
+ uri += 'index.html' if uri.path.end_with? '/'
103
+
104
+ path = uri.path.split(/\//)
105
+ filename = path.pop || 'index.html'
106
+ else
107
+ path = []
108
+ filename = 'index.html'
109
+ end
110
+
111
+ # Set the filename
112
+ if disposition = @response['content-disposition'] then
113
+ content_disposition =
114
+ Mechanize::HTTP::ContentDispositionParser.parse disposition
115
+
116
+ if content_disposition then
117
+ filename = content_disposition.filename
118
+ filename = filename.split(/[\\\/]/).last
119
+ handled = true
120
+ end
121
+ end
122
+
123
+ if not handled and @uri then
124
+ filename << '.html' unless filename =~ /\./
125
+ filename << "?#{@uri.query}" if @uri.query
126
+ end
127
+
128
+ if SPECIAL_FILENAMES =~ filename then
129
+ filename = "_#{filename}"
130
+ end
131
+
132
+ filename = filename.tr "\x00-\x20<>:\"/\\|?*", '_'
133
+
134
+ @filename = if full_path then
135
+ File.join @uri.host, path, filename
136
+ else
137
+ filename
138
+ end
139
+ end
140
+
141
+ ##
142
+ # Creates a Mechanize::Headers from the Net::HTTPResponse +response+.
143
+ #
144
+ # This allows the Net::HTTPResponse to be garbage collected sooner.
145
+
146
+ def fill_header response
147
+ @response = Mechanize::Headers.new
148
+
149
+ response.each { |k,v|
150
+ @response[k] = v
151
+ } if response
152
+
153
+ @response
154
+ end
155
+
156
+ ##
157
+ # Finds a free filename based on +filename+, but is not race-free
158
+
159
+ def find_free_name filename
160
+ filename = @filename unless filename
161
+
162
+ number = 1
163
+
164
+ while File.exist? filename do
165
+ filename = "#{@filename}.#{number}"
166
+ number += 1
167
+ end
168
+
169
+ filename
170
+ end
171
+
172
+ end
173
+