mechanize 2.0.pre.2 → 2.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

Files changed (46) hide show
  1. data.tar.gz.sig +0 -0
  2. data/CHANGELOG.rdoc +22 -0
  3. data/Manifest.txt +11 -8
  4. data/Rakefile +2 -2
  5. data/examples/flickr_upload.rb +6 -7
  6. data/examples/mech-dump.rb +0 -2
  7. data/examples/proxy_req.rb +0 -2
  8. data/examples/rubyforge.rb +1 -3
  9. data/examples/spider.rb +2 -3
  10. data/lib/mechanize.rb +228 -680
  11. data/lib/mechanize/form/field.rb +1 -1
  12. data/lib/mechanize/history.rb +23 -5
  13. data/lib/mechanize/http.rb +3 -0
  14. data/lib/mechanize/http/agent.rb +738 -0
  15. data/lib/mechanize/inspect.rb +2 -2
  16. data/lib/mechanize/page.rb +101 -42
  17. data/lib/mechanize/page/frame.rb +24 -17
  18. data/lib/mechanize/page/link.rb +72 -54
  19. data/lib/mechanize/page/meta_refresh.rb +56 -0
  20. data/lib/mechanize/response_read_error.rb +27 -0
  21. data/test/htdocs/frame_referer_test.html +10 -0
  22. data/test/htdocs/tc_referer.html +4 -0
  23. data/test/test_frames.rb +9 -0
  24. data/test/test_history.rb +74 -98
  25. data/test/test_mechanize.rb +334 -812
  26. data/test/test_mechanize_form.rb +32 -3
  27. data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
  28. data/test/test_mechanize_http_agent.rb +697 -0
  29. data/test/test_mechanize_link.rb +83 -0
  30. data/test/test_mechanize_page_encoding.rb +147 -0
  31. data/test/test_mechanize_page_link.rb +379 -0
  32. data/test/test_mechanize_page_meta_refresh.rb +115 -0
  33. data/test/test_pretty_print.rb +1 -1
  34. data/test/test_referer.rb +29 -5
  35. data/test/test_response_code.rb +21 -20
  36. data/test/test_robots.rb +13 -17
  37. data/test/test_scheme.rb +1 -1
  38. metadata +30 -31
  39. metadata.gz.sig +0 -0
  40. data/lib/mechanize/page/meta.rb +0 -48
  41. data/test/test_form_no_inputname.rb +0 -15
  42. data/test/test_links.rb +0 -146
  43. data/test/test_mechanize_page.rb +0 -224
  44. data/test/test_meta.rb +0 -67
  45. data/test/test_upload.rb +0 -109
  46. data/test/test_verbs.rb +0 -25
@@ -17,8 +17,8 @@ class Mechanize
17
17
  q.breakable
18
18
  q.group(1, '{url', '}') {q.breakable; q.pp uri }
19
19
  q.breakable
20
- q.group(1, '{meta', '}') {
21
- meta.each { |link| q.breakable; q.pp link }
20
+ q.group(1, '{meta_refresh', '}') {
21
+ meta_refresh.each { |link| q.breakable; q.pp link }
22
22
  }
23
23
  q.breakable
24
24
  q.group(1, '{title', '}') { q.breakable; q.pp title }
@@ -24,36 +24,32 @@ class Mechanize::Page < Mechanize::File
24
24
  raise Mechanize::ContentTypeError, response['content-type'] unless
25
25
  response['content-type'] =~ /^(text\/html)|(application\/xhtml\+xml)/i
26
26
 
27
+ @meta_content_type = nil
27
28
  @encoding = nil
28
29
  @encodings = [nil]
30
+ raise 'no' if mech and not Mechanize === mech
29
31
  @mech = mech
30
32
 
31
33
  reset
32
34
 
33
35
  @encodings << Mechanize::Util.detect_charset(body) if body
34
36
 
35
- response.each do |header, value|
36
- next unless value =~ /charset/i
37
- @encodings << charset(value)
38
- end
37
+ @encodings.concat self.class.response_header_charset(response)
39
38
 
40
39
  if body
41
40
  # Force the encoding to be 8BIT so we can perform regular expressions.
42
41
  # We'll set it to the detected encoding later
43
- body.force_encoding('ASCII-8BIT') if body.respond_to?(:force_encoding)
42
+ body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding
44
43
 
45
- body.scan(/<meta .*?>/i) do |meta|
46
- next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i
44
+ @encodings.concat self.class.meta_charset body
47
45
 
48
- meta =~ /content=(["'])?(.*?)\1/i
49
-
50
- encoding = charset $2
51
-
52
- @encodings << encoding if encoding
53
- end
46
+ meta_content_type = self.class.meta_content_type body
47
+ @meta_content_type = meta_content_type if meta_content_type
54
48
  end
55
49
 
56
- super(uri, response, body, code)
50
+ @encodings << mech.default_encoding if mech and mech.default_encoding
51
+
52
+ super uri, response, body, code
57
53
  end
58
54
 
59
55
  def title
@@ -64,10 +60,16 @@ class Mechanize::Page < Mechanize::File
64
60
  end
65
61
  end
66
62
 
67
- def charset content_type
68
- charset = content_type[/charset=([^; ]+)/i, 1]
69
- return nil if charset == 'none'
70
- charset
63
+ def response_header_charset
64
+ self.class.response_header_charset(response)
65
+ end
66
+
67
+ def meta_charset
68
+ self.class.meta_charset(body)
69
+ end
70
+
71
+ def detected_encoding
72
+ Mechanize::Util.detect_charset(body)
71
73
  end
72
74
 
73
75
  def encoding=(encoding)
@@ -90,23 +92,31 @@ class Mechanize::Page < Mechanize::File
90
92
  parser.respond_to?(:encoding) ? parser.encoding : nil
91
93
  end
92
94
 
95
+ # Return whether parser result has errors related to encoding or not.
96
+ # false only indicates that the parser reported no encoding errors, not that the encoding is valid.
97
+ def encoding_error?(parser=nil)
98
+ parser = self.parser unless parser
99
+ return false if parser.errors.empty?
100
+ parser.errors.any? do |error|
101
+ error.message =~ /(indicate\ encoding)|
102
+ (Invalid\ char)|
103
+ (input\ conversion\ failed)/x
104
+ end
105
+ end
106
+
93
107
  def parser
94
108
  return @parser if @parser
95
109
  return nil unless @body
96
110
 
97
111
  if @encoding then
98
- @parser = mech.html_parser.parse(html_body, nil, @encoding)
112
+ @parser = @mech.html_parser.parse html_body, nil, @encoding
113
+ elsif mech.force_default_encoding then
114
+ @parser = @mech.html_parser.parse html_body, nil, @mech.default_encoding
99
115
  else
100
116
  @encodings.reverse_each do |encoding|
101
- @parser = mech.html_parser.parse(html_body, nil, encoding)
117
+ @parser = @mech.html_parser.parse html_body, nil, encoding
102
118
 
103
- break if @parser.errors.empty?
104
-
105
- break unless @parser.errors.any? do |error|
106
- error.message =~ /(indicate\ encoding)|
107
- (Invalid\ char)|
108
- (input\ conversion failed)/x
109
- end
119
+ break unless encoding_error? @parser
110
120
  end
111
121
  end
112
122
 
@@ -123,7 +133,7 @@ class Mechanize::Page < Mechanize::File
123
133
  @links = nil
124
134
  @labels = nil
125
135
  @labels_hash = nil
126
- @meta = nil
136
+ @meta_refresh = nil
127
137
  @parser = nil
128
138
  @title = nil
129
139
  end
@@ -142,7 +152,7 @@ class Mechanize::Page < Mechanize::File
142
152
 
143
153
  # Get the content type
144
154
  def content_type
145
- response['content-type']
155
+ @meta_content_type || response['content-type']
146
156
  end
147
157
 
148
158
  # Search through the page like HPricot
@@ -263,18 +273,13 @@ class Mechanize::Page < Mechanize::File
263
273
  end
264
274
 
265
275
  ##
266
- # Return a list of all meta tags
267
- def meta
268
- @meta ||= search('head > meta').map do |node|
269
- next unless node['http-equiv'] && node['content']
270
- (equiv, content) = node['http-equiv'], node['content']
271
- if equiv && equiv.downcase == 'refresh'
272
- Meta.parse(content, uri) do |delay, href|
273
- node['delay'] = delay
274
- node['href'] = href
275
- Meta.new(node, @mech, self)
276
- end
277
- end
276
+ # Return a list of all meta refresh elements
277
+
278
+ def meta_refresh
279
+ query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
280
+
281
+ @meta_refresh ||= search(query).map do |node|
282
+ MetaRefresh.from_node node, self, uri
278
283
  end.compact
279
284
  end
280
285
 
@@ -328,6 +333,54 @@ class Mechanize::Page < Mechanize::File
328
333
  return @labels_hash
329
334
  end
330
335
 
336
+ def self.charset content_type
337
+ charset = content_type[/charset=([^; ]+)/i, 1]
338
+ return nil if charset == 'none'
339
+ charset
340
+ end
341
+
342
+ def self.response_header_charset response
343
+ charsets = []
344
+ response.each do |header, value|
345
+ next unless value =~ /charset/i
346
+ charsets << charset(value)
347
+ end
348
+ charsets
349
+ end
350
+
351
+ ##
352
+ # Retrieves all charsets from +meta+ tags in +body+
353
+
354
+ def self.meta_charset body
355
+ # HACK use .map
356
+ body.scan(/<meta .*?>/i).map do |meta|
357
+ if meta =~ /charset\s*=\s*(["'])?\s*(.+)\s*\1/i then
358
+ $2
359
+ elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
360
+ meta =~ /content=(["'])?(.*?)\1/i
361
+
362
+ m_charset = charset $2
363
+
364
+ m_charset if m_charset
365
+ end
366
+ end.compact
367
+ end
368
+
369
+ ##
370
+ # Retrieves the last <tt>content-type</tt> set by a +meta+ tag in +body+
371
+
372
+ def self.meta_content_type body
373
+ body.scan(/<meta .*?>/i).reverse.map do |meta|
374
+ if meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
375
+ meta =~ /content=(["'])?(.*?)\1/i
376
+
377
+ return $2
378
+ end
379
+ end
380
+
381
+ nil
382
+ end
383
+
331
384
  private
332
385
 
333
386
  def html_body
@@ -337,6 +390,12 @@ class Mechanize::Page < Mechanize::File
337
390
  ''
338
391
  end
339
392
  end
393
+
394
+ def self.charset_from_content_type content_type
395
+ charset = content_type[/charset=([^; ]+)/i, 1]
396
+ return nil if charset == 'none'
397
+ charset
398
+ end
340
399
  end
341
400
 
342
401
  require 'mechanize/headers'
@@ -345,5 +404,5 @@ require 'mechanize/page/label'
345
404
  require 'mechanize/page/link'
346
405
  require 'mechanize/page/base'
347
406
  require 'mechanize/page/frame'
348
- require 'mechanize/page/meta'
407
+ require 'mechanize/page/meta_refresh'
349
408
 
@@ -1,20 +1,27 @@
1
- class Mechanize
2
- class Page < Mechanize::File
3
- # This class encapsulates a 'frame' tag. Frame objects can be treated
4
- # just like Link objects. They contain src, the link they refer to,
5
- # name, the name of the frame. 'src' and 'name' are aliased to 'href'
6
- # and 'text' respectively so that a Frame object can be treated just
7
- # like a Link.
8
- class Frame < Link
9
- alias :src :href
10
- alias :name :text
1
+ # This class encapsulates a 'frame' tag. Frame objects can be treated just
2
+ # like Link objects. They contain #src, the #link they refer to and a #name,
3
+ # the name of the frame they refer to. #src and #name are aliased to #href
4
+ # and #text respectively so that a Frame object can be treated just like a
5
+ # Link.
11
6
 
12
- def initialize(node, mech, referer)
13
- super(node, mech, referer)
14
- @node = node
15
- @text = node['name']
16
- @href = node['src']
17
- end
18
- end
7
+ class Mechanize::Page::Frame < Mechanize::Page::Link
8
+
9
+ alias :src :href
10
+
11
+ attr_reader :text
12
+ alias :name :text
13
+
14
+ def initialize(node, mech, referer)
15
+ super(node, mech, referer)
16
+ @node = node
17
+ @text = node['name']
18
+ @href = node['src']
19
+ @content = nil
19
20
  end
21
+
22
+ def content
23
+ @content ||= @mech.get @href, [], page
24
+ end
25
+
20
26
  end
27
+
@@ -1,64 +1,82 @@
1
- class Mechanize
2
- class Page < Mechanize::File
3
- # This class encapsulates links. It contains the text and the URI for
4
- # 'a' tags parsed out of an HTML page. If the link contains an image,
5
- # the alt text will be used for that image.
6
- #
7
- # For example, the text for the following links with both be 'Hello World':
8
- #
9
- # <a href="http://rubyforge.org">Hello World</a>
10
- # <a href="http://rubyforge.org"><img src="test.jpg" alt="Hello World"></a>
11
- class Link
12
- attr_reader :node
13
- attr_reader :href
14
- attr_reader :text
15
- attr_reader :attributes
16
- attr_reader :page
17
- alias :to_s :text
18
- alias :referer :page
1
+ ##
2
+ # This class encapsulates links. It contains the text and the URI for
3
+ # 'a' tags parsed out of an HTML page. If the link contains an image,
4
+ # the alt text will be used for that image.
5
+ #
6
+ # For example, the text for the following links will both be 'Hello World':
7
+ #
8
+ # <a href="http://example">Hello World</a>
9
+ # <a href="http://example"><img src="test.jpg" alt="Hello World"></a>
19
10
 
20
- def initialize(node, mech, page)
21
- @node = node
22
- @href = node['href']
23
- @text = node.inner_text
24
- @page = page
25
- @mech = mech
26
- @attributes = node
11
+ class Mechanize::Page::Link
12
+ attr_reader :node
13
+ attr_reader :href
14
+ attr_reader :attributes
15
+ attr_reader :page
16
+ alias :referer :page
27
17
 
28
- # If there is no text, try to find an image and use it's alt text
29
- if (@text.nil? || @text.length == 0) && node.search('img').length > 0
30
- @text = ''
31
- node.search('img').each do |e|
32
- @text << ( e['alt'] || '')
33
- end
34
- end
18
+ def initialize(node, mech, page)
19
+ @node = node
20
+ @attributes = node
21
+ @href = node['href']
22
+ @mech = mech
23
+ @page = page
24
+ @text = nil
25
+ @uri = nil
26
+ end
27
+
28
+ # Click on this link
29
+ def click
30
+ @mech.click self
31
+ end
35
32
 
36
- end
33
+ # This method is a shorthand to get link's DOM id.
34
+ # Common usage:
35
+ # page.link_with(:dom_id => "links_exact_id")
36
+ def dom_id
37
+ node['id']
38
+ end
37
39
 
38
- def uri
39
- @href && URI.parse(WEBrick::HTTPUtils.escape(@href))
40
- end
40
+ # A list of words in the rel attribute, all lower-cased.
41
+ def rel
42
+ @rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
43
+ end
44
+
45
+ # Test if the rel attribute includes +kind+.
46
+ def rel? kind
47
+ rel.include? kind
48
+ end
41
49
 
42
- # A list of words in the rel attribute, all lower-cased.
43
- def rel
44
- @rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
45
- end
50
+ # The text content of this link
51
+ def text
52
+ return @text if @text
46
53
 
47
- # Test if the rel attribute includes +kind+.
48
- def rel?(kind)
49
- rel.include?(kind)
50
- end
54
+ @text = @node.inner_text
51
55
 
52
- # Click on this link
53
- def click
54
- @mech.click self
55
- end
56
-
57
- # This method is a shorthand to get link's DOM id.
58
- # Common usage: page.link_with(:dom_id => "links_exact_id")
59
- def dom_id
60
- node['id']
61
- end
56
+ # If there is no text, try to find an image and use its alt text
57
+ if (@text.nil? or @text.empty?) and imgs = @node.search('img') then
58
+ @text = imgs.map do |e|
59
+ e['alt']
60
+ end.join
62
61
  end
62
+
63
+ @text
63
64
  end
65
+
66
+ alias :to_s :text
67
+
68
+ # A URI for the #href for this link. The link is first parsed as a raw
69
+ # link. If that fails, parsing an escaped link is attempted.
70
+
71
+ def uri
72
+ @uri ||= if @href then
73
+ begin
74
+ URI.parse @href
75
+ rescue URI::InvalidURIError
76
+ URI.parse WEBrick::HTTPUtils.escape @href
77
+ end
78
+ end
79
+ end
80
+
64
81
  end
82
+
@@ -0,0 +1,56 @@
1
+ ##
2
+ # This class encapsulates a meta element with a refresh http-equiv. Mechanize
3
+ # treats meta refresh elements just like 'a' tags. MetaRefresh objects will
4
+ # contain links, but most likely will have no text.
5
+
6
+ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
7
+
8
+ attr_reader :delay
9
+
10
+ ##
11
+ # Matches the content attribute of a meta refresh element. After the match:
12
+ #
13
+ # $1:: delay
14
+ # $3:: url
15
+ CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
16
+
17
+ ##
18
+ # Parses the delay and url from the content attribute of a meta refresh
19
+ # element. Parse requires the uri of the current page to infer a url when
20
+ # no url is specified.
21
+ #
22
+ # Returns a MetaRefresh instance.
23
+ #
24
+ # Returns nil if the delay and url cannot be parsed.
25
+
26
+ def self.parse content, base_uri
27
+ return unless content =~ CONTENT_REGEXP
28
+
29
+ delay, refresh_uri = $1, $3
30
+
31
+ dest = base_uri
32
+ dest += refresh_uri if refresh_uri
33
+
34
+ return delay, dest
35
+ end
36
+
37
+ def self.from_node node, page, uri
38
+ http_equiv = node['http-equiv']
39
+ return unless http_equiv and http_equiv.downcase == 'refresh'
40
+
41
+ delay, uri = parse node['content'], uri
42
+
43
+ return unless delay
44
+
45
+ new node, page, delay, uri.to_s
46
+ end
47
+
48
+ def initialize node, page, delay, href
49
+ super node, page.mech, page
50
+
51
+ @delay = delay.to_i
52
+ @href = href
53
+ end
54
+
55
+ end
56
+