mechanize 2.0.pre.2 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +22 -0
- data/Manifest.txt +11 -8
- data/Rakefile +2 -2
- data/examples/flickr_upload.rb +6 -7
- data/examples/mech-dump.rb +0 -2
- data/examples/proxy_req.rb +0 -2
- data/examples/rubyforge.rb +1 -3
- data/examples/spider.rb +2 -3
- data/lib/mechanize.rb +228 -680
- data/lib/mechanize/form/field.rb +1 -1
- data/lib/mechanize/history.rb +23 -5
- data/lib/mechanize/http.rb +3 -0
- data/lib/mechanize/http/agent.rb +738 -0
- data/lib/mechanize/inspect.rb +2 -2
- data/lib/mechanize/page.rb +101 -42
- data/lib/mechanize/page/frame.rb +24 -17
- data/lib/mechanize/page/link.rb +72 -54
- data/lib/mechanize/page/meta_refresh.rb +56 -0
- data/lib/mechanize/response_read_error.rb +27 -0
- data/test/htdocs/frame_referer_test.html +10 -0
- data/test/htdocs/tc_referer.html +4 -0
- data/test/test_frames.rb +9 -0
- data/test/test_history.rb +74 -98
- data/test/test_mechanize.rb +334 -812
- data/test/test_mechanize_form.rb +32 -3
- data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
- data/test/test_mechanize_http_agent.rb +697 -0
- data/test/test_mechanize_link.rb +83 -0
- data/test/test_mechanize_page_encoding.rb +147 -0
- data/test/test_mechanize_page_link.rb +379 -0
- data/test/test_mechanize_page_meta_refresh.rb +115 -0
- data/test/test_pretty_print.rb +1 -1
- data/test/test_referer.rb +29 -5
- data/test/test_response_code.rb +21 -20
- data/test/test_robots.rb +13 -17
- data/test/test_scheme.rb +1 -1
- metadata +30 -31
- metadata.gz.sig +0 -0
- data/lib/mechanize/page/meta.rb +0 -48
- data/test/test_form_no_inputname.rb +0 -15
- data/test/test_links.rb +0 -146
- data/test/test_mechanize_page.rb +0 -224
- data/test/test_meta.rb +0 -67
- data/test/test_upload.rb +0 -109
- data/test/test_verbs.rb +0 -25
data/lib/mechanize/inspect.rb
CHANGED
@@ -17,8 +17,8 @@ class Mechanize
|
|
17
17
|
q.breakable
|
18
18
|
q.group(1, '{url', '}') {q.breakable; q.pp uri }
|
19
19
|
q.breakable
|
20
|
-
q.group(1, '{
|
21
|
-
|
20
|
+
q.group(1, '{meta_refresh', '}') {
|
21
|
+
meta_refresh.each { |link| q.breakable; q.pp link }
|
22
22
|
}
|
23
23
|
q.breakable
|
24
24
|
q.group(1, '{title', '}') { q.breakable; q.pp title }
|
data/lib/mechanize/page.rb
CHANGED
@@ -24,36 +24,32 @@ class Mechanize::Page < Mechanize::File
|
|
24
24
|
raise Mechanize::ContentTypeError, response['content-type'] unless
|
25
25
|
response['content-type'] =~ /^(text\/html)|(application\/xhtml\+xml)/i
|
26
26
|
|
27
|
+
@meta_content_type = nil
|
27
28
|
@encoding = nil
|
28
29
|
@encodings = [nil]
|
30
|
+
raise 'no' if mech and not Mechanize === mech
|
29
31
|
@mech = mech
|
30
32
|
|
31
33
|
reset
|
32
34
|
|
33
35
|
@encodings << Mechanize::Util.detect_charset(body) if body
|
34
36
|
|
35
|
-
|
36
|
-
next unless value =~ /charset/i
|
37
|
-
@encodings << charset(value)
|
38
|
-
end
|
37
|
+
@encodings.concat self.class.response_header_charset(response)
|
39
38
|
|
40
39
|
if body
|
41
40
|
# Force the encoding to be 8BIT so we can perform regular expressions.
|
42
41
|
# We'll set it to the detected encoding later
|
43
|
-
body.force_encoding
|
42
|
+
body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding
|
44
43
|
|
45
|
-
|
46
|
-
next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i
|
44
|
+
@encodings.concat self.class.meta_charset body
|
47
45
|
|
48
|
-
|
49
|
-
|
50
|
-
encoding = charset $2
|
51
|
-
|
52
|
-
@encodings << encoding if encoding
|
53
|
-
end
|
46
|
+
meta_content_type = self.class.meta_content_type body
|
47
|
+
@meta_content_type = meta_content_type if meta_content_type
|
54
48
|
end
|
55
49
|
|
56
|
-
|
50
|
+
@encodings << mech.default_encoding if mech and mech.default_encoding
|
51
|
+
|
52
|
+
super uri, response, body, code
|
57
53
|
end
|
58
54
|
|
59
55
|
def title
|
@@ -64,10 +60,16 @@ class Mechanize::Page < Mechanize::File
|
|
64
60
|
end
|
65
61
|
end
|
66
62
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
63
|
+
def response_header_charset
|
64
|
+
self.class.response_header_charset(response)
|
65
|
+
end
|
66
|
+
|
67
|
+
def meta_charset
|
68
|
+
self.class.meta_charset(body)
|
69
|
+
end
|
70
|
+
|
71
|
+
def detected_encoding
|
72
|
+
Mechanize::Util.detect_charset(body)
|
71
73
|
end
|
72
74
|
|
73
75
|
def encoding=(encoding)
|
@@ -90,23 +92,31 @@ class Mechanize::Page < Mechanize::File
|
|
90
92
|
parser.respond_to?(:encoding) ? parser.encoding : nil
|
91
93
|
end
|
92
94
|
|
95
|
+
# Return whether parser result has errors related to encoding or not.
|
96
|
+
# false indicates only that the parser has no encoding errors, not that the encoding is valid.
|
97
|
+
def encoding_error?(parser=nil)
|
98
|
+
parser = self.parser unless parser
|
99
|
+
return false if parser.errors.empty?
|
100
|
+
parser.errors.any? do |error|
|
101
|
+
error.message =~ /(indicate\ encoding)|
|
102
|
+
(Invalid\ char)|
|
103
|
+
(input\ conversion\ failed)/x
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
93
107
|
def parser
|
94
108
|
return @parser if @parser
|
95
109
|
return nil unless @body
|
96
110
|
|
97
111
|
if @encoding then
|
98
|
-
@parser = mech.html_parser.parse
|
112
|
+
@parser = @mech.html_parser.parse html_body, nil, @encoding
|
113
|
+
elsif mech.force_default_encoding then
|
114
|
+
@parser = @mech.html_parser.parse html_body, nil, @mech.default_encoding
|
99
115
|
else
|
100
116
|
@encodings.reverse_each do |encoding|
|
101
|
-
@parser = mech.html_parser.parse
|
117
|
+
@parser = @mech.html_parser.parse html_body, nil, encoding
|
102
118
|
|
103
|
-
break
|
104
|
-
|
105
|
-
break unless @parser.errors.any? do |error|
|
106
|
-
error.message =~ /(indicate\ encoding)|
|
107
|
-
(Invalid\ char)|
|
108
|
-
(input\ conversion failed)/x
|
109
|
-
end
|
119
|
+
break unless encoding_error? @parser
|
110
120
|
end
|
111
121
|
end
|
112
122
|
|
@@ -123,7 +133,7 @@ class Mechanize::Page < Mechanize::File
|
|
123
133
|
@links = nil
|
124
134
|
@labels = nil
|
125
135
|
@labels_hash = nil
|
126
|
-
@
|
136
|
+
@meta_refresh = nil
|
127
137
|
@parser = nil
|
128
138
|
@title = nil
|
129
139
|
end
|
@@ -142,7 +152,7 @@ class Mechanize::Page < Mechanize::File
|
|
142
152
|
|
143
153
|
# Get the content type
|
144
154
|
def content_type
|
145
|
-
response['content-type']
|
155
|
+
@meta_content_type || response['content-type']
|
146
156
|
end
|
147
157
|
|
148
158
|
# Search through the page like HPricot
|
@@ -263,18 +273,13 @@ class Mechanize::Page < Mechanize::File
|
|
263
273
|
end
|
264
274
|
|
265
275
|
##
|
266
|
-
# Return a list of all meta
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
node['delay'] = delay
|
274
|
-
node['href'] = href
|
275
|
-
Meta.new(node, @mech, self)
|
276
|
-
end
|
277
|
-
end
|
276
|
+
# Return a list of all meta refresh elements
|
277
|
+
|
278
|
+
def meta_refresh
|
279
|
+
query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
|
280
|
+
|
281
|
+
@meta_refresh ||= search(query).map do |node|
|
282
|
+
MetaRefresh.from_node node, self, uri
|
278
283
|
end.compact
|
279
284
|
end
|
280
285
|
|
@@ -328,6 +333,54 @@ class Mechanize::Page < Mechanize::File
|
|
328
333
|
return @labels_hash
|
329
334
|
end
|
330
335
|
|
336
|
+
def self.charset content_type
|
337
|
+
charset = content_type[/charset=([^; ]+)/i, 1]
|
338
|
+
return nil if charset == 'none'
|
339
|
+
charset
|
340
|
+
end
|
341
|
+
|
342
|
+
def self.response_header_charset response
|
343
|
+
charsets = []
|
344
|
+
response.each do |header, value|
|
345
|
+
next unless value =~ /charset/i
|
346
|
+
charsets << charset(value)
|
347
|
+
end
|
348
|
+
charsets
|
349
|
+
end
|
350
|
+
|
351
|
+
##
|
352
|
+
# Retrieves all charsets from +meta+ tags in +body+
|
353
|
+
|
354
|
+
def self.meta_charset body
|
355
|
+
# HACK use .map
|
356
|
+
body.scan(/<meta .*?>/i).map do |meta|
|
357
|
+
if meta =~ /charset\s*=\s*(["'])?\s*(.+)\s*\1/i then
|
358
|
+
$2
|
359
|
+
elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
|
360
|
+
meta =~ /content=(["'])?(.*?)\1/i
|
361
|
+
|
362
|
+
m_charset = charset $2
|
363
|
+
|
364
|
+
m_charset if m_charset
|
365
|
+
end
|
366
|
+
end.compact
|
367
|
+
end
|
368
|
+
|
369
|
+
##
|
370
|
+
# Retrieves the last <tt>content-type</tt> set by a +meta+ tag in +body+
|
371
|
+
|
372
|
+
def self.meta_content_type body
|
373
|
+
body.scan(/<meta .*?>/i).reverse.map do |meta|
|
374
|
+
if meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
|
375
|
+
meta =~ /content=(["'])?(.*?)\1/i
|
376
|
+
|
377
|
+
return $2
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
nil
|
382
|
+
end
|
383
|
+
|
331
384
|
private
|
332
385
|
|
333
386
|
def html_body
|
@@ -337,6 +390,12 @@ class Mechanize::Page < Mechanize::File
|
|
337
390
|
''
|
338
391
|
end
|
339
392
|
end
|
393
|
+
|
394
|
+
def self.charset_from_content_type content_type
|
395
|
+
charset = content_type[/charset=([^; ]+)/i, 1]
|
396
|
+
return nil if charset == 'none'
|
397
|
+
charset
|
398
|
+
end
|
340
399
|
end
|
341
400
|
|
342
401
|
require 'mechanize/headers'
|
@@ -345,5 +404,5 @@ require 'mechanize/page/label'
|
|
345
404
|
require 'mechanize/page/link'
|
346
405
|
require 'mechanize/page/base'
|
347
406
|
require 'mechanize/page/frame'
|
348
|
-
require 'mechanize/page/
|
407
|
+
require 'mechanize/page/meta_refresh'
|
349
408
|
|
data/lib/mechanize/page/frame.rb
CHANGED
@@ -1,20 +1,27 @@
|
|
1
|
-
class
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
# and 'text' respectively so that a Frame object can be treated just
|
7
|
-
# like a Link.
|
8
|
-
class Frame < Link
|
9
|
-
alias :src :href
|
10
|
-
alias :name :text
|
1
|
+
# This class encapsulates a 'frame' tag. Frame objects can be treated just
|
2
|
+
# like Link objects. They contain #src, the #link they refer to and a #name,
|
3
|
+
# the name of the frame they refer to. #src and #name are aliased to #href
|
4
|
+
# and #text respectively so that a Frame object can be treated just like a
|
5
|
+
# Link.
|
11
6
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
class Mechanize::Page::Frame < Mechanize::Page::Link
|
8
|
+
|
9
|
+
alias :src :href
|
10
|
+
|
11
|
+
attr_reader :text
|
12
|
+
alias :name :text
|
13
|
+
|
14
|
+
def initialize(node, mech, referer)
|
15
|
+
super(node, mech, referer)
|
16
|
+
@node = node
|
17
|
+
@text = node['name']
|
18
|
+
@href = node['src']
|
19
|
+
@content = nil
|
19
20
|
end
|
21
|
+
|
22
|
+
def content
|
23
|
+
@content ||= @mech.get @href, [], page
|
24
|
+
end
|
25
|
+
|
20
26
|
end
|
27
|
+
|
data/lib/mechanize/page/link.rb
CHANGED
@@ -1,64 +1,82 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# <a href="http://rubyforge.org"><img src="test.jpg" alt="Hello World"></a>
|
11
|
-
class Link
|
12
|
-
attr_reader :node
|
13
|
-
attr_reader :href
|
14
|
-
attr_reader :text
|
15
|
-
attr_reader :attributes
|
16
|
-
attr_reader :page
|
17
|
-
alias :to_s :text
|
18
|
-
alias :referer :page
|
1
|
+
##
|
2
|
+
# This class encapsulates links. It contains the text and the URI for
|
3
|
+
# 'a' tags parsed out of an HTML page. If the link contains an image,
|
4
|
+
# the alt text will be used for that image.
|
5
|
+
#
|
6
|
+
# For example, the text for the following links will both be 'Hello World':
|
7
|
+
#
|
8
|
+
# <a href="http://example">Hello World</a>
|
9
|
+
# <a href="http://example"><img src="test.jpg" alt="Hello World"></a>
|
19
10
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@attributes = node
|
11
|
+
class Mechanize::Page::Link
|
12
|
+
attr_reader :node
|
13
|
+
attr_reader :href
|
14
|
+
attr_reader :attributes
|
15
|
+
attr_reader :page
|
16
|
+
alias :referer :page
|
27
17
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
18
|
+
def initialize(node, mech, page)
|
19
|
+
@node = node
|
20
|
+
@attributes = node
|
21
|
+
@href = node['href']
|
22
|
+
@mech = mech
|
23
|
+
@page = page
|
24
|
+
@text = nil
|
25
|
+
@uri = nil
|
26
|
+
end
|
27
|
+
|
28
|
+
# Click on this link
|
29
|
+
def click
|
30
|
+
@mech.click self
|
31
|
+
end
|
35
32
|
|
36
|
-
|
33
|
+
# This method is a shorthand to get link's DOM id.
|
34
|
+
# Common usage:
|
35
|
+
# page.link_with(:dom_id => "links_exact_id")
|
36
|
+
def dom_id
|
37
|
+
node['id']
|
38
|
+
end
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
-
|
40
|
+
# A list of words in the rel attribute, all lower-cased.
|
41
|
+
def rel
|
42
|
+
@rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
|
43
|
+
end
|
44
|
+
|
45
|
+
# Test if the rel attribute includes +kind+.
|
46
|
+
def rel? kind
|
47
|
+
rel.include? kind
|
48
|
+
end
|
41
49
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
50
|
+
# The text content of this link
|
51
|
+
def text
|
52
|
+
return @text if @text
|
46
53
|
|
47
|
-
|
48
|
-
def rel?(kind)
|
49
|
-
rel.include?(kind)
|
50
|
-
end
|
54
|
+
@text = @node.inner_text
|
51
55
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
# This method is a shorthand to get link's DOM id.
|
58
|
-
# Common usage: page.link_with(:dom_id => "links_exact_id")
|
59
|
-
def dom_id
|
60
|
-
node['id']
|
61
|
-
end
|
56
|
+
# If there is no text, try to find an image and use its alt text
|
57
|
+
if (@text.nil? or @text.empty?) and imgs = @node.search('img') then
|
58
|
+
@text = imgs.map do |e|
|
59
|
+
e['alt']
|
60
|
+
end.join
|
62
61
|
end
|
62
|
+
|
63
|
+
@text
|
63
64
|
end
|
65
|
+
|
66
|
+
alias :to_s :text
|
67
|
+
|
68
|
+
# A URI for the #href for this link. The link is first parsed as a raw
|
69
|
+
# link. If that fails, parsing an escaped link is attempted.
|
70
|
+
|
71
|
+
def uri
|
72
|
+
@uri ||= if @href then
|
73
|
+
begin
|
74
|
+
URI.parse @href
|
75
|
+
rescue URI::InvalidURIError
|
76
|
+
URI.parse WEBrick::HTTPUtils.escape @href
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
64
81
|
end
|
82
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
##
|
2
|
+
# This class encapsulates a meta element with a refresh http-equiv. Mechanize
|
3
|
+
# treats meta refresh elements just like 'a' tags. MetaRefresh objects will
|
4
|
+
# contain links, but most likely will have no text.
|
5
|
+
|
6
|
+
class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
|
7
|
+
|
8
|
+
attr_reader :delay
|
9
|
+
|
10
|
+
##
|
11
|
+
# Matches the content attribute of a meta refresh element. After the match:
|
12
|
+
#
|
13
|
+
# $1:: delay
|
14
|
+
# $3:: url
|
15
|
+
CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
|
16
|
+
|
17
|
+
##
|
18
|
+
# Parses the delay and url from the content attribute of a meta refresh
|
19
|
+
# element. Parse requires the uri of the current page to infer a url when
|
20
|
+
# no url is specified.
|
21
|
+
#
|
22
|
+
# Returns a MetaRefresh instance.
|
23
|
+
#
|
24
|
+
# Returns nil if the delay and url cannot be parsed.
|
25
|
+
|
26
|
+
def self.parse content, base_uri
|
27
|
+
return unless content =~ CONTENT_REGEXP
|
28
|
+
|
29
|
+
delay, refresh_uri = $1, $3
|
30
|
+
|
31
|
+
dest = base_uri
|
32
|
+
dest += refresh_uri if refresh_uri
|
33
|
+
|
34
|
+
return delay, dest
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.from_node node, page, uri
|
38
|
+
http_equiv = node['http-equiv']
|
39
|
+
return unless http_equiv and http_equiv.downcase == 'refresh'
|
40
|
+
|
41
|
+
delay, uri = parse node['content'], uri
|
42
|
+
|
43
|
+
return unless delay
|
44
|
+
|
45
|
+
new node, page, delay, uri.to_s
|
46
|
+
end
|
47
|
+
|
48
|
+
def initialize node, page, delay, href
|
49
|
+
super node, page.mech, page
|
50
|
+
|
51
|
+
@delay = delay.to_i
|
52
|
+
@href = href
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|