mechanize 2.1.1 → 2.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +36 -1
- data/EXAMPLES.rdoc +23 -18
- data/GUIDE.rdoc +10 -5
- data/Manifest.txt +4 -0
- data/Rakefile +2 -1
- data/lib/mechanize.rb +88 -18
- data/lib/mechanize/file_request.rb +4 -0
- data/lib/mechanize/file_saver.rb +3 -3
- data/lib/mechanize/http/agent.rb +155 -114
- data/lib/mechanize/image.rb +6 -0
- data/lib/mechanize/page.rb +38 -3
- data/lib/mechanize/page/image.rb +160 -10
- data/lib/mechanize/page/link.rb +5 -0
- data/lib/mechanize/page/meta_refresh.rb +28 -25
- data/lib/mechanize/pluggable_parsers.rb +28 -14
- data/lib/mechanize/util.rb +6 -0
- data/test/htdocs/tc_links.html +2 -0
- data/test/test_mechanize.rb +39 -10
- data/test/test_mechanize_directory_saver.rb +49 -0
- data/test/test_mechanize_file_request.rb +14 -8
- data/test/test_mechanize_http_agent.rb +391 -370
- data/test/test_mechanize_image.rb +8 -0
- data/test/test_mechanize_link.rb +8 -0
- data/test/test_mechanize_page.rb +11 -10
- data/test/test_mechanize_page_image.rb +183 -0
- data/test/test_mechanize_page_meta_refresh.rb +20 -4
- data/test/test_mechanize_pluggable_parser.rb +15 -0
- metadata +56 -27
- metadata.gz.sig +0 -0
data/lib/mechanize/page.rb
CHANGED
@@ -186,9 +186,26 @@ class Mechanize::Page < Mechanize::File
|
|
186
186
|
@meta_content_type || response['content-type']
|
187
187
|
end
|
188
188
|
|
189
|
-
|
189
|
+
##
|
190
|
+
# :method: search
|
191
|
+
#
|
192
|
+
# Search for +paths+ in the page using Nokogiri's #search. The +paths+ can
|
193
|
+
# be XPath or CSS and an optional Hash of namespaces may be appended.
|
194
|
+
#
|
195
|
+
# See Nokogiri::XML::Node#search for further details.
|
196
|
+
|
190
197
|
def_delegator :parser, :search, :search
|
191
|
-
|
198
|
+
|
199
|
+
alias / search
|
200
|
+
|
201
|
+
##
|
202
|
+
# :method: at
|
203
|
+
#
|
204
|
+
# Search through the page for +path+ under +namespace+ using Nokogiri's #at.
|
205
|
+
# The +path+ may be either a CSS or XPath expression.
|
206
|
+
#
|
207
|
+
# See also Nokogiri::XML::Node#at
|
208
|
+
|
192
209
|
def_delegator :parser, :at, :at
|
193
210
|
|
194
211
|
##
|
@@ -283,6 +300,24 @@ class Mechanize::Page < Mechanize::File
|
|
283
300
|
|
284
301
|
elements_with :iframe
|
285
302
|
|
303
|
+
##
|
304
|
+
# :method: image_with(criteria)
|
305
|
+
#
|
306
|
+
# Find a single image matching +criteria+.
|
307
|
+
# Example:
|
308
|
+
# page.image_with(:alt => /main/).fetch.save
|
309
|
+
|
310
|
+
##
|
311
|
+
# :method: images_with(criteria)
|
312
|
+
#
|
313
|
+
# Find all images matching +criteria+.
|
314
|
+
# Example:
|
315
|
+
# page.images_with(:src => /jpg\Z/).each do |img|
|
316
|
+
# img.fetch.save
|
317
|
+
# end
|
318
|
+
|
319
|
+
elements_with :image
|
320
|
+
|
286
321
|
##
|
287
322
|
# Return a list of all link and area tags
|
288
323
|
def links
|
@@ -310,7 +345,7 @@ class Mechanize::Page < Mechanize::File
|
|
310
345
|
query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
|
311
346
|
|
312
347
|
@meta_refresh ||= search(query).map do |node|
|
313
|
-
MetaRefresh.from_node node, self
|
348
|
+
MetaRefresh.from_node node, self
|
314
349
|
end.compact
|
315
350
|
end
|
316
351
|
|
data/lib/mechanize/page/image.rb
CHANGED
@@ -2,29 +2,179 @@
|
|
2
2
|
# An image element on an HTML page
|
3
3
|
|
4
4
|
class Mechanize::Page::Image
|
5
|
+
|
5
6
|
attr_reader :node
|
6
|
-
|
7
|
+
attr_accessor :page
|
8
|
+
attr_accessor :mech
|
9
|
+
|
10
|
+
##
|
11
|
+
# Creates a new Mechanize::Page::Image from an image +node+ and source
|
12
|
+
# +page+.
|
7
13
|
|
8
|
-
def initialize
|
14
|
+
def initialize node, page
|
9
15
|
@node = node
|
10
16
|
@page = page
|
17
|
+
@mech = page.mech
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
# The alt attribute of the image
|
22
|
+
|
23
|
+
def alt
|
24
|
+
node['alt']
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# The caption of the image. In order of preference, the #title, #alt, or
|
29
|
+
# empty string "".
|
30
|
+
|
31
|
+
def caption
|
32
|
+
title || alt || ''
|
33
|
+
end
|
34
|
+
|
35
|
+
alias :text :caption
|
36
|
+
|
37
|
+
##
|
38
|
+
# The class attribute of the image
|
39
|
+
|
40
|
+
def dom_class
|
41
|
+
node['class']
|
42
|
+
end
|
43
|
+
|
44
|
+
##
|
45
|
+
# The id attribute of the image
|
46
|
+
|
47
|
+
def dom_id
|
48
|
+
node['id']
|
49
|
+
end
|
50
|
+
|
51
|
+
##
|
52
|
+
# The suffix of the #url. The dot is part of the suffix, not a delimiter.
|
53
|
+
#
|
54
|
+
# p image.url # => "http://example/test.jpg"
|
55
|
+
# p image.extname # => ".jpg"
|
56
|
+
#
|
57
|
+
# Returns an empty string if #url has no suffix:
|
58
|
+
#
|
59
|
+
# p image.url # => "http://example/sampleimage"
|
60
|
+
# p image.extname # => ""
|
61
|
+
|
62
|
+
def extname
|
63
|
+
return nil unless src
|
64
|
+
|
65
|
+
File.extname url.path
|
66
|
+
end
|
67
|
+
|
68
|
+
##
|
69
|
+
# Downloads the image.
|
70
|
+
#
|
71
|
+
# agent.page.image_with(:src => /logo/).fetch.save
|
72
|
+
#
|
73
|
+
# The referer is:
|
74
|
+
#
|
75
|
+
# #page("parent") ::
|
76
|
+
# all images on http html, relative #src images on https html
|
77
|
+
# (no referer) ::
|
78
|
+
# absolute #src images on https html
|
79
|
+
# user specified ::
|
80
|
+
# img.fetch(nil, my_referer_uri_or_page)
|
81
|
+
|
82
|
+
def fetch parameters = [], referer = nil, headers = {}
|
83
|
+
mech.get src, parameters, referer || image_referer, headers
|
84
|
+
end
|
85
|
+
|
86
|
+
##
|
87
|
+
# The height attribute of the image
|
88
|
+
|
89
|
+
def height
|
90
|
+
node['height']
|
91
|
+
end
|
92
|
+
|
93
|
+
def image_referer # :nodoc:
|
94
|
+
http_page = page.uri && page.uri.scheme == 'http'
|
95
|
+
https_page = page.uri && page.uri.scheme == 'https'
|
96
|
+
|
97
|
+
case
|
98
|
+
when http_page then page
|
99
|
+
when https_page && relative? then page
|
100
|
+
else
|
101
|
+
Mechanize::File.new(nil, { 'content-type' => 'text/plain' }, '', 200)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
##
|
106
|
+
# MIME type guessed from the image url suffix
|
107
|
+
#
|
108
|
+
# p image.extname # => ".jpg"
|
109
|
+
# p image.mime_type # => "image/jpeg"
|
110
|
+
# page.images_with(:mime_type => /gif|jpeg|png/).each do ...
|
111
|
+
#
|
112
|
+
# Returns nil if url has no (well-known) suffix:
|
113
|
+
#
|
114
|
+
# p image.url # => "http://example/sampleimage"
|
115
|
+
# p image.mime_type # => nil
|
116
|
+
|
117
|
+
def mime_type
|
118
|
+
suffix_without_dot = extname ? extname.sub(/\A\./){''}.downcase : nil
|
119
|
+
|
120
|
+
Mechanize::Util::DefaultMimeTypes[suffix_without_dot]
|
11
121
|
end
|
12
122
|
|
123
|
+
def pretty_print(q) # :nodoc:
|
124
|
+
q.object_group(self) {
|
125
|
+
q.breakable; q.pp url
|
126
|
+
q.breakable; q.pp caption
|
127
|
+
}
|
128
|
+
end
|
129
|
+
|
130
|
+
alias inspect pretty_inspect # :nodoc:
|
131
|
+
|
132
|
+
def relative? # :nodoc:
|
133
|
+
%r{^https?://} !~ src
|
134
|
+
end
|
135
|
+
|
136
|
+
##
|
137
|
+
# The src attribute of the image
|
138
|
+
|
13
139
|
def src
|
14
|
-
|
140
|
+
node['src']
|
15
141
|
end
|
16
142
|
|
143
|
+
##
|
144
|
+
# The title attribute of the image
|
145
|
+
|
146
|
+
def title
|
147
|
+
node['title']
|
148
|
+
end
|
149
|
+
|
150
|
+
##
|
151
|
+
# The URL string of this image
|
152
|
+
|
153
|
+
def to_s
|
154
|
+
url.to_s
|
155
|
+
end
|
156
|
+
|
157
|
+
##
|
158
|
+
# URI for this image
|
159
|
+
|
17
160
|
def url
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
else
|
22
|
-
if page.bases[0]
|
23
|
-
(page.bases[0].href + src).to_s
|
161
|
+
if relative? then
|
162
|
+
if page.bases[0] then
|
163
|
+
page.bases[0].href + src
|
24
164
|
else
|
25
|
-
|
165
|
+
page.uri + src
|
26
166
|
end
|
167
|
+
else
|
168
|
+
src
|
27
169
|
end
|
28
170
|
end
|
171
|
+
|
172
|
+
##
|
173
|
+
# The width attribute of the image
|
174
|
+
|
175
|
+
def width
|
176
|
+
node['width']
|
177
|
+
end
|
178
|
+
|
29
179
|
end
|
30
180
|
|
data/lib/mechanize/page/link.rb
CHANGED
@@ -22,7 +22,7 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
|
|
22
22
|
# $1:: delay
|
23
23
|
# $3:: url
|
24
24
|
|
25
|
-
CONTENT_REGEXP = /^\s*(\d+\.?\d*)(
|
25
|
+
CONTENT_REGEXP = /^\s*(\d+\.?\d*)\s*(?:;(?:\s*url\s*=\s*(['"]?)(\S*)\2)?\s*)?$/i
|
26
26
|
|
27
27
|
##
|
28
28
|
# Regexp of unsafe URI characters that excludes % for Issue #177
|
@@ -30,46 +30,49 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
|
|
30
30
|
UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&%=+$,\[\]]/
|
31
31
|
|
32
32
|
##
|
33
|
-
# Parses the delay and url from the content attribute of a meta
|
34
|
-
# element.
|
35
|
-
# no url is specified.
|
33
|
+
# Parses the delay and url from the content attribute of a meta
|
34
|
+
# refresh element.
|
36
35
|
#
|
37
|
-
# Returns an array of [delay, url]
|
38
|
-
#
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
36
|
+
# Returns an array of [delay, url, link_self], where the first two
|
37
|
+
# are strings containing the respective parts of the refresh value,
|
38
|
+
# and link_self is a boolean value that indicates whether the url
|
39
|
+
# part is missing or empty. If base_uri, the URI of the current
|
40
|
+
# page, is given, the value of url becomes an absolute URI.
|
41
|
+
|
42
|
+
def self.parse content, base_uri = nil
|
43
|
+
m = CONTENT_REGEXP.match(content) or return
|
44
|
+
|
45
|
+
delay, url = m[1], m[3]
|
46
|
+
url &&= url.empty? ? nil : Mechanize::Util.uri_escape(url, UNSAFE)
|
47
|
+
link_self = url.nil?
|
48
|
+
if base_uri
|
49
|
+
url = url ? base_uri + url : base_uri
|
50
|
+
end
|
51
|
+
|
52
|
+
return delay, url, link_self
|
53
53
|
end
|
54
54
|
|
55
|
-
def self.from_node node, page, uri
|
56
|
-
http_equiv = node['http-equiv']
|
57
|
-
|
55
|
+
def self.from_node node, page, uri = nil
|
56
|
+
http_equiv = node['http-equiv'] and
|
57
|
+
/\ARefresh\z/i =~ http_equiv or return
|
58
58
|
|
59
59
|
delay, uri, link_self = parse node['content'], uri
|
60
60
|
|
61
61
|
return unless delay
|
62
62
|
|
63
|
-
new node, page, delay, uri
|
63
|
+
new node, page, delay, uri, link_self
|
64
64
|
end
|
65
65
|
|
66
66
|
def initialize node, page, delay, href, link_self = false
|
67
67
|
super node, page.mech, page
|
68
68
|
|
69
|
-
@delay = delay
|
69
|
+
@delay = delay.include?(?.) ? delay.to_f : delay.to_i
|
70
70
|
@href = href
|
71
71
|
@link_self = link_self
|
72
72
|
end
|
73
73
|
|
74
|
+
def noreferrer?
|
75
|
+
true
|
76
|
+
end
|
74
77
|
end
|
75
78
|
|
@@ -3,13 +3,10 @@ require 'mechanize/file_saver'
|
|
3
3
|
require 'mechanize/page'
|
4
4
|
|
5
5
|
##
|
6
|
-
# This class is used to register and maintain pluggable parsers for Mechanize
|
7
|
-
# to use.
|
8
|
-
#
|
9
6
|
# Mechanize allows different parsers for different content types. Mechanize
|
10
7
|
# uses PluggableParser to determine which parser to use for any content type.
|
11
|
-
# To use your own
|
12
|
-
#
|
8
|
+
# To use your own parser or to change the default parsers, register them with
|
9
|
+
# this class through Mechanize#pluggable_parser.
|
13
10
|
#
|
14
11
|
# The default parser for unregistered content types is Mechanize::File.
|
15
12
|
#
|
@@ -22,8 +19,8 @@ require 'mechanize/page'
|
|
22
19
|
# == Example
|
23
20
|
#
|
24
21
|
# To create your own parser, just create a class that takes four parameters in
|
25
|
-
# the constructor. Here is an example of registering a
|
26
|
-
#
|
22
|
+
# the constructor. Here is an example of registering a parser that handles
|
23
|
+
# CSV files:
|
27
24
|
#
|
28
25
|
# require 'csv'
|
29
26
|
#
|
@@ -43,8 +40,8 @@ require 'mechanize/page'
|
|
43
40
|
# Now any response with a content type of 'text/csv' will initialize a
|
44
41
|
# CSVParser and return that object to the caller.
|
45
42
|
#
|
46
|
-
# To register a
|
47
|
-
#
|
43
|
+
# To register a parser for a content type that Mechanize does not know about,
|
44
|
+
# use the hash syntax:
|
48
45
|
#
|
49
46
|
# agent.pluggable_parser['text/something'] = SomeClass
|
50
47
|
#
|
@@ -73,6 +70,7 @@ class Mechanize::PluggableParser
|
|
73
70
|
CONTENT_TYPES[:html] => Mechanize::Page,
|
74
71
|
CONTENT_TYPES[:xhtml] => Mechanize::Page,
|
75
72
|
CONTENT_TYPES[:wap] => Mechanize::Page,
|
73
|
+
'image' => Mechanize::Image
|
76
74
|
}
|
77
75
|
|
78
76
|
@default = Mechanize::File
|
@@ -81,11 +79,24 @@ class Mechanize::PluggableParser
|
|
81
79
|
##
|
82
80
|
# Returns the parser registered for the given +content_type+
|
83
81
|
|
84
|
-
def parser
|
85
|
-
|
82
|
+
def parser content_type
|
83
|
+
return default unless content_type
|
84
|
+
|
85
|
+
parser = @parsers[content_type]
|
86
|
+
|
87
|
+
return parser if parser
|
88
|
+
|
89
|
+
mime_type = MIME::Type.new content_type
|
90
|
+
|
91
|
+
parser = @parsers[mime_type.to_s] ||
|
92
|
+
@parsers[mime_type.simplified] ||
|
93
|
+
@parsers[mime_type.media_type] ||
|
94
|
+
default
|
95
|
+
rescue MIME::InvalidContentType
|
96
|
+
default
|
86
97
|
end
|
87
98
|
|
88
|
-
def register_parser
|
99
|
+
def register_parser content_type, klass # :nodoc:
|
89
100
|
@parsers[content_type] = klass
|
90
101
|
end
|
91
102
|
|
@@ -135,9 +146,12 @@ class Mechanize::PluggableParser
|
|
135
146
|
|
136
147
|
##
|
137
148
|
# Sets the parser for +content_type+ content to +klass+
|
149
|
+
#
|
150
|
+
# The +content_type+ may be a full MIME type, a simplified MIME type
|
151
|
+
# ('text/x-csv' simplifies to 'text/csv') or a media type like 'image'.
|
138
152
|
|
139
|
-
def []=
|
140
|
-
|
153
|
+
def []= content_type, klass
|
154
|
+
register_parser content_type, klass
|
141
155
|
end
|
142
156
|
|
143
157
|
end
|
data/lib/mechanize/util.rb
CHANGED
@@ -21,6 +21,12 @@ class Mechanize::Util
|
|
21
21
|
[Iconv::InvalidEncoding, Iconv::IllegalSequence]
|
22
22
|
end
|
23
23
|
|
24
|
+
# Default MIME type data for Page::Image#mime_type.
|
25
|
+
# You can use another Apache-compatible mimetab.
|
26
|
+
# mimetab = WEBrick::HTTPUtils.load_mime_types('/etc/mime.types')
|
27
|
+
# Mechanize::Util::DefaultMimeTypes.replace(mimetab)
|
28
|
+
DefaultMimeTypes = WEBrick::HTTPUtils::DefaultMimeTypes
|
29
|
+
|
24
30
|
def self.build_query_string(parameters, enc=nil)
|
25
31
|
parameters.map { |k,v|
|
26
32
|
# WEBrick::HTTP.escape* has some problems about m17n on ruby-1.9.*.
|