mechanize 2.1.1 → 2.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mechanize might be problematic. Click here for more details.

@@ -0,0 +1,6 @@
1
+ ##
2
+ # An Image holds downloaded data for an image/* response.
3
+
4
+ class Mechanize::Image < Mechanize::Download
5
+ end
6
+
@@ -186,9 +186,26 @@ class Mechanize::Page < Mechanize::File
186
186
  @meta_content_type || response['content-type']
187
187
  end
188
188
 
189
- # Search through the page like HPricot
189
+ ##
190
+ # :method: search
191
+ #
192
+ # Search for +paths+ in the page using Nokogiri's #search. The +paths+ can
193
+ # be XPath or CSS and an optional Hash of namespaces may be appended.
194
+ #
195
+ # See Nokogiri::XML::Node#search for further details.
196
+
190
197
  def_delegator :parser, :search, :search
191
- def_delegator :parser, :/, :/
198
+
199
+ alias / search
200
+
201
+ ##
202
+ # :method: at
203
+ #
204
+ # Search through the page for +path+ under +namespace+ using Nokogiri's #at.
205
+ # The +path+ may be either a CSS or XPath expression.
206
+ #
207
+ # See also Nokogiri::XML::Node#at
208
+
192
209
  def_delegator :parser, :at, :at
193
210
 
194
211
  ##
@@ -283,6 +300,24 @@ class Mechanize::Page < Mechanize::File
283
300
 
284
301
  elements_with :iframe
285
302
 
303
+ ##
304
+ # :method: image_with(criteria)
305
+ #
306
+ # Find a single image matching +criteria+.
307
+ # Example:
308
+ # page.image_with(:alt => /main/).fetch.save
309
+
310
+ ##
311
+ # :method: images_with(criteria)
312
+ #
313
+ # Find all images matching +criteria+.
314
+ # Example:
315
+ # page.images_with(:src => /jpg\Z/).each do |img|
316
+ # img.fetch.save
317
+ # end
318
+
319
+ elements_with :image
320
+
286
321
  ##
287
322
  # Return a list of all link and area tags
288
323
  def links
@@ -310,7 +345,7 @@ class Mechanize::Page < Mechanize::File
310
345
  query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
311
346
 
312
347
  @meta_refresh ||= search(query).map do |node|
313
- MetaRefresh.from_node node, self, uri
348
+ MetaRefresh.from_node node, self
314
349
  end.compact
315
350
  end
316
351
 
@@ -2,29 +2,179 @@
2
2
  # An image element on an HTML page
3
3
 
4
4
  class Mechanize::Page::Image
5
+
5
6
  attr_reader :node
6
- attr_reader :page
7
+ attr_accessor :page
8
+ attr_accessor :mech
9
+
10
+ ##
11
+ # Creates a new Mechanize::Page::Image from an image +node+ and source
12
+ # +page+.
7
13
 
8
- def initialize(node, page)
14
+ def initialize node, page
9
15
  @node = node
10
16
  @page = page
17
+ @mech = page.mech
18
+ end
19
+
20
+ ##
21
+ # The alt attribute of the image
22
+
23
+ def alt
24
+ node['alt']
25
+ end
26
+
27
+ ##
28
+ # The caption of the image. In order of preference, the #title, #alt, or
29
+ # empty string "".
30
+
31
+ def caption
32
+ title || alt || ''
33
+ end
34
+
35
+ alias :text :caption
36
+
37
+ ##
38
+ # The class attribute of the image
39
+
40
+ def dom_class
41
+ node['class']
42
+ end
43
+
44
+ ##
45
+ # The id attribute of the image
46
+
47
+ def dom_id
48
+ node['id']
49
+ end
50
+
51
+ ##
52
+ # The suffix of the #url. The dot is a part of suffix, not a delimiter.
53
+ #
54
+ # p image.url # => "http://example/test.jpg"
55
+ # p image.extname # => ".jpg"
56
+ #
57
+ # Returns an empty string if #url has no suffix:
58
+ #
59
+ # p image.url # => "http://example/sampleimage"
60
+ # p image.extname # => ""
61
+
62
+ def extname
63
+ return nil unless src
64
+
65
+ File.extname url.path
66
+ end
67
+
68
+ ##
69
+ # Downloads the image.
70
+ #
71
+ # agent.page.image_with(:src => /logo/).fetch.save
72
+ #
73
+ # The referer is:
74
+ #
75
+ # #page("parent") ::
76
+ # all images on http html, relative #src images on https html
77
+ # (no referer) ::
78
+ # absolute #src images on https html
79
+ # user specified ::
80
+ # img.fetch(nil, my_referer_uri_or_page)
81
+
82
+ def fetch parameters = [], referer = nil, headers = {}
83
+ mech.get src, parameters, referer || image_referer, headers
84
+ end
85
+
86
+ ##
87
+ # The height attribute of the image
88
+
89
+ def height
90
+ node['height']
91
+ end
92
+
93
+ def image_referer # :nodoc:
94
+ http_page = page.uri && page.uri.scheme == 'http'
95
+ https_page = page.uri && page.uri.scheme == 'https'
96
+
97
+ case
98
+ when http_page then page
99
+ when https_page && relative? then page
100
+ else
101
+ Mechanize::File.new(nil, { 'content-type' => 'text/plain' }, '', 200)
102
+ end
103
+ end
104
+
105
+ ##
106
+ # MIME type guessed from the image url suffix
107
+ #
108
+ # p image.extname # => ".jpg"
109
+ # p image.mime_type # => "image/jpeg"
110
+ # page.images_with(:mime_type => /gif|jpeg|png/).each do ...
111
+ #
112
+ # Returns nil if url has no (well-known) suffix:
113
+ #
114
+ # p image.url # => "http://example/sampleimage"
115
+ # p image.mime_type # => nil
116
+
117
+ def mime_type
118
+ suffix_without_dot = extname ? extname.sub(/\A\./){''}.downcase : nil
119
+
120
+ Mechanize::Util::DefaultMimeTypes[suffix_without_dot]
11
121
  end
12
122
 
123
+ def pretty_print(q) # :nodoc:
124
+ q.object_group(self) {
125
+ q.breakable; q.pp url
126
+ q.breakable; q.pp caption
127
+ }
128
+ end
129
+
130
+ alias inspect pretty_inspect # :nodoc:
131
+
132
+ def relative? # :nodoc:
133
+ %r{^https?://} !~ src
134
+ end
135
+
136
+ ##
137
+ # The src attribute of the image
138
+
13
139
  def src
14
- @node['src']
140
+ node['src']
15
141
  end
16
142
 
143
+ ##
144
+ # The title attribute of the image
145
+
146
+ def title
147
+ node['title']
148
+ end
149
+
150
+ ##
151
+ # The URL string of this image
152
+
153
+ def to_s
154
+ url.to_s
155
+ end
156
+
157
+ ##
158
+ # URI for this image
159
+
17
160
  def url
18
- case src
19
- when %r{^https?://}
20
- src
21
- else
22
- if page.bases[0]
23
- (page.bases[0].href + src).to_s
161
+ if relative? then
162
+ if page.bases[0] then
163
+ page.bases[0].href + src
24
164
  else
25
- (page.uri + src).to_s
165
+ page.uri + src
26
166
  end
167
+ else
168
+ src
27
169
  end
28
170
  end
171
+
172
+ ##
173
+ # The width attribute of the image
174
+
175
+ def width
176
+ node['width']
177
+ end
178
+
29
179
  end
30
180
 
@@ -63,6 +63,11 @@ class Mechanize::Page::Link
63
63
  rel.include? kind
64
64
  end
65
65
 
66
+ # Test if this link is marked rel="noreferrer" and so should not be traced.
67
+ def noreferrer?
68
+ rel?('noreferrer')
69
+ end
70
+
66
71
  # The text content of this link
67
72
  def text
68
73
  return @text if @text
@@ -22,7 +22,7 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
22
22
  # $1:: delay
23
23
  # $3:: url
24
24
 
25
- CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
25
+ CONTENT_REGEXP = /^\s*(\d+\.?\d*)\s*(?:;(?:\s*url\s*=\s*(['"]?)(\S*)\2)?\s*)?$/i
26
26
 
27
27
  ##
28
28
  # Regexp of unsafe URI characters that excludes % for Issue #177
@@ -30,46 +30,49 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
30
30
  UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&%=+$,\[\]]/
31
31
 
32
32
  ##
33
- # Parses the delay and url from the content attribute of a meta refresh
34
- # element. Parse requires the uri of the current page to infer a url when
35
- # no url is specified.
33
+ # Parses the delay and url from the content attribute of a meta
34
+ # refresh element.
36
35
  #
37
- # Returns an array of [delay, url]. (both in string)
38
- #
39
- # Returns nil if the delay and url cannot be parsed.
40
-
41
- def self.parse content, base_uri
42
- return unless content =~ CONTENT_REGEXP
43
-
44
- link_self = $3.nil? || $3.empty?
45
- delay = $1
46
- refresh_uri = $3
47
- refresh_uri = Mechanize::Util.uri_escape refresh_uri, UNSAFE if refresh_uri
48
-
49
- dest = base_uri
50
- dest += refresh_uri if refresh_uri
51
-
52
- return delay, dest, link_self
36
+ # Returns an array of [delay, url, link_self], where the first two
37
+ # are strings containing the respective parts of the refresh value,
38
+ # and link_self is a boolean value that indicates whether the url
39
+ # part is missing or empty. If base_uri, the URI of the current
40
+ # page, is given, the value of url becomes an absolute URI.
41
+
42
+ def self.parse content, base_uri = nil
43
+ m = CONTENT_REGEXP.match(content) or return
44
+
45
+ delay, url = m[1], m[3]
46
+ url &&= url.empty? ? nil : Mechanize::Util.uri_escape(url, UNSAFE)
47
+ link_self = url.nil?
48
+ if base_uri
49
+ url = url ? base_uri + url : base_uri
50
+ end
51
+
52
+ return delay, url, link_self
53
53
  end
54
54
 
55
- def self.from_node node, page, uri
56
- http_equiv = node['http-equiv']
57
- return unless http_equiv and http_equiv.downcase == 'refresh'
55
+ def self.from_node node, page, uri = nil
56
+ http_equiv = node['http-equiv'] and
57
+ /\ARefresh\z/i =~ http_equiv or return
58
58
 
59
59
  delay, uri, link_self = parse node['content'], uri
60
60
 
61
61
  return unless delay
62
62
 
63
- new node, page, delay, uri.to_s, link_self
63
+ new node, page, delay, uri, link_self
64
64
  end
65
65
 
66
66
  def initialize node, page, delay, href, link_self = false
67
67
  super node, page.mech, page
68
68
 
69
- @delay = delay =~ /\./ ? delay.to_f : delay.to_i
69
+ @delay = delay.include?(?.) ? delay.to_f : delay.to_i
70
70
  @href = href
71
71
  @link_self = link_self
72
72
  end
73
73
 
74
+ def noreferrer?
75
+ true
76
+ end
74
77
  end
75
78
 
@@ -3,13 +3,10 @@ require 'mechanize/file_saver'
3
3
  require 'mechanize/page'
4
4
 
5
5
  ##
6
- # This class is used to register and maintain pluggable parsers for Mechanize
7
- # to use.
8
- #
9
6
  # Mechanize allows different parsers for different content types. Mechanize
10
7
  # uses PluggableParser to determine which parser to use for any content type.
11
- # To use your own pluggable parser or to change the default pluggable parsers,
12
- # register them with this class.
8
+ # To use your own parser or to change the default parsers, register them with
9
+ # this class through Mechanize#pluggable_parser.
13
10
  #
14
11
  # The default parser for unregistered content types is Mechanize::File.
15
12
  #
@@ -22,8 +19,8 @@ require 'mechanize/page'
22
19
  # == Example
23
20
  #
24
21
  # To create your own parser, just create a class that takes four parameters in
25
- # the constructor. Here is an example of registering a pluggable parser that
26
- # handles CSV files:
22
+ # the constructor. Here is an example of registering a parser that handles
23
+ # CSV files:
27
24
  #
28
25
  # require 'csv'
29
26
  #
@@ -43,8 +40,8 @@ require 'mechanize/page'
43
40
  # Now any response with a content type of 'text/csv' will initialize a
44
41
  # CSVParser and return that object to the caller.
45
42
  #
46
- # To register a pluggable parser for a content type that pluggable parser does
47
- # not know about, use the hash syntax:
43
+ # To register a parser for a content type that Mechanize does not know about,
44
+ # use the hash syntax:
48
45
  #
49
46
  # agent.pluggable_parser['text/something'] = SomeClass
50
47
  #
@@ -73,6 +70,7 @@ class Mechanize::PluggableParser
73
70
  CONTENT_TYPES[:html] => Mechanize::Page,
74
71
  CONTENT_TYPES[:xhtml] => Mechanize::Page,
75
72
  CONTENT_TYPES[:wap] => Mechanize::Page,
73
+ 'image' => Mechanize::Image
76
74
  }
77
75
 
78
76
  @default = Mechanize::File
@@ -81,11 +79,24 @@ class Mechanize::PluggableParser
81
79
  ##
82
80
  # Returns the parser registered for the given +content_type+
83
81
 
84
- def parser(content_type)
85
- content_type.nil? ? default : @parsers[content_type] || default
82
+ def parser content_type
83
+ return default unless content_type
84
+
85
+ parser = @parsers[content_type]
86
+
87
+ return parser if parser
88
+
89
+ mime_type = MIME::Type.new content_type
90
+
91
+ parser = @parsers[mime_type.to_s] ||
92
+ @parsers[mime_type.simplified] ||
93
+ @parsers[mime_type.media_type] ||
94
+ default
95
+ rescue MIME::InvalidContentType
96
+ default
86
97
  end
87
98
 
88
- def register_parser(content_type, klass) # :nodoc:
99
+ def register_parser content_type, klass # :nodoc:
89
100
  @parsers[content_type] = klass
90
101
  end
91
102
 
@@ -135,9 +146,12 @@ class Mechanize::PluggableParser
135
146
 
136
147
  ##
137
148
  # Sets the parser for +content_type+ content to +klass+
149
+ #
150
+ # The +content_type+ may be a full MIME type, a simplified MIME type
151
+ # ('text/x-csv' simplifies to 'text/csv'), or a media type like 'image'.
138
152
 
139
- def []=(content_type, klass)
140
- @parsers[content_type] = klass
153
+ def []= content_type, klass
154
+ register_parser content_type, klass
141
155
  end
142
156
 
143
157
  end
@@ -21,6 +21,12 @@ class Mechanize::Util
21
21
  [Iconv::InvalidEncoding, Iconv::IllegalSequence]
22
22
  end
23
23
 
24
+ # Default MIME type data for Page::Image#mime_type.
25
+ # You can use another Apache-compatible mimetab.
26
+ # mimetab = WEBrick::HTTPUtils.load_mime_types('/etc/mime.types')
27
+ # Mechanize::Util::DefaultMimeTypes.replace(mimetab)
28
+ DefaultMimeTypes = WEBrick::HTTPUtils::DefaultMimeTypes
29
+
24
30
  def self.build_query_string(parameters, enc=nil)
25
31
  parameters.map { |k,v|
26
32
  # WEBrick::HTTP.escape* has some problems about m17n on ruby-1.9.*.