spidr 0.1.9 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/page.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'spidr/extensions/uri'
2
+
1
3
  require 'uri'
2
4
  require 'nokogiri'
3
5
 
@@ -10,15 +12,17 @@ module Spidr
10
12
  # HTTP Response
11
13
  attr_reader :response
12
14
 
13
- # Body returned for the page
14
- attr_reader :body
15
-
16
15
  # Headers returned with the body
17
16
  attr_reader :headers
18
17
 
19
18
  #
20
- # Creates a new Page object from the specified _url_ and HTTP
21
- # _response_.
19
+ # Creates a new Page object.
20
+ #
21
+ # @param [URI::HTTP] url
22
+ # The URL of the page.
23
+ #
24
+ # @param [Net::HTTP::Response] response
25
+ # The response from the request for the page.
22
26
  #
23
27
  def initialize(url,response)
24
28
  @url = url
@@ -28,169 +32,234 @@ module Spidr
28
32
  end
29
33
 
30
34
  #
31
- # Returns the response code from the page.
35
+ # The response code from the page.
36
+ #
37
+ # @return [Integer]
38
+ # Response code from the page.
32
39
  #
33
40
  def code
34
- @response.code
41
+ @response.code.to_i
35
42
  end
36
43
 
37
44
  #
38
- # Returns +true+ if the response code is 200, returns +false+ otherwise.
45
+ # Determines if the response code is +200+.
46
+ #
47
+ # @return [Boolean]
48
+ # Specifies whether the response code is +200+.
39
49
  #
40
50
  def is_ok?
41
51
  code == 200
42
52
  end
43
53
 
54
+ alias ok? is_ok?
55
+
56
+ #
57
+ # Determines if the response code is +301+ or +307+.
44
58
  #
45
- # Returns +true+ if the response code is 301 or 307, returns +false+
46
- # otherwise.
59
+ # @return [Boolean]
60
+ # Specifies whether the response code is +301+ or +307+.
47
61
  #
48
62
  def is_redirect?
49
63
  (code == 301 || code == 307)
50
64
  end
51
65
 
66
+ alias redirect? is_redirect?
67
+
68
+ #
69
+ # Determines if the response code is +308+.
52
70
  #
53
- # Returns +true+ if the response code is 308, returns +false+ otherwise.
71
+ # @return [Boolean]
72
+ # Specifies whether the response code is +308+.
54
73
  #
55
74
  def timedout?
56
75
  code == 308
57
76
  end
58
77
 
59
78
  #
60
- # Returns +true+ if the response code is 400, returns +false+ otherwise.
79
+ # Determines if the response code is +400+.
80
+ #
81
+ # @return [Boolean]
82
+ # Specifies whether the response code is +400+.
61
83
  #
62
84
  def bad_request?
63
85
  code == 400
64
86
  end
65
87
 
66
88
  #
67
- # Returns +true+ if the response code is 401, returns +false+ otherwise.
89
+ # Determines if the response code is +401+.
90
+ #
91
+ # @return [Boolean]
92
+ # Specifies whether the response code is +401+.
68
93
  #
69
94
  def is_unauthorized?
70
95
  code == 401
71
96
  end
72
97
 
98
+ alias unauthorized? is_unauthorized?
99
+
73
100
  #
74
- # Returns +true+ if the response code is 403, returns +false+ otherwise.
101
+ # Determines if the response code is +403+.
102
+ #
103
+ # @return [Boolean]
104
+ # Specifies whether the response code is +403+.
75
105
  #
76
106
  def is_forbidden?
77
107
  code == 403
78
108
  end
79
109
 
110
+ alias forbidden? is_forbidden?
111
+
80
112
  #
81
- # Returns +true+ if the response code is 404, returns +false+ otherwise.
113
+ # Determines if the response code is +404+.
114
+ #
115
+ # @return [Boolean]
116
+ # Specifies whether the response code is +404+.
82
117
  #
83
118
  def is_missing?
84
119
  code == 404
85
120
  end
86
121
 
122
+ alias missing? is_missing?
123
+
87
124
  #
88
- # Returns +true+ if the response code is 500, returns +false+ otherwise.
125
+ # Determines if the response code is +500+.
126
+ #
127
+ # @return [Boolean]
128
+ # Specifies whether the response code is +500+.
89
129
  #
90
130
  def had_internal_server_error?
91
131
  code == 500
92
132
  end
93
133
 
94
134
  #
95
- # Returns the content-type of the page.
135
+ # The Content-Type of the page.
136
+ #
137
+ # @return [String]
138
+ # The Content-Type of the page.
96
139
  #
97
140
  def content_type
98
141
  @response['Content-Type']
99
142
  end
100
143
 
101
144
  #
102
- # Returns +true+ if the page is a plain text document, returns +false+
103
- # otherwise.
145
+ # Determines if the page is plain-text.
146
+ #
147
+ # @return [Boolean]
148
+ # Specifies whether the page is plain-text.
104
149
  #
105
150
  def plain_text?
106
151
  (content_type =~ /text\/plain/) == 0
107
152
  end
108
153
 
154
+ alias txt? plain_text?
155
+
156
+ #
157
+ # Determines if the page is an HTML document.
109
158
  #
110
- # Returns +true+ if the page is a HTML document, returns +false+
111
- # otherwise.
159
+ # @return [Boolean]
160
+ # Specifies whether the page is an HTML document.
112
161
  #
113
162
  def html?
114
163
  (content_type =~ /text\/html/) == 0
115
164
  end
116
165
 
117
166
  #
118
- # Returns +true+ if the page is a XML document, returns +false+
119
- # otherwise.
167
+ # Determines if the page is an XML document.
168
+ #
169
+ # @return [Boolean]
170
+ # Specifies whether the page is an XML document.
120
171
  #
121
172
  def xml?
122
173
  (content_type =~ /text\/xml/) == 0
123
174
  end
124
175
 
125
176
  #
126
- # Returns +true+ if the page is a Javascript file, returns +false+
127
- # otherwise.
177
+ # Determines if the page is JavaScript.
178
+ #
179
+ # @return [Boolean]
180
+ # Specifies whether the page is JavaScript.
128
181
  #
129
182
  def javascript?
130
183
  (content_type =~ /(text|application)\/javascript/) == 0
131
184
  end
132
185
 
133
186
  #
134
- # Returns +true+ if the page is a CSS file, returns +false+
135
- # otherwise.
187
+ # Determines if the page is a CSS stylesheet.
188
+ #
189
+ # @return [Boolean]
190
+ # Specifies whether the page is a CSS stylesheet.
136
191
  #
137
192
  def css?
138
193
  (content_type =~ /text\/css/) == 0
139
194
  end
140
195
 
141
196
  #
142
- # Returns +true+ if the page is a RSS/RDF feed, returns +false+
143
- # otherwise.
197
+ # Determines if the page is an RSS feed.
198
+ #
199
+ # @return [Boolean]
200
+ # Specifies whether the page is an RSS feed.
144
201
  #
145
202
  def rss?
146
203
  (content_type =~ /application\/(rss|rdf)\+xml/) == 0
147
204
  end
148
205
 
149
206
  #
150
- # Returns +true+ if the page is a Atom feed, returns +false+
151
- # otherwise.
207
+ # Determines if the page is an Atom feed.
208
+ #
209
+ # @return [Boolean]
210
+ # Specifies whether the page is an Atom feed.
152
211
  #
153
212
  def atom?
154
213
  (content_type =~ /application\/atom\+xml/) == 0
155
214
  end
156
215
 
157
216
  #
158
- # Returns +true+ if the page is a MS Word document, returns +false+
159
- # otherwise.
217
+ # Determines if the page is a MS Word document.
218
+ #
219
+ # @return [Boolean]
220
+ # Specifies whether the page is a MS Word document.
160
221
  #
161
222
  def ms_word?
162
223
  (content_type =~ /application\/msword/) == 0
163
224
  end
164
225
 
165
226
  #
166
- # Returns +true+ if the page is a PDF document, returns +false+
167
- # otherwise.
227
+ # Determines if the page is a PDF document.
228
+ #
229
+ # @return [Boolean]
230
+ # Specifies whether the page is a PDF document.
168
231
  #
169
232
  def pdf?
170
233
  (content_type =~ /application\/pdf/) == 0
171
234
  end
172
235
 
173
236
  #
174
- # Returns +true+ if the page is a ZIP archive, returns +false+
175
- # otherwise.
237
+ # Determines if the page is a ZIP archive.
238
+ #
239
+ # @return [Boolean]
240
+ # Specifies whether the page is a ZIP archive.
176
241
  #
177
242
  def zip?
178
243
  (content_type =~ /application\/zip/) == 0
179
244
  end
180
245
 
181
246
  #
182
- # Returns the body of the page in +String+ form.
247
+ # The body of the response.
248
+ #
249
+ # @return [String]
250
+ # The body of the response.
183
251
  #
184
252
  def body
185
253
  @response.body
186
254
  end
187
255
 
188
256
  #
189
- # If the page has a <tt>text/html</tt> content-type, a
190
- # Nokogiri::HTML::Document object will be returned. If the page has a
191
- # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
192
- # will be returned. Other content-types will cause +nil+ to be
193
- # returned.
257
+ # Returns a parsed document object for HTML, XML, RSS and Atom pages.
258
+ #
259
+ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
260
+ # The document that represents HTML or XML pages.
261
+ # Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
262
+ # the page could not be parsed properly.
194
263
  #
195
264
  def doc
196
265
  return nil if (body.nil? || body.empty?)
@@ -198,7 +267,7 @@ module Spidr
198
267
  begin
199
268
  if html?
200
269
  return @doc ||= Nokogiri::HTML(body)
201
- elsif xml?
270
+ elsif (xml? || rss? || atom?)
202
271
  return @doc ||= Nokogiri::XML(body)
203
272
  end
204
273
  rescue
@@ -207,7 +276,70 @@ module Spidr
207
276
  end
208
277
 
209
278
  #
210
- # Returns all links from the HTML page.
279
+ # Searches the document using XPath or CSS Path expressions.
280
+ #
281
+ # @param [Array<String>] paths
282
+ # CSS or XPath expressions to search the document with.
283
+ #
284
+ # @return [Array]
285
+ # The matched nodes from the document.
286
+ # Returns an empty Array if no nodes were matched, or if the page
287
+ # is not an HTML or XML document.
288
+ #
289
+ # @example
290
+ # page.search('//a[@href]')
291
+ #
292
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
293
+ #
294
+ def search(*paths)
295
+ if doc
296
+ return doc.search(*paths)
297
+ end
298
+
299
+ return []
300
+ end
301
+
302
+ #
303
+ # Searches for the first occurrence of an XPath or CSS Path expression.
304
+ #
305
+ # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
306
+ # The first matched node. Returns +nil+ if no nodes could be matched,
307
+ # or if the page is not an HTML or XML document.
308
+ #
309
+ # @example
310
+ # page.at('//title')
311
+ #
312
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
313
+ #
314
+ def at(*arguments)
315
+ if doc
316
+ return doc.at(*arguments)
317
+ end
318
+
319
+ return nil
320
+ end
321
+
322
+ alias / search
323
+ alias % at
324
+
325
+ #
326
+ # The title of the HTML page.
327
+ #
328
+ # @return [String]
329
+ # The inner-text of the title element of the page.
330
+ #
331
+ def title
332
+ if (node = at('//title'))
333
+ return node.inner_text
334
+ end
335
+ end
336
+
337
+ #
338
+ # The links from within the page.
339
+ #
340
+ # @return [Array<String>]
341
+ # All links within the HTML page, frame/iframe source URLs and any
342
+ # links in the +Location+ header.
211
343
  #
212
344
  def links
213
345
  urls = []
@@ -218,7 +350,15 @@ module Spidr
218
350
 
219
351
  case code
220
352
  when 300..303, 307
221
- add_url.call(@headers['location'])
353
+ location = @headers['location']
354
+
355
+ if location.kind_of?(Array)
356
+ # handle multiple location URLs
357
+ location.each(&add_url)
358
+ else
359
+ # usually the location header contains a single String
360
+ add_url.call(location)
361
+ end
222
362
  end
223
363
 
224
364
  if (html? && doc)
@@ -239,44 +379,45 @@ module Spidr
239
379
  end
240
380
 
241
381
  #
242
- # Returns all links from the HtML page as absolute URLs.
382
+ # Absolute URIs from within the page.
383
+ #
384
+ # @return [Array<URI::HTTP>]
385
+ # The links from within the page, converted to absolute URIs.
243
386
  #
244
387
  def urls
245
388
  links.map { |link| to_absolute(link) }.compact
246
389
  end
247
390
 
248
- protected
249
-
250
391
  #
251
- # Converts the specified _link_ into an absolute URL
252
- # based on the url of the page.
392
+ # Normalizes and expands a given link into a proper URI.
393
+ #
394
+ # @param [String] link
395
+ # The link to normalize and expand.
396
+ #
397
+ # @return [URI::HTTP]
398
+ # The normalized URI.
253
399
  #
254
400
  def to_absolute(link)
255
- # decode, clean then re-encode the URL
256
- link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
257
-
258
401
  begin
259
- relative = URI(link)
260
- absolute = @url.merge(relative)
261
-
262
- if absolute.path
263
- if absolute.path.empty?
264
- # default the absolute path to '/'
265
- absolute.path = '/'
266
- else
267
- # make sure the path does not contain any .. or . directories.
268
- absolute.path = File.expand_path(absolute.path)
269
- end
270
- end
271
-
272
- return absolute
273
- rescue URI::InvalidURIError => e
402
+ url = @url.merge(link.to_s)
403
+ rescue URI::InvalidURIError
274
404
  return nil
275
405
  end
406
+
407
+ unless (url.path.nil? || url.path.empty?)
408
+ # make sure the path does not contain any .. or . directories,
409
+ # since URI::Generic#merge cannot normalize paths such as
410
+ # "/stuff/../"
411
+ url.path = URI.expand_path(url.path)
412
+ end
413
+
414
+ return url
276
415
  end
277
416
 
417
+ protected
418
+
278
419
  #
279
- # Provides transparent access to the values in the +headers+ +Hash+.
420
+ # Provides transparent access to the values in +headers+.
280
421
  #
281
422
  def method_missing(sym,*args,&block)
282
423
  if (args.empty? && block.nil?)