spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spidr/page.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'spidr/extensions/uri'
2
+
1
3
  require 'uri'
2
4
  require 'nokogiri'
3
5
 
@@ -10,15 +12,17 @@ module Spidr
10
12
  # HTTP Response
11
13
  attr_reader :response
12
14
 
13
- # Body returned for the page
14
- attr_reader :body
15
-
16
15
  # Headers returned with the body
17
16
  attr_reader :headers
18
17
 
19
18
  #
20
- # Creates a new Page object from the specified _url_ and HTTP
21
- # _response_.
19
+ # Creates a new Page object.
20
+ #
21
+ # @param [URI::HTTP] url
22
+ # The URL of the page.
23
+ #
24
+ # @param [Net::HTTP::Response] response
25
+ # The response from the request for the page.
22
26
  #
23
27
  def initialize(url,response)
24
28
  @url = url
@@ -28,169 +32,234 @@ module Spidr
28
32
  end
29
33
 
30
34
  #
31
- # Returns the response code from the page.
35
+ # The response code from the page.
36
+ #
37
+ # @return [Integer]
38
+ # Response code from the page.
32
39
  #
33
40
  def code
34
- @response.code
41
+ @response.code.to_i
35
42
  end
36
43
 
37
44
  #
38
- # Returns +true+ if the response code is 200, returns +false+ otherwise.
45
+ # Determines if the response code is +200+.
46
+ #
47
+ # @return [Boolean]
48
+ # Specifies whether the response code is +200+.
39
49
  #
40
50
  def is_ok?
41
51
  code == 200
42
52
  end
43
53
 
54
+ alias ok? is_ok?
55
+
56
+ #
57
+ # Determines if the response code is +301+ or +307+.
44
58
  #
45
- # Returns +true+ if the response code is 301 or 307, returns +false+
46
- # otherwise.
59
+ # @return [Boolean]
60
+ # Specifies whether the response code is +301+ or +307+.
47
61
  #
48
62
  def is_redirect?
49
63
  (code == 301 || code == 307)
50
64
  end
51
65
 
66
+ alias redirect? is_redirect?
67
+
68
+ #
69
+ # Determines if the response code is +308+.
52
70
  #
53
- # Returns +true+ if the response code is 308, returns +false+ otherwise.
71
+ # @return [Boolean]
72
+ # Specifies whether the response code is +308+.
54
73
  #
55
74
  def timedout?
56
75
  code == 308
57
76
  end
58
77
 
59
78
  #
60
- # Returns +true+ if the response code is 400, returns +false+ otherwise.
79
+ # Determines if the response code is +400+.
80
+ #
81
+ # @return [Boolean]
82
+ # Specifies whether the response code is +400+.
61
83
  #
62
84
  def bad_request?
63
85
  code == 400
64
86
  end
65
87
 
66
88
  #
67
- # Returns +true+ if the response code is 401, returns +false+ otherwise.
89
+ # Determines if the response code is +401+.
90
+ #
91
+ # @return [Boolean]
92
+ # Specifies whether the response code is +401+.
68
93
  #
69
94
  def is_unauthorized?
70
95
  code == 401
71
96
  end
72
97
 
98
+ alias unauthorized? is_unauthorized?
99
+
73
100
  #
74
- # Returns +true+ if the response code is 403, returns +false+ otherwise.
101
+ # Determines if the response code is +403+.
102
+ #
103
+ # @return [Boolean]
104
+ # Specifies whether the response code is +403+.
75
105
  #
76
106
  def is_forbidden?
77
107
  code == 403
78
108
  end
79
109
 
110
+ alias forbidden? is_forbidden?
111
+
80
112
  #
81
- # Returns +true+ if the response code is 404, returns +false+ otherwise.
113
+ # Determines if the response code is +404+.
114
+ #
115
+ # @return [Boolean]
116
+ # Specifies whether the response code is +404+.
82
117
  #
83
118
  def is_missing?
84
119
  code == 404
85
120
  end
86
121
 
122
+ alias missing? is_missing?
123
+
87
124
  #
88
- # Returns +true+ if the response code is 500, returns +false+ otherwise.
125
+ # Determines if the response code is +500+.
126
+ #
127
+ # @return [Boolean]
128
+ # Specifies whether the response code is +500+.
89
129
  #
90
130
  def had_internal_server_error?
91
131
  code == 500
92
132
  end
93
133
 
94
134
  #
95
- # Returns the content-type of the page.
135
+ # The Content-Type of the page.
136
+ #
137
+ # @return [String]
138
+ # The Content-Type of the page.
96
139
  #
97
140
  def content_type
98
141
  @response['Content-Type']
99
142
  end
100
143
 
101
144
  #
102
- # Returns +true+ if the page is a plain text document, returns +false+
103
- # otherwise.
145
+ # Determines if the page is plain-text.
146
+ #
147
+ # @return [Boolean]
148
+ # Specifies whether the page is plain-text.
104
149
  #
105
150
  def plain_text?
106
151
  (content_type =~ /text\/plain/) == 0
107
152
  end
108
153
 
154
+ alias txt? plain_text?
155
+
156
+ #
157
+ # Determines if the page is an HTML document.
109
158
  #
110
- # Returns +true+ if the page is a HTML document, returns +false+
111
- # otherwise.
159
+ # @return [Boolean]
160
+ # Specifies whether the page is an HTML document.
112
161
  #
113
162
  def html?
114
163
  (content_type =~ /text\/html/) == 0
115
164
  end
116
165
 
117
166
  #
118
- # Returns +true+ if the page is a XML document, returns +false+
119
- # otherwise.
167
+ # Determines if the page is an XML document.
168
+ #
169
+ # @return [Boolean]
170
+ # Specifies whether the page is an XML document.
120
171
  #
121
172
  def xml?
122
173
  (content_type =~ /text\/xml/) == 0
123
174
  end
124
175
 
125
176
  #
126
- # Returns +true+ if the page is a Javascript file, returns +false+
127
- # otherwise.
177
+ # Determines if the page is JavaScript.
178
+ #
179
+ # @return [Boolean]
180
+ # Specifies whether the page is JavaScript.
128
181
  #
129
182
  def javascript?
130
183
  (content_type =~ /(text|application)\/javascript/) == 0
131
184
  end
132
185
 
133
186
  #
134
- # Returns +true+ if the page is a CSS file, returns +false+
135
- # otherwise.
187
+ # Determines if the page is a CSS stylesheet.
188
+ #
189
+ # @return [Boolean]
190
+ # Specifies whether the page is a CSS stylesheet.
136
191
  #
137
192
  def css?
138
193
  (content_type =~ /text\/css/) == 0
139
194
  end
140
195
 
141
196
  #
142
- # Returns +true+ if the page is a RSS/RDF feed, returns +false+
143
- # otherwise.
197
+ # Determines if the page is an RSS feed.
198
+ #
199
+ # @return [Boolean]
200
+ # Specifies whether the page is an RSS feed.
144
201
  #
145
202
  def rss?
146
203
  (content_type =~ /application\/(rss|rdf)\+xml/) == 0
147
204
  end
148
205
 
149
206
  #
150
- # Returns +true+ if the page is a Atom feed, returns +false+
151
- # otherwise.
207
+ # Determines if the page is an Atom feed.
208
+ #
209
+ # @return [Boolean]
210
+ # Specifies whether the page is an Atom feed.
152
211
  #
153
212
  def atom?
154
213
  (content_type =~ /application\/atom\+xml/) == 0
155
214
  end
156
215
 
157
216
  #
158
- # Returns +true+ if the page is a MS Word document, returns +false+
159
- # otherwise.
217
+ # Determines if the page is a MS Word document.
218
+ #
219
+ # @return [Boolean]
220
+ # Specifies whether the page is a MS Word document.
160
221
  #
161
222
  def ms_word?
162
223
  (content_type =~ /application\/msword/) == 0
163
224
  end
164
225
 
165
226
  #
166
- # Returns +true+ if the page is a PDF document, returns +false+
167
- # otherwise.
227
+ # Determines if the page is a PDF document.
228
+ #
229
+ # @return [Boolean]
230
+ # Specifies whether the page is a PDF document.
168
231
  #
169
232
  def pdf?
170
233
  (content_type =~ /application\/pdf/) == 0
171
234
  end
172
235
 
173
236
  #
174
- # Returns +true+ if the page is a ZIP archive, returns +false+
175
- # otherwise.
237
+ # Determines if the page is a ZIP archive.
238
+ #
239
+ # @return [Boolean]
240
+ # Specifies whether the page is a ZIP archive.
176
241
  #
177
242
  def zip?
178
243
  (content_type =~ /application\/zip/) == 0
179
244
  end
180
245
 
181
246
  #
182
- # Returns the body of the page in +String+ form.
247
+ # The body of the response.
248
+ #
249
+ # @return [String]
250
+ # The body of the response.
183
251
  #
184
252
  def body
185
253
  @response.body
186
254
  end
187
255
 
188
256
  #
189
- # If the page has a <tt>text/html</tt> content-type, a
190
- # Nokogiri::HTML::Document object will be returned. If the page has a
191
- # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
192
- # will be returned. Other content-types will cause +nil+ to be
193
- # returned.
257
+ # Returns a parsed document object for HTML, XML, RSS and Atom pages.
258
+ #
259
+ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
260
+ # The document that represents HTML or XML pages.
261
+ # Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
262
+ # the page could not be parsed properly.
194
263
  #
195
264
  def doc
196
265
  return nil if (body.nil? || body.empty?)
@@ -198,7 +267,7 @@ module Spidr
198
267
  begin
199
268
  if html?
200
269
  return @doc ||= Nokogiri::HTML(body)
201
- elsif xml?
270
+ elsif (xml? || rss? || atom?)
202
271
  return @doc ||= Nokogiri::XML(body)
203
272
  end
204
273
  rescue
@@ -207,7 +276,70 @@ module Spidr
207
276
  end
208
277
 
209
278
  #
210
- # Returns all links from the HTML page.
279
+ # Searches the document using XPath or CSS Path expressions.
280
+ #
281
+ # @param [Array<String>] paths
282
+ # CSS or XPath expressions to search the document with.
283
+ #
284
+ # @return [Array]
285
+ # The matched nodes from the document.
286
+ # Returns an empty Array if no nodes were matched, or if the page
287
+ # is not an HTML or XML document.
288
+ #
289
+ # @example
290
+ # page.search('//a[@href]')
291
+ #
292
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
293
+ #
294
+ def search(*paths)
295
+ if doc
296
+ return doc.search(*paths)
297
+ end
298
+
299
+ return []
300
+ end
301
+
302
+ #
303
+ # Searches for the first occurrence of an XPath or CSS Path expression.
304
+ #
305
+ # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
306
+ # The first matched node. Returns +nil+ if no nodes could be matched,
307
+ # or if the page is not an HTML or XML document.
308
+ #
309
+ # @example
310
+ # page.at('//title')
311
+ #
312
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
313
+ #
314
+ def at(*arguments)
315
+ if doc
316
+ return doc.at(*arguments)
317
+ end
318
+
319
+ return nil
320
+ end
321
+
322
+ alias / search
323
+ alias % at
324
+
325
+ #
326
+ # The title of the HTML page.
327
+ #
328
+ # @return [String]
329
+ # The inner-text of the title element of the page.
330
+ #
331
+ def title
332
+ if (node = at('//title'))
333
+ return node.inner_text
334
+ end
335
+ end
336
+
337
+ #
338
+ # The links from within the page.
339
+ #
340
+ # @return [Array<String>]
341
+ # All links within the HTML page, frame/iframe source URLs and any
342
+ # links in the +Location+ header.
211
343
  #
212
344
  def links
213
345
  urls = []
@@ -218,7 +350,15 @@ module Spidr
218
350
 
219
351
  case code
220
352
  when 300..303, 307
221
- add_url.call(@headers['location'])
353
+ location = @headers['location']
354
+
355
+ if location.kind_of?(Array)
356
+ # handle multiple location URLs
357
+ location.each(&add_url)
358
+ else
359
+ # usually the location header contains a single String
360
+ add_url.call(location)
361
+ end
222
362
  end
223
363
 
224
364
  if (html? && doc)
@@ -239,44 +379,45 @@ module Spidr
239
379
  end
240
380
 
241
381
  #
242
- # Returns all links from the HtML page as absolute URLs.
382
+ # Absolute URIs from within the page.
383
+ #
384
+ # @return [Array<URI::HTTP>]
385
+ # The links from within the page, converted to absolute URIs.
243
386
  #
244
387
  def urls
245
388
  links.map { |link| to_absolute(link) }.compact
246
389
  end
247
390
 
248
- protected
249
-
250
391
  #
251
- # Converts the specified _link_ into an absolute URL
252
- # based on the url of the page.
392
+ # Normalizes and expands a given link into a proper URI.
393
+ #
394
+ # @param [String] link
395
+ # The link to normalize and expand.
396
+ #
397
+ # @return [URI::HTTP]
398
+ # The normalized URI.
253
399
  #
254
400
  def to_absolute(link)
255
- # decode, clean then re-encode the URL
256
- link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
257
-
258
401
  begin
259
- relative = URI(link)
260
- absolute = @url.merge(relative)
261
-
262
- if absolute.path
263
- if absolute.path.empty?
264
- # default the absolute path to '/'
265
- absolute.path = '/'
266
- else
267
- # make sure the path does not contain any .. or . directories.
268
- absolute.path = File.expand_path(absolute.path)
269
- end
270
- end
271
-
272
- return absolute
273
- rescue URI::InvalidURIError => e
402
+ url = @url.merge(link.to_s)
403
+ rescue URI::InvalidURIError
274
404
  return nil
275
405
  end
406
+
407
+ unless (url.path.nil? || url.path.empty?)
408
+ # make sure the path does not contain any .. or . directories,
409
+ # since URI::Generic#merge cannot normalize paths such as
410
+ # "/stuff/../"
411
+ url.path = URI.expand_path(url.path)
412
+ end
413
+
414
+ return url
276
415
  end
277
416
 
417
+ protected
418
+
278
419
  #
279
- # Provides transparent access to the values in the +headers+ +Hash+.
420
+ # Provides transparent access to the values in +headers+.
280
421
  #
281
422
  def method_missing(sym,*args,&block)
282
423
  if (args.empty? && block.nil?)