spidr 0.2.7 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/page.rb CHANGED
@@ -1,8 +1,6 @@
1
- require 'spidr/extensions/uri'
2
-
3
- require 'set'
4
- require 'uri'
5
- require 'nokogiri'
1
+ require 'spidr/headers'
2
+ require 'spidr/body'
3
+ require 'spidr/links'
6
4
 
7
5
  module Spidr
8
6
  #
@@ -10,8 +8,9 @@ module Spidr
10
8
  #
11
9
  class Page
12
10
 
13
- # Reserved names used within Cookie strings
14
- RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
11
+ include Headers
12
+ include Body
13
+ include Links
15
14
 
16
15
  # URL of the page
17
16
  attr_reader :url
@@ -39,26 +38,21 @@ module Spidr
39
38
  end
40
39
 
41
40
  #
42
- # The response code from the page.
43
- #
44
- # @return [Integer]
45
- # Response code from the page.
46
- #
47
- def code
48
- @response.code.to_i
49
- end
50
-
41
+ # The meta-redirect links of the page.
51
42
  #
52
- # Determines if the response code is `200`.
43
+ # @return [Array<String>]
44
+ # All meta-redirect links in the page.
53
45
  #
54
- # @return [Boolean]
55
- # Specifies whether the response code is `200`.
46
+ # @deprecated
47
+ # Deprecated in 0.3.0 and will be removed in 0.4.0.
48
+ # Use {#meta_redirects} instead.
56
49
  #
57
- def is_ok?
58
- code == 200
59
- end
50
+ def meta_redirect
51
+ STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
52
+ STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
60
53
 
61
- alias ok? is_ok?
54
+ meta_redirects
55
+ end
62
56
 
63
57
  #
64
58
  # Determines if the response code is `300`, `301`, `302`, `303`
@@ -81,531 +75,33 @@ module Spidr
81
75
 
82
76
  alias redirect? is_redirect?
83
77
 
84
- #
85
- # Determines if the response code is `308`.
86
- #
87
- # @return [Boolean]
88
- # Specifies whether the response code is `308`.
89
- #
90
- def timedout?
91
- code == 308
92
- end
93
-
94
- #
95
- # Determines if the response code is `400`.
96
- #
97
- # @return [Boolean]
98
- # Specifies whether the response code is `400`.
99
- #
100
- def bad_request?
101
- code == 400
102
- end
103
-
104
- #
105
- # Determines if the response code is `401`.
106
- #
107
- # @return [Boolean]
108
- # Specifies whether the response code is `401`.
109
- #
110
- def is_unauthorized?
111
- code == 401
112
- end
113
-
114
- alias unauthorized? is_unauthorized?
115
-
116
- #
117
- # Determines if the response code is `403`.
118
- #
119
- # @return [Boolean]
120
- # Specifies whether the response code is `403`.
121
- #
122
- def is_forbidden?
123
- code == 403
124
- end
125
-
126
- alias forbidden? is_forbidden?
127
-
128
- #
129
- # Determines if the response code is `404`.
130
- #
131
- # @return [Boolean]
132
- # Specifies whether the response code is `404`.
133
- #
134
- def is_missing?
135
- code == 404
136
- end
137
-
138
- alias missing? is_missing?
139
-
140
- #
141
- # Determines if the response code is `500`.
142
- #
143
- # @return [Boolean]
144
- # Specifies whether the response code is `500`.
145
- #
146
- def had_internal_server_error?
147
- code == 500
148
- end
149
-
150
- #
151
- # The Content-Type of the page.
152
- #
153
- # @return [String]
154
- # The Content-Type of the page.
155
- #
156
- def content_type
157
- (@response['Content-Type'] || '')
158
- end
159
-
160
- #
161
- # The content types of the page.
162
- #
163
- # @return [Array<String>]
164
- # The values within the Content-Type header.
165
- #
166
- # @since 0.2.2
167
- #
168
- def content_types
169
- (@headers['content-type'] || [])
170
- end
171
-
172
- #
173
- # Determines if the page is plain-text.
174
- #
175
- # @return [Boolean]
176
- # Specifies whether the page is plain-text.
177
- #
178
- def plain_text?
179
- is_content_type?('text/plain')
180
- end
181
-
182
- alias txt? plain_text?
183
-
184
- #
185
- # Determines if the page is an HTML document.
186
- #
187
- # @return [Boolean]
188
- # Specifies whether the page is an HTML document.
189
- #
190
- def html?
191
- is_content_type?('text/html')
192
- end
193
-
194
- #
195
- # Determines if the page is an XML document.
196
- #
197
- # @return [Boolean]
198
- # Specifies whether the page is an XML document.
199
- #
200
- def xml?
201
- is_content_type?('text/xml')
202
- end
203
-
204
- #
205
- # Determines if the page is an XML Stylesheet (XSL).
206
- #
207
- # @return [Boolean]
208
- # Specifies whether the page is an XML Stylesheet (XSL).
209
- #
210
- def xsl?
211
- is_content_type?('text/xsl')
212
- end
213
-
214
- #
215
- # Determines if the page is JavaScript.
216
- #
217
- # @return [Boolean]
218
- # Specifies whether the page is JavaScript.
219
- #
220
- def javascript?
221
- is_content_type?('text/javascript') || \
222
- is_content_type?('application/javascript')
223
- end
224
-
225
- #
226
- # Determines if the page is a CSS stylesheet.
227
- #
228
- # @return [Boolean]
229
- # Specifies whether the page is a CSS stylesheet.
230
- #
231
- def css?
232
- is_content_type?('text/css')
233
- end
234
-
235
- #
236
- # Determines if the page is a RSS feed.
237
- #
238
- # @return [Boolean]
239
- # Specifies whether the page is a RSS feed.
240
- #
241
- def rss?
242
- is_content_type?('application/rss+xml') || \
243
- is_content_type?('application/rdf+xml')
244
- end
245
-
246
- #
247
- # Determines if the page is an Atom feed.
248
- #
249
- # @return [Boolean]
250
- # Specifies whether the page is an Atom feed.
251
- #
252
- def atom?
253
- is_content_type?('application/atom+xml')
254
- end
255
-
256
- #
257
- # Determines if the page is a MS Word document.
258
- #
259
- # @return [Boolean]
260
- # Specifies whether the page is a MS Word document.
261
- #
262
- def ms_word?
263
- is_content_type?('application/msword')
264
- end
265
-
266
- #
267
- # Determines if the page is a PDF document.
268
- #
269
- # @return [Boolean]
270
- # Specifies whether the page is a PDF document.
271
- #
272
- def pdf?
273
- is_content_type?('application/pdf')
274
- end
275
-
276
- #
277
- # Determines if the page is a ZIP archive.
278
- #
279
- # @return [Boolean]
280
- # Specifies whether the page is a ZIP archive.
281
- #
282
- def zip?
283
- is_content_type?('application/zip')
284
- end
285
-
286
- #
287
- # The raw Cookie String sent along with the page.
288
- #
289
- # @return [String]
290
- # The raw Cookie from the response.
291
- #
292
- # @since 0.2.7
293
- #
294
- def raw_cookie
295
- (@response['Set-Cookie'] || '')
296
- end
297
-
298
- #
299
- # The raw Cookie String sent along with the page.
300
- #
301
- # @return [String]
302
- # The raw Cookie from the response.
303
- #
304
- # @deprecated
305
- # Deprecated in 0.2.7 and will be removed in 0.3.0.
306
- # Use {#raw_cookie} instead.
307
- #
308
- # @since 0.2.2
309
- #
310
- def cookie
311
- STDERR.puts 'DEPRECATION: Spidr::Page#cookie will be removed in 0.3.0'
312
- STDERR.puts 'DEPRECATION: Use Spidr::Page#raw_cookie instead'
313
-
314
- return raw_cookie
315
- end
316
-
317
- #
318
- # The Cookie values sent along with the page.
319
- #
320
- # @return [Array<String>]
321
- # The Cookies from the response.
322
- #
323
- # @since 0.2.2
324
- #
325
- def cookies
326
- (@headers['set-cookie'] || [])
327
- end
328
-
329
- #
330
- # The Cookie key -> value pairs returned with the response.
331
- #
332
- # @return [Hash{String => String}]
333
- # The cookie keys and values.
334
- #
335
- # @since 0.2.2
336
- #
337
- def cookie_params
338
- params = {}
339
-
340
- cookies.each do |cookie|
341
- cookie.split('; ').each do |key_value|
342
- key, value = key_value.split('=',2)
343
-
344
- next if RESERVED_COOKIE_NAMES.include?(key)
345
-
346
- params[key] = (value || '')
347
- end
348
- end
349
-
350
- return params
351
- end
352
-
353
- #
354
- # The body of the response.
355
- #
356
- # @return [String]
357
- # The body of the response.
358
- #
359
- def body
360
- (@response.body || '')
361
- end
362
-
363
- #
364
- # Returns a parsed document object for HTML, XML, RSS and Atom pages.
365
- #
366
- # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
367
- # The document that represents HTML or XML pages.
368
- # Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
369
- # the page could not be parsed properly.
370
- #
371
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
372
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
373
- #
374
- def doc
375
- return nil if body.empty?
376
-
377
- begin
378
- if html?
379
- return @doc ||= Nokogiri::HTML(body)
380
- elsif (xml? || xsl? || rss? || atom?)
381
- return @doc ||= Nokogiri::XML(body)
382
- end
383
- rescue
384
- return nil
385
- end
386
- end
387
-
388
- #
389
- # Searches the document for XPath or CSS Path paths.
390
- #
391
- # @param [Array<String>] paths
392
- # CSS or XPath expressions to search the document with.
393
- #
394
- # @return [Array]
395
- # The matched nodes from the document.
396
- # Returns an empty Array if no nodes were matched, or if the page
397
- # is not an HTML or XML document.
398
- #
399
- # @example
400
- # page.search('//a[@href]')
401
- #
402
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
403
- #
404
- def search(*paths)
405
- if doc
406
- doc.search(*paths)
407
- else
408
- []
409
- end
410
- end
78
+ protected
411
79
 
412
80
  #
413
- # Searches for the first occurrence of an XPath or CSS Path expression.
414
- #
415
- # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
416
- # The first matched node. Returns `nil` if no nodes could be matched,
417
- # or if the page is not a HTML or XML document.
418
- #
419
- # @example
420
- # page.at('//title')
81
+ # Provides transparent access to the values in {#headers}.
421
82
  #
422
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
83
+ # @param [Symbol] name
84
+ # The name of the missing method.
423
85
  #
424
- def at(*arguments)
425
- if doc
426
- doc.at(*arguments)
427
- end
428
- end
429
-
430
- alias / search
431
- alias % at
432
-
433
- #
434
- # The title of the HTML page.
86
+ # @param [Array] arguments
87
+ # Additional arguments for the missing method.
435
88
  #
436
89
  # @return [String]
437
- # The inner-text of the title element of the page.
438
- #
439
- def title
440
- if (node = at('//title'))
441
- node.inner_text
442
- end
443
- end
444
-
445
- #
446
- # The links from within the page.
447
- #
448
- # @return [Array<String>]
449
- # All links within the HTML page, frame/iframe source URLs and any
450
- # links in the `Location` header.
90
+ # The missing method mapped to a header in {#headers}.
451
91
  #
452
- def links
453
- urls = []
454
-
455
- add_url = lambda { |url|
456
- urls << url unless (url.nil? || url.empty?)
457
- }
458
-
459
- self.redirects_to.each(&add_url) if self.is_redirect?
460
-
461
- if (html? && doc)
462
- doc.search('a[@href]').each do |a|
463
- add_url.call(a.get_attribute('href'))
464
- end
465
-
466
- doc.search('frame[@src]').each do |iframe|
467
- add_url.call(iframe.get_attribute('src'))
468
- end
469
-
470
- doc.search('iframe[@src]').each do |iframe|
471
- add_url.call(iframe.get_attribute('src'))
472
- end
473
-
474
- doc.search('link[@href]').each do |link|
475
- add_url.call(link.get_attribute('href'))
476
- end
477
-
478
- doc.search('script[@src]').each do |script|
479
- add_url.call(script.get_attribute('src'))
480
- end
481
- end
482
-
483
- return urls
484
- end
485
-
486
- #
487
- # URL(s) that this document redirects to.
488
- #
489
- # @return [Array<String>]
490
- # The links that this page redirects to (usually found in a
491
- # location header or by way of a page-level meta redirect).
492
- #
493
- def redirects_to
494
- location = @headers['location']
495
-
496
- if location.nil?
497
- # check page-level meta redirects if there isn't a location header
498
- meta_redirect
499
- elsif location.kind_of?(Array)
500
- location
501
- else
502
- # usually the location header contains a single String
503
- [location]
504
- end
505
- end
506
-
507
- #
508
- # Absolute URIs from within the page.
509
- #
510
- # @return [Array<URI::HTTP>]
511
- # The links from within the page, converted to absolute URIs.
512
- #
513
- def urls
514
- links.map { |link| to_absolute(link) }.compact
515
- end
516
-
517
- #
518
- # Normalizes and expands a given link into a proper URI.
519
- #
520
- # @param [String] link
521
- # The link to normalize and expand.
522
- #
523
- # @return [URI::HTTP]
524
- # The normalized URI.
525
- #
526
- def to_absolute(link)
527
- begin
528
- url = @url.merge(link.to_s)
529
- rescue URI::InvalidURIError, URI::InvalidComponentError
530
- return nil
531
- end
532
-
533
- unless (url.path.nil? || url.path.empty?)
534
- # make sure the path does not contain any .. or . directories,
535
- # since URI::Generic#merge cannot normalize paths such as
536
- # "/stuff/../"
537
- url.path = URI.expand_path(url.path)
538
- end
539
-
540
- return url
541
- end
542
-
543
- #
544
- # Determines if a page-level "soft" redirect is present. If yes,
545
- # returns an array of those redirects (usually a single URL).
546
- # Otherwise, returns an empty Array.
547
- #
548
- # @return [Array<String>]
549
- # An array of redirect URLs
92
+ # @raise [NoMethodError]
93
+ # The missing method did not map to a header in {#headers}.
550
94
  #
551
- def meta_redirect
552
- redirects = []
553
-
554
- if (html? && doc)
555
- search('//meta[@http-equiv and @content]').each do |node|
556
- if node.get_attribute('http-equiv') =~ /refresh/i
557
- content = node.get_attribute('content')
95
+ def method_missing(name,*arguments,&block)
96
+ if (arguments.empty? && block.nil?)
97
+ header_name = name.to_s.tr('_','-')
558
98
 
559
- if (redirect = content.match(/url=(\S+)$/))
560
- redirects << redirect[1]
561
- end
562
- end
99
+ if @response.key?(header_name)
100
+ return @response[header_name]
563
101
  end
564
102
  end
565
103
 
566
- return redirects.uniq
567
- end
568
-
569
- #
570
- # Returns a boolean indicating whether or not page-level meta
571
- # redirects are present in this page.
572
- #
573
- # @return [Boolean]
574
- # Specifies whether the page includes page-level redirects.
575
- #
576
- def meta_redirect?
577
- !meta_redirect.empty?
578
- end
579
-
580
- protected
581
-
582
- #
583
- # Determines if any of the content-types of the page include a given
584
- # type.
585
- #
586
- # @param [String] type
587
- # The content-type to test for.
588
- #
589
- # @return [Boolean]
590
- # Specifies whether the page includes the given content-type.
591
- #
592
- # @since 0.2.4
593
- #
594
- def is_content_type?(type)
595
- content_types.any? { |content| content.include?(type) }
596
- end
597
-
598
- #
599
- # Provides transparent access to the values in `headers`.
600
- #
601
- def method_missing(sym,*args,&block)
602
- if (args.empty? && block.nil?)
603
- name = sym.id2name.sub('_','-')
604
-
605
- return @response[name] if @response.key?(name)
606
- end
607
-
608
- return super(sym,*args,&block)
104
+ return super(name,*arguments,&block)
609
105
  end
610
106
 
611
107
  end