spidr 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spidr/page.rb CHANGED
@@ -1,8 +1,6 @@
1
- require 'spidr/extensions/uri'
2
-
3
- require 'set'
4
- require 'uri'
5
- require 'nokogiri'
1
+ require 'spidr/headers'
2
+ require 'spidr/body'
3
+ require 'spidr/links'
6
4
 
7
5
  module Spidr
8
6
  #
@@ -10,8 +8,9 @@ module Spidr
10
8
  #
11
9
  class Page
12
10
 
13
- # Reserved names used within Cookie strings
14
- RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
11
+ include Headers
12
+ include Body
13
+ include Links
15
14
 
16
15
  # URL of the page
17
16
  attr_reader :url
@@ -39,26 +38,21 @@ module Spidr
39
38
  end
40
39
 
41
40
  #
42
- # The response code from the page.
43
- #
44
- # @return [Integer]
45
- # Response code from the page.
46
- #
47
- def code
48
- @response.code.to_i
49
- end
50
-
41
+ # The meta-redirect links of the page.
51
42
  #
52
- # Determines if the response code is `200`.
43
+ # @return [Array<String>]
44
+ # All meta-redirect links in the page.
53
45
  #
54
- # @return [Boolean]
55
- # Specifies whether the response code is `200`.
46
+ # @deprecated
47
+ # Deprecated in 0.3.0 and will be removed in 0.4.0.
48
+ # Use {#meta_redirects} instead.
56
49
  #
57
- def is_ok?
58
- code == 200
59
- end
50
+ def meta_redirect
51
+ STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
52
+ STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
60
53
 
61
- alias ok? is_ok?
54
+ meta_redirects
55
+ end
62
56
 
63
57
  #
64
58
  # Determines if the response code is `300`, `301`, `302`, `303`
@@ -81,531 +75,33 @@ module Spidr
81
75
 
82
76
  alias redirect? is_redirect?
83
77
 
84
- #
85
- # Determines if the response code is `308`.
86
- #
87
- # @return [Boolean]
88
- # Specifies whether the response code is `308`.
89
- #
90
- def timedout?
91
- code == 308
92
- end
93
-
94
- #
95
- # Determines if the response code is `400`.
96
- #
97
- # @return [Boolean]
98
- # Specifies whether the response code is `400`.
99
- #
100
- def bad_request?
101
- code == 400
102
- end
103
-
104
- #
105
- # Determines if the response code is `401`.
106
- #
107
- # @return [Boolean]
108
- # Specifies whether the response code is `401`.
109
- #
110
- def is_unauthorized?
111
- code == 401
112
- end
113
-
114
- alias unauthorized? is_unauthorized?
115
-
116
- #
117
- # Determines if the response code is `403`.
118
- #
119
- # @return [Boolean]
120
- # Specifies whether the response code is `403`.
121
- #
122
- def is_forbidden?
123
- code == 403
124
- end
125
-
126
- alias forbidden? is_forbidden?
127
-
128
- #
129
- # Determines if the response code is `404`.
130
- #
131
- # @return [Boolean]
132
- # Specifies whether the response code is `404`.
133
- #
134
- def is_missing?
135
- code == 404
136
- end
137
-
138
- alias missing? is_missing?
139
-
140
- #
141
- # Determines if the response code is `500`.
142
- #
143
- # @return [Boolean]
144
- # Specifies whether the response code is `500`.
145
- #
146
- def had_internal_server_error?
147
- code == 500
148
- end
149
-
150
- #
151
- # The Content-Type of the page.
152
- #
153
- # @return [String]
154
- # The Content-Type of the page.
155
- #
156
- def content_type
157
- (@response['Content-Type'] || '')
158
- end
159
-
160
- #
161
- # The content types of the page.
162
- #
163
- # @return [Array<String>]
164
- # The values within the Content-Type header.
165
- #
166
- # @since 0.2.2
167
- #
168
- def content_types
169
- (@headers['content-type'] || [])
170
- end
171
-
172
- #
173
- # Determines if the page is plain-text.
174
- #
175
- # @return [Boolean]
176
- # Specifies whether the page is plain-text.
177
- #
178
- def plain_text?
179
- is_content_type?('text/plain')
180
- end
181
-
182
- alias txt? plain_text?
183
-
184
- #
185
- # Determines if the page is HTML document.
186
- #
187
- # @return [Boolean]
188
- # Specifies whether the page is HTML document.
189
- #
190
- def html?
191
- is_content_type?('text/html')
192
- end
193
-
194
- #
195
- # Determines if the page is XML document.
196
- #
197
- # @return [Boolean]
198
- # Specifies whether the page is XML document.
199
- #
200
- def xml?
201
- is_content_type?('text/xml')
202
- end
203
-
204
- #
205
- # Determines if the page is XML Stylesheet (XSL).
206
- #
207
- # @return [Boolean]
208
- # Specifies whether the page is XML Stylesheet (XSL).
209
- #
210
- def xsl?
211
- is_content_type?('text/xsl')
212
- end
213
-
214
- #
215
- # Determines if the page is JavaScript.
216
- #
217
- # @return [Boolean]
218
- # Specifies whether the page is JavaScript.
219
- #
220
- def javascript?
221
- is_content_type?('text/javascript') || \
222
- is_content_type?('application/javascript')
223
- end
224
-
225
- #
226
- # Determines if the page is a CSS stylesheet.
227
- #
228
- # @return [Boolean]
229
- # Specifies whether the page is a CSS stylesheet.
230
- #
231
- def css?
232
- is_content_type?('text/css')
233
- end
234
-
235
- #
236
- # Determines if the page is a RSS feed.
237
- #
238
- # @return [Boolean]
239
- # Specifies whether the page is a RSS feed.
240
- #
241
- def rss?
242
- is_content_type?('application/rss+xml') || \
243
- is_content_type?('application/rdf+xml')
244
- end
245
-
246
- #
247
- # Determines if the page is an Atom feed.
248
- #
249
- # @return [Boolean]
250
- # Specifies whether the page is an Atom feed.
251
- #
252
- def atom?
253
- is_content_type?('application/atom+xml')
254
- end
255
-
256
- #
257
- # Determines if the page is a MS Word document.
258
- #
259
- # @return [Boolean]
260
- # Specifies whether the page is a MS Word document.
261
- #
262
- def ms_word?
263
- is_content_type?('application/msword')
264
- end
265
-
266
- #
267
- # Determines if the page is a PDF document.
268
- #
269
- # @return [Boolean]
270
- # Specifies whether the page is a PDF document.
271
- #
272
- def pdf?
273
- is_content_type?('application/pdf')
274
- end
275
-
276
- #
277
- # Determines if the page is a ZIP archive.
278
- #
279
- # @return [Boolean]
280
- # Specifies whether the page is a ZIP archive.
281
- #
282
- def zip?
283
- is_content_type?('application/zip')
284
- end
285
-
286
- #
287
- # The raw Cookie String sent along with the page.
288
- #
289
- # @return [String]
290
- # The raw Cookie from the response.
291
- #
292
- # @since 0.2.7
293
- #
294
- def raw_cookie
295
- (@response['Set-Cookie'] || '')
296
- end
297
-
298
- #
299
- # The raw Cookie String sent along with the page.
300
- #
301
- # @return [String]
302
- # The raw Cookie from the response.
303
- #
304
- # @deprecated
305
- # Deprecated in 0.2.7 and will be removed in 0.3.0.
306
- # Use {#raw_cookie} instead.
307
- #
308
- # @since 0.2.2
309
- #
310
- def cookie
311
- STDERR.puts 'DEPRECATION: Spidr::Page#cookie will be removed in 0.3.0'
312
- STDERR.puts 'DEPRECATION: Use Spidr::Page#raw_cookie instead'
313
-
314
- return raw_cookie
315
- end
316
-
317
- #
318
- # The Cookie values sent along with the page.
319
- #
320
- # @return [Array<String>]
321
- # The Cookies from the response.
322
- #
323
- # @since 0.2.2
324
- #
325
- def cookies
326
- (@headers['set-cookie'] || [])
327
- end
328
-
329
- #
330
- # The Cookie key -> value pairs returned with the response.
331
- #
332
- # @return [Hash{String => String}]
333
- # The cookie keys and values.
334
- #
335
- # @since 0.2.2
336
- #
337
- def cookie_params
338
- params = {}
339
-
340
- cookies.each do |cookie|
341
- cookie.split('; ').each do |key_value|
342
- key, value = key_value.split('=',2)
343
-
344
- next if RESERVED_COOKIE_NAMES.include?(key)
345
-
346
- params[key] = (value || '')
347
- end
348
- end
349
-
350
- return params
351
- end
352
-
353
- #
354
- # The body of the response.
355
- #
356
- # @return [String]
357
- # The body of the response.
358
- #
359
- def body
360
- (@response.body || '')
361
- end
362
-
363
- #
364
- # Returns a parsed document object for HTML, XML, RSS and Atom pages.
365
- #
366
- # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
367
- # The document that represents HTML or XML pages.
368
- # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
369
- # the page could not be parsed properly.
370
- #
371
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
372
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
373
- #
374
- def doc
375
- return nil if body.empty?
376
-
377
- begin
378
- if html?
379
- return @doc ||= Nokogiri::HTML(body)
380
- elsif (xml? || xsl? || rss? || atom?)
381
- return @doc ||= Nokogiri::XML(body)
382
- end
383
- rescue
384
- return nil
385
- end
386
- end
387
-
388
- #
389
- # Searches the document for XPath or CSS Path paths.
390
- #
391
- # @param [Array<String>] paths
392
- # CSS or XPath expressions to search the document with.
393
- #
394
- # @return [Array]
395
- # The matched nodes from the document.
396
- # Returns an empty Array if no nodes were matched, or if the page
397
- # is not an HTML or XML document.
398
- #
399
- # @example
400
- # page.search('//a[@href]')
401
- #
402
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
403
- #
404
- def search(*paths)
405
- if doc
406
- doc.search(*paths)
407
- else
408
- []
409
- end
410
- end
78
+ protected
411
79
 
412
80
  #
413
- # Searches for the first occurrence an XPath or CSS Path expression.
414
- #
415
- # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
416
- # The first matched node. Returns `nil` if no nodes could be matched,
417
- # or if the page is not a HTML or XML document.
418
- #
419
- # @example
420
- # page.at('//title')
81
+ # Provides transparent access to the values in {#headers}.
421
82
  #
422
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
83
+ # @param [Symbol] name
84
+ # The name of the missing method.
423
85
  #
424
- def at(*arguments)
425
- if doc
426
- doc.at(*arguments)
427
- end
428
- end
429
-
430
- alias / search
431
- alias % at
432
-
433
- #
434
- # The title of the HTML page.
86
+ # @param [Array] arguments
87
+ # Additional arguments for the missing method.
435
88
  #
436
89
  # @return [String]
437
- # The inner-text of the title element of the page.
438
- #
439
- def title
440
- if (node = at('//title'))
441
- node.inner_text
442
- end
443
- end
444
-
445
- #
446
- # The links from within the page.
447
- #
448
- # @return [Array<String>]
449
- # All links within the HTML page, frame/iframe source URLs and any
450
- # links in the `Location` header.
90
+ # The missing method mapped to a header in {#headers}.
451
91
  #
452
- def links
453
- urls = []
454
-
455
- add_url = lambda { |url|
456
- urls << url unless (url.nil? || url.empty?)
457
- }
458
-
459
- self.redirects_to.each(&add_url) if self.is_redirect?
460
-
461
- if (html? && doc)
462
- doc.search('a[@href]').each do |a|
463
- add_url.call(a.get_attribute('href'))
464
- end
465
-
466
- doc.search('frame[@src]').each do |iframe|
467
- add_url.call(iframe.get_attribute('src'))
468
- end
469
-
470
- doc.search('iframe[@src]').each do |iframe|
471
- add_url.call(iframe.get_attribute('src'))
472
- end
473
-
474
- doc.search('link[@href]').each do |link|
475
- add_url.call(link.get_attribute('href'))
476
- end
477
-
478
- doc.search('script[@src]').each do |script|
479
- add_url.call(script.get_attribute('src'))
480
- end
481
- end
482
-
483
- return urls
484
- end
485
-
486
- #
487
- # URL(s) that this document redirects to.
488
- #
489
- # @return [Array<String>]
490
- # The links that this page redirects to (usually found in a
491
- # location header or by way of a page-level meta redirect).
492
- #
493
- def redirects_to
494
- location = @headers['location']
495
-
496
- if location.nil?
497
- # check page-level meta redirects if there isn't a location header
498
- meta_redirect
499
- elsif location.kind_of?(Array)
500
- location
501
- else
502
- # usually the location header contains a single String
503
- [location]
504
- end
505
- end
506
-
507
- #
508
- # Absolute URIs from within the page.
509
- #
510
- # @return [Array<URI::HTTP>]
511
- # The links from within the page, converted to absolute URIs.
512
- #
513
- def urls
514
- links.map { |link| to_absolute(link) }.compact
515
- end
516
-
517
- #
518
- # Normalizes and expands a given link into a proper URI.
519
- #
520
- # @param [String] link
521
- # The link to normalize and expand.
522
- #
523
- # @return [URI::HTTP]
524
- # The normalized URI.
525
- #
526
- def to_absolute(link)
527
- begin
528
- url = @url.merge(link.to_s)
529
- rescue URI::InvalidURIError, URI::InvalidComponentError
530
- return nil
531
- end
532
-
533
- unless (url.path.nil? || url.path.empty?)
534
- # make sure the path does not contain any .. or . directories,
535
- # since URI::Generic#merge cannot normalize paths such as
536
- # "/stuff/../"
537
- url.path = URI.expand_path(url.path)
538
- end
539
-
540
- return url
541
- end
542
-
543
- #
544
- # Determines if a page-level "soft" redirect is present. If yes,
545
- # returns an array of those redirects (usually a single URL).
546
- # Otherwise, returns false.
547
- #
548
- # @return [Array<String>]
549
- # An array of redirect URLs
92
+ # @raise [NoMethodError]
93
+ # The missing method did not map to a header in {#headers}.
550
94
  #
551
- def meta_redirect
552
- redirects = []
553
-
554
- if (html? && doc)
555
- search('//meta[@http-equiv and @content]').each do |node|
556
- if node.get_attribute('http-equiv') =~ /refresh/i
557
- content = node.get_attribute('content')
95
+ def method_missing(name,*arguments,&block)
96
+ if (arguments.empty? && block.nil?)
97
+ header_name = name.to_s.sub('_','-')
558
98
 
559
- if (redirect = content.match(/url=(\S+)$/))
560
- redirects << redirect[1]
561
- end
562
- end
99
+ if @response.key?(header_name)
100
+ return @response[header_name]
563
101
  end
564
102
  end
565
103
 
566
- return redirects.uniq
567
- end
568
-
569
- #
570
- # Returns a boolean indicating whether or not page-level meta
571
- # redirects are present in this page.
572
- #
573
- # @return [Boolean]
574
- # Specifies whether the page includes page-level redirects.
575
- #
576
- def meta_redirect?
577
- !meta_redirect.empty?
578
- end
579
-
580
- protected
581
-
582
- #
583
- # Determines if any of the content-types of the page include a given
584
- # type.
585
- #
586
- # @param [String] type
587
- # The content-type to test for.
588
- #
589
- # @return [Boolean]
590
- # Specifies whether the page includes the given content-type.
591
- #
592
- # @since 0.2.4
593
- #
594
- def is_content_type?(type)
595
- content_types.any? { |content| content.include?(type) }
596
- end
597
-
598
- #
599
- # Provides transparent access to the values in `headers`.
600
- #
601
- def method_missing(sym,*args,&block)
602
- if (args.empty? && block.nil?)
603
- name = sym.id2name.sub('_','-')
604
-
605
- return @response[name] if @response.key?(name)
606
- end
607
-
608
- return super(sym,*args,&block)
104
+ return super(name,*arguments,&block)
609
105
  end
610
106
 
611
107
  end