spidr 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -3,6 +3,7 @@ doc
3
3
  web
4
4
  tmp
5
5
  .DS_Store
6
+ .bundle
6
7
  .yardoc
7
8
  *.swp
8
9
  *~
data/ChangeLog.md CHANGED
@@ -1,3 +1,13 @@
1
+ ### 0.2.5 / 2010-07-02
2
+
3
+ * Added {Spidr::Page#meta_redirect}.
4
+ * Added {Spidr::Page#meta_redirect?}.
5
+ * Manage development dependencies with Bundler.
6
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
7
+ * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
8
+ * Fixed a constant lookup issue in {Spidr::Agent}.
9
+ * Use `yield` instead of `block.call` when necessary.
10
+
1
11
  ### 0.2.4 / 2010-05-05
2
12
 
3
13
  * Added {Spidr::Filters#visit_urls}.
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
+ source 'https://rubygems.org'
2
+
3
+ group(:runtime) do
4
+ gem 'nokogiri', '>= 1.3.0'
5
+ end
6
+
7
+ group(:development) do
8
+ gem 'rake', '~> 0.8.7'
9
+ gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
10
+ end
11
+
12
+ group(:doc) do
13
+ case RUBY_PLATFORM
14
+ when 'java'
15
+ gem 'maruku', '~> 0.6.0'
16
+ else
17
+ gem 'rdiscount', '~> 1.6.3'
18
+ end
19
+
20
+ gem 'yard', '~> 0.5.3'
21
+ end
22
+
23
+ group(:test) do
24
+ gem 'wsoc', '~> 0.1.3'
25
+ end
26
+
27
+ gem 'rspec', '~> 1.3.0', :group => [:development, :test]
data/README.md CHANGED
@@ -20,6 +20,7 @@ and easy to use.
20
20
  * frame tags.
21
21
  * Cookie protected links.
22
22
  * HTTP 300, 301, 302, 303 and 307 Redirects.
23
+ * Meta-Refresh Redirects.
23
24
  * HTTP Basic Auth protected links.
24
25
  * Black-list or white-list URLs based upon:
25
26
  * URL scheme.
data/Rakefile CHANGED
@@ -1,27 +1,28 @@
1
1
  require 'rubygems'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:development, :doc)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
11
+
2
12
  require 'rake'
13
+ require 'jeweler'
3
14
  require './lib/spidr/version.rb'
4
15
 
5
- begin
6
- require 'jeweler'
7
- Jeweler::Tasks.new do |gem|
8
- gem.name = 'spidr'
9
- gem.version = Spidr::VERSION
10
- gem.license = 'MIT'
11
- gem.summary = %Q{A versatile Ruby web spidering library}
12
- gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
13
- gem.email = 'postmodern.mod3@gmail.com'
14
- gem.homepage = 'http://github.com/postmodern/spidr'
15
- gem.authors = ['Postmodern']
16
- gem.add_dependency 'nokogiri', '>= 1.3.0'
17
- gem.add_development_dependency 'rspec', '~> 1.3.0'
18
- gem.add_development_dependency 'yard', '~> 0.5.3'
19
- gem.add_development_dependency 'wsoc', '~> 0.1.1'
20
- gem.has_rdoc = 'yard'
21
- end
22
- Jeweler::GemcutterTasks.new
23
- rescue LoadError
24
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
16
+ Jeweler::Tasks.new do |gem|
17
+ gem.name = 'spidr'
18
+ gem.version = Spidr::VERSION
19
+ gem.license = 'MIT'
20
+ gem.summary = %Q{A versatile Ruby web spidering library}
21
+ gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
22
+ gem.email = 'postmodern.mod3@gmail.com'
23
+ gem.homepage = 'http://github.com/postmodern/spidr'
24
+ gem.authors = ['Postmodern']
25
+ gem.has_rdoc = 'yard'
25
26
  end
26
27
 
27
28
  require 'spec/rake/spectask'
@@ -31,15 +32,7 @@ Spec::Rake::SpecTask.new(:spec) do |spec|
31
32
  spec.spec_opts = ['--options', '.specopts']
32
33
  end
33
34
 
34
- task :spec => :check_dependencies
35
35
  task :default => :spec
36
36
 
37
- begin
38
- require 'yard'
39
-
40
- YARD::Rake::YardocTask.new
41
- rescue LoadError
42
- task :yard do
43
- abort "YARD is not available. In order to run yard, you must: gem install yard"
44
- end
45
- end
37
+ require 'yard'
38
+ YARD::Rake::YardocTask.new
data/lib/spidr/agent.rb CHANGED
@@ -98,7 +98,7 @@ module Spidr
98
98
  # @yieldparam [Agent] agent
99
99
  # The newly created agent.
100
100
  #
101
- def initialize(options={},&block)
101
+ def initialize(options={})
102
102
  @host_header = options[:host_header]
103
103
  @host_headers = {}
104
104
 
@@ -121,7 +121,7 @@ module Spidr
121
121
 
122
122
  super(options)
123
123
 
124
- block.call(self) if block
124
+ yield self if block_given?
125
125
  end
126
126
 
127
127
  #
@@ -140,9 +140,9 @@ module Spidr
140
140
  # @yieldparam [Agent] agent
141
141
  # The newly created agent.
142
142
  #
143
- def self.start_at(url,options={},&block)
143
+ def self.start_at(url,options={})
144
144
  self.new(options) do |spider|
145
- block.call(spider) if block
145
+ yield spider if block_given?
146
146
 
147
147
  spider.start_at(url)
148
148
  end
@@ -164,9 +164,9 @@ module Spidr
164
164
  # @yieldparam [Agent] agent
165
165
  # The newly created agent.
166
166
  #
167
- def self.host(name,options={},&block)
167
+ def self.host(name,options={})
168
168
  self.new(options.merge(:host => name)) do |spider|
169
- block.call(spider) if block
169
+ yield spider if block_given?
170
170
 
171
171
  spider.start_at("http://#{name}/")
172
172
  end
@@ -188,11 +188,11 @@ module Spidr
188
188
  # @yieldparam [Agent] agent
189
189
  # The newly created agent.
190
190
  #
191
- def self.site(url,options={},&block)
191
+ def self.site(url,options={})
192
192
  url = URI(url.to_s)
193
193
 
194
194
  return self.new(options.merge(:host => url.host)) do |spider|
195
- block.call(spider) if block
195
+ yield spider if block_given?
196
196
 
197
197
  spider.start_at(url)
198
198
  end
@@ -457,11 +457,18 @@ module Spidr
457
457
  link = url.to_s
458
458
 
459
459
  begin
460
- @every_url_blocks.each { |block| block.call(url) }
460
+ @every_url_blocks.each { |url_block| url_block.call(url) }
461
461
 
462
- @urls_like_blocks.each do |pattern,blocks|
463
- if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
464
- blocks.each { |url_block| url_block.call(url) }
462
+ @urls_like_blocks.each do |pattern,url_blocks|
463
+ match = case pattern
464
+ when Regexp
465
+ link =~ pattern
466
+ else
467
+ (pattern == link) || (pattern == url)
468
+ end
469
+
470
+ if match
471
+ url_blocks.each { |url_block| url_block.call(url) }
465
472
  end
466
473
  end
467
474
  rescue Actions::Paused => action
@@ -494,7 +501,7 @@ module Spidr
494
501
  # @return [Page, nil]
495
502
  # The page for the response, or `nil` if the request failed.
496
503
  #
497
- def get_page(url,&block)
504
+ def get_page(url)
498
505
  url = URI(url.to_s)
499
506
 
500
507
  prepare_request(url) do |session,path,headers|
@@ -503,7 +510,7 @@ module Spidr
503
510
  # save any new cookies
504
511
  @cookies.from_page(new_page)
505
512
 
506
- block.call(new_page) if block
513
+ yield new_page if block_given?
507
514
  return new_page
508
515
  end
509
516
  end
@@ -529,7 +536,7 @@ module Spidr
529
536
  #
530
537
  # @since 0.2.2
531
538
  #
532
- def post_page(url,post_data='',&block)
539
+ def post_page(url,post_data='')
533
540
  url = URI(url.to_s)
534
541
 
535
542
  prepare_request(url) do |session,path,headers|
@@ -538,7 +545,7 @@ module Spidr
538
545
  # save any new cookies
539
546
  @cookies.from_page(new_page)
540
547
 
541
- block.call(new_page) if block
548
+ yield new_page if block_given?
542
549
  return new_page
543
550
  end
544
551
  end
@@ -560,7 +567,7 @@ module Spidr
560
567
  # The page that was visited. If `nil` is returned, either the request
561
568
  # for the page failed, or the page was skipped.
562
569
  #
563
- def visit_page(url,&block)
570
+ def visit_page(url)
564
571
  url = URI(url.to_s) unless url.kind_of?(URI)
565
572
 
566
573
  get_page(url) do |page|
@@ -569,7 +576,7 @@ module Spidr
569
576
  begin
570
577
  @every_page_blocks.each { |page_block| page_block.call(page) }
571
578
 
572
- block.call(page) if block
579
+ yield page if block_given?
573
580
  rescue Actions::Paused => action
574
581
  raise(action)
575
582
  rescue Actions::SkipPage
@@ -668,7 +675,7 @@ module Spidr
668
675
  begin
669
676
  sleep(@delay) if @delay > 0
670
677
 
671
- block.call(@sessions[url],path,headers)
678
+ yield @sessions[url], path, headers
672
679
  rescue SystemCallError,
673
680
  Timeout::Error,
674
681
  SocketError,
@@ -719,7 +726,7 @@ module Spidr
719
726
  #
720
727
  def failed(url)
721
728
  @failures << url
722
- @every_failed_url_blocks.each { |block| block.call(url) }
729
+ @every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
723
730
  return true
724
731
  end
725
732
 
@@ -130,6 +130,22 @@ module Spidr
130
130
  @dirty.delete(host)
131
131
  end
132
132
 
133
+ hdomain = host.split('.')
134
+
135
+ if hdomain.length > 2
136
+ parent_cookies = for_host(hdomain[1..-1].join('.'))
137
+
138
+ unless (parent_cookies.nil? || parent_cookies.empty?)
139
+ @cookies[host] = if @cookies[host].nil?
140
+ # inherit the parent cookies
141
+ parent_cookies
142
+ else
143
+ # merge the parent cookies with any host-specific cookies
144
+ "#{parent_cookies}; #{@cookies[host]}"
145
+ end
146
+ end
147
+ end
148
+
133
149
  return @cookies[host]
134
150
  end
135
151
 
data/lib/spidr/events.rb CHANGED
@@ -72,8 +72,8 @@ module Spidr
72
72
  # @yieldparam [Hash] headers
73
73
  # The headers from a response.
74
74
  #
75
- def all_headers(&block)
76
- every_page { |page| block.call(page.headers) }
75
+ def all_headers
76
+ every_page { |page| yield page.headers }
77
77
  end
78
78
 
79
79
  #
@@ -99,9 +99,9 @@ module Spidr
99
99
  # @yieldparam [Page] page
100
100
  # A visited page.
101
101
  #
102
- def every_ok_page(&block)
102
+ def every_ok_page
103
103
  every_page do |page|
104
- block.call(page) if (block && page.ok?)
104
+ yield page if (block_given? && page.ok?)
105
105
  end
106
106
  end
107
107
 
@@ -114,9 +114,9 @@ module Spidr
114
114
  # @yieldparam [Page] page
115
115
  # A visited page.
116
116
  #
117
- def every_redirect_page(&block)
117
+ def every_redirect_page
118
118
  every_page do |page|
119
- block.call(page) if (block && page.redirect?)
119
+ yield page if (block_given? && page.redirect?)
120
120
  end
121
121
  end
122
122
 
@@ -129,9 +129,9 @@ module Spidr
129
129
  # @yieldparam [Page] page
130
130
  # A visited page.
131
131
  #
132
- def every_timedout_page(&block)
132
+ def every_timedout_page
133
133
  every_page do |page|
134
- block.call(page) if (block && page.timedout?)
134
+ yield page if (block_given? && page.timedout?)
135
135
  end
136
136
  end
137
137
 
@@ -144,9 +144,9 @@ module Spidr
144
144
  # @yieldparam [Page] page
145
145
  # A visited page.
146
146
  #
147
- def every_bad_request_page(&block)
147
+ def every_bad_request_page
148
148
  every_page do |page|
149
- block.call(page) if (block && page.bad_request?)
149
+ yield page if (block_given? && page.bad_request?)
150
150
  end
151
151
  end
152
152
 
@@ -159,9 +159,9 @@ module Spidr
159
159
  # @yieldparam [Page] page
160
160
  # A visited page.
161
161
  #
162
- def every_unauthorized_page(&block)
162
+ def every_unauthorized_page
163
163
  every_page do |page|
164
- block.call(page) if (block && page.unauthorized?)
164
+ yield page if (block_given? && page.unauthorized?)
165
165
  end
166
166
  end
167
167
 
@@ -174,9 +174,9 @@ module Spidr
174
174
  # @yieldparam [Page] page
175
175
  # A visited page.
176
176
  #
177
- def every_forbidden_page(&block)
177
+ def every_forbidden_page
178
178
  every_page do |page|
179
- block.call(page) if (block && page.forbidden?)
179
+ yield page if (block_given? && page.forbidden?)
180
180
  end
181
181
  end
182
182
 
@@ -189,9 +189,9 @@ module Spidr
189
189
  # @yieldparam [Page] page
190
190
  # A visited page.
191
191
  #
192
- def every_missing_page(&block)
192
+ def every_missing_page
193
193
  every_page do |page|
194
- block.call(page) if (block && page.missing?)
194
+ yield page if (block_given? && page.missing?)
195
195
  end
196
196
  end
197
197
 
@@ -205,9 +205,9 @@ module Spidr
205
205
  # @yieldparam [Page] page
206
206
  # A visited page.
207
207
  #
208
- def every_internal_server_error_page(&block)
208
+ def every_internal_server_error_page
209
209
  every_page do |page|
210
- block.call(page) if (block && page.had_internal_server_error?)
210
+ yield page if (block_given? && page.had_internal_server_error?)
211
211
  end
212
212
  end
213
213
 
@@ -220,9 +220,9 @@ module Spidr
220
220
  # @yieldparam [Page] page
221
221
  # A visited page.
222
222
  #
223
- def every_txt_page(&block)
223
+ def every_txt_page
224
224
  every_page do |page|
225
- block.call(page) if (block && page.txt?)
225
+ yield page if (block_given? && page.txt?)
226
226
  end
227
227
  end
228
228
 
@@ -235,9 +235,9 @@ module Spidr
235
235
  # @yieldparam [Page] page
236
236
  # A visited page.
237
237
  #
238
- def every_html_page(&block)
238
+ def every_html_page
239
239
  every_page do |page|
240
- block.call(page) if (block && page.html?)
240
+ yield page if (block_given? && page.html?)
241
241
  end
242
242
  end
243
243
 
@@ -250,9 +250,9 @@ module Spidr
250
250
  # @yieldparam [Page] page
251
251
  # A visited page.
252
252
  #
253
- def every_xml_page(&block)
253
+ def every_xml_page
254
254
  every_page do |page|
255
- block.call(page) if (block && page.xml?)
255
+ yield page if (block_given? && page.xml?)
256
256
  end
257
257
  end
258
258
 
@@ -266,9 +266,9 @@ module Spidr
266
266
  # @yieldparam [Page] page
267
267
  # A visited page.
268
268
  #
269
- def every_xsl_page(&block)
269
+ def every_xsl_page
270
270
  every_page do |page|
271
- block.call(page) if (block && page.xsl?)
271
+ yield page if (block_given? && page.xsl?)
272
272
  end
273
273
  end
274
274
 
@@ -285,11 +285,11 @@ module Spidr
285
285
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
286
286
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
287
287
  #
288
- def every_doc(&block)
288
+ def every_doc
289
289
  every_page do |page|
290
- if block
290
+ if block_given?
291
291
  if (doc = page.doc)
292
- block.call(doc)
292
+ yield doc
293
293
  end
294
294
  end
295
295
  end
@@ -306,11 +306,11 @@ module Spidr
306
306
  #
307
307
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
308
308
  #
309
- def every_html_doc(&block)
309
+ def every_html_doc
310
310
  every_page do |page|
311
- if (block && page.html?)
311
+ if (block_given? && page.html?)
312
312
  if (doc = page.doc)
313
- block.call(doc)
313
+ yield doc
314
314
  end
315
315
  end
316
316
  end
@@ -327,11 +327,11 @@ module Spidr
327
327
  #
328
328
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
329
329
  #
330
- def every_xml_doc(&block)
330
+ def every_xml_doc
331
331
  every_page do |page|
332
- if (block && page.xml?)
332
+ if (block_given? && page.xml?)
333
333
  if (doc = page.doc)
334
- block.call(doc)
334
+ yield doc
335
335
  end
336
336
  end
337
337
  end
@@ -349,11 +349,11 @@ module Spidr
349
349
  #
350
350
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
351
351
  #
352
- def every_xsl_doc(&block)
352
+ def every_xsl_doc
353
353
  every_page do |page|
354
- if (block && page.xsl?)
354
+ if (block_given? && page.xsl?)
355
355
  if (doc = page.doc)
356
- block.call(doc)
356
+ yield doc
357
357
  end
358
358
  end
359
359
  end
@@ -370,11 +370,11 @@ module Spidr
370
370
  #
371
371
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
372
372
  #
373
- def every_rss_doc(&block)
373
+ def every_rss_doc
374
374
  every_page do |page|
375
- if (block && page.rss?)
375
+ if (block_given? && page.rss?)
376
376
  if (doc = page.doc)
377
- block.call(doc)
377
+ yield doc
378
378
  end
379
379
  end
380
380
  end
@@ -391,11 +391,11 @@ module Spidr
391
391
  #
392
392
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
393
393
  #
394
- def every_atom_doc(&block)
394
+ def every_atom_doc
395
395
  every_page do |page|
396
- if (block && page.atom?)
396
+ if (block_given? && page.atom?)
397
397
  if (doc = page.doc)
398
- block.call(doc)
398
+ yield doc
399
399
  end
400
400
  end
401
401
  end
@@ -410,9 +410,9 @@ module Spidr
410
410
  # @yieldparam [Page] page
411
411
  # A visited page.
412
412
  #
413
- def every_javascript_page(&block)
413
+ def every_javascript_page
414
414
  every_page do |page|
415
- block.call(page) if (block && page.javascript?)
415
+ yield page if (block_given? && page.javascript?)
416
416
  end
417
417
  end
418
418
 
@@ -425,9 +425,9 @@ module Spidr
425
425
  # @yieldparam [Page] page
426
426
  # A visited page.
427
427
  #
428
- def every_css_page(&block)
428
+ def every_css_page
429
429
  every_page do |page|
430
- block.call(page) if (block && page.css?)
430
+ yield page if (block_given? && page.css?)
431
431
  end
432
432
  end
433
433
 
@@ -440,9 +440,9 @@ module Spidr
440
440
  # @yieldparam [Page] feed
441
441
  # A visited page.
442
442
  #
443
- def every_rss_page(&block)
443
+ def every_rss_page
444
444
  every_page do |page|
445
- block.call(page) if (block && page.rss?)
445
+ yield page if (block_given? && page.rss?)
446
446
  end
447
447
  end
448
448
 
@@ -455,9 +455,9 @@ module Spidr
455
455
  # @yieldparam [Page] feed
456
456
  # A visited page.
457
457
  #
458
- def every_atom_page(&block)
458
+ def every_atom_page
459
459
  every_page do |page|
460
- block.call(page) if (block && page.atom?)
460
+ yield page if (block_given? && page.atom?)
461
461
  end
462
462
  end
463
463
 
@@ -470,9 +470,9 @@ module Spidr
470
470
  # @yieldparam [Page] page
471
471
  # A visited page.
472
472
  #
473
- def every_ms_word_page(&block)
473
+ def every_ms_word_page
474
474
  every_page do |page|
475
- block.call(page) if (block && page.ms_word?)
475
+ yield page if (block_given? && page.ms_word?)
476
476
  end
477
477
  end
478
478
 
@@ -485,9 +485,9 @@ module Spidr
485
485
  # @yieldparam [Page] page
486
486
  # A visited page.
487
487
  #
488
- def every_pdf_page(&block)
488
+ def every_pdf_page
489
489
  every_page do |page|
490
- block.call(page) if (block && page.pdf?)
490
+ yield page if (block_given? && page.pdf?)
491
491
  end
492
492
  end
493
493
 
@@ -500,9 +500,9 @@ module Spidr
500
500
  # @yieldparam [Page] page
501
501
  # A visited page.
502
502
  #
503
- def every_zip_page(&block)
503
+ def every_zip_page
504
504
  every_page do |page|
505
- block.call(page) if (block && page.zip?)
505
+ yield page if (block_given? && page.zip?)
506
506
  end
507
507
  end
508
508
 
data/lib/spidr/page.rb CHANGED
@@ -62,7 +62,8 @@ module Spidr
62
62
 
63
63
  #
64
64
  # Determines if the response code is `300`, `301`, `302`, `303`
65
- # or `307`.
65
+ # or `307`. Also checks for "soft" redirects added at the page
66
+ # level by a meta refresh tag.
66
67
  #
67
68
  # @return [Boolean]
68
69
  # Specifies whether the response code is a HTTP Redirect code.
@@ -71,6 +72,8 @@ module Spidr
71
72
  case code
72
73
  when 300..303, 307
73
74
  true
75
+ when 200
76
+ meta_redirect?
74
77
  else
75
78
  false
76
79
  end
@@ -434,17 +437,7 @@ module Spidr
434
437
  urls << url unless (url.nil? || url.empty?)
435
438
  }
436
439
 
437
- if self.is_redirect?
438
- location = @headers['location']
439
-
440
- if location.kind_of?(Array)
441
- # handle multiple location URLs
442
- location.each(&add_url)
443
- else
444
- # usually the location header contains a single String
445
- add_url.call(location)
446
- end
447
- end
440
+ self.redirects_to.each(&add_url) if self.is_redirect?
448
441
 
449
442
  if (html? && doc)
450
443
  doc.search('a[@href]').each do |a|
@@ -471,6 +464,27 @@ module Spidr
471
464
  return urls
472
465
  end
473
466
 
467
+ #
468
+ # URL(s) that this document redirects to.
469
+ #
470
+ # @return [Array<String>]
471
+ # The links that this page redirects to (usually found in a
472
+ # location header or by way of a page-level meta redirect).
473
+ #
474
+ def redirects_to
475
+ location = @headers['location']
476
+
477
+ if location.nil?
478
+ # check page-level meta redirects if there isn't a location header
479
+ meta_redirect
480
+ elsif location.kind_of?(Array)
481
+ location
482
+ else
483
+ # usually the location header contains a single String
484
+ [location]
485
+ end
486
+ end
487
+
474
488
  #
475
489
  # Absolute URIs from within the page.
476
490
  #
@@ -507,6 +521,43 @@ module Spidr
507
521
  return url
508
522
  end
509
523
 
524
+ #
525
+ # Determines if a page-level "soft" redirect is present. If yes,
526
+ # returns an array of those redirects (usually a single URL).
527
+ # Otherwise, returns an empty array.
528
+ #
529
+ # @return [Array<String>]
530
+ # An array of redirect URLs
531
+ #
532
+ def meta_redirect
533
+ redirects = []
534
+
535
+ if (html? && doc)
536
+ search('//meta[@http-equiv and @content]').each do |node|
537
+ if node.attr('http-equiv') =~ /refresh/i
538
+ content = node.attr('content')
539
+
540
+ if (redirect = content.match(/url=(\S+)$/))
541
+ redirects << redirect[1]
542
+ end
543
+ end
544
+ end
545
+ end
546
+
547
+ return redirects.uniq
548
+ end
549
+
550
+ #
551
+ # Returns a boolean indicating whether or not page-level meta
552
+ # redirects are present in this page.
553
+ #
554
+ # @return [Boolean]
555
+ # Specifies whether the page includes page-level redirects.
556
+ #
557
+ def meta_redirect?
558
+ !meta_redirect.empty?
559
+ end
560
+
510
561
  protected
511
562
 
512
563
  #
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.4'
3
+ VERSION = '0.2.5'
4
4
  end
@@ -101,8 +101,21 @@ describe CookieJar do
101
101
  it "should encode multiple cookie params" do
102
102
  @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
103
103
  @cookie_jar['zerosum.org'] = {'other' => '1'}
104
+ cookie = @cookie_jar.for_host('zerosum.org')
104
105
 
105
- @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
106
+ cookie.should include('admin=ofcourseiam')
107
+ cookie.should include('; ')
108
+ cookie.should include('other=1')
109
+ end
110
+
111
+ it "should include cookies for the parent domain" do
112
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
113
+ @cookie_jar['sub.zerosum.org'] = {'other' => '1'}
114
+ cookie = @cookie_jar.for_host('sub.zerosum.org')
115
+
116
+ cookie.should include('admin=ofcourseiam')
117
+ cookie.should include('; ')
118
+ cookie.should include('other=1')
106
119
  end
107
120
  end
108
121
  end
data/spec/helpers/wsoc.rb CHANGED
@@ -66,7 +66,7 @@ module Helpers
66
66
  end
67
67
 
68
68
  def run_course
69
- Agent.start_at(COURSE_URL) do |agent|
69
+ Spidr::Agent.start_at(COURSE_URL) do |agent|
70
70
  course_auth_store.each do |path,auth|
71
71
  agent.authorized.add(
72
72
  COURSE_URL.merge(path),
data/spec/page_spec.rb CHANGED
@@ -79,6 +79,21 @@ describe Page do
79
79
  end
80
80
  end
81
81
 
82
+ describe "redirects" do
83
+ before(:all) do
84
+ @page = get_page('http://spidr.rubyforge.org/course/start.html')
85
+ @page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
86
+ end
87
+
88
+ it "should provide access to page-level redirects" do
89
+ @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
90
+ end
91
+
92
+ it "should include meta refresh redirects in the list of links" do
93
+ @page.links.should include('http://spidr.rubyforge.org/redirected')
94
+ end
95
+ end
96
+
82
97
  describe "cookies" do
83
98
  before(:all) do
84
99
  @page = get_page('http://twitter.com/login')
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,15 @@
1
1
  require 'rubygems'
2
- gem 'rspec', '>=1.2.8'
3
- require 'spec'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:runtime, :test)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
4
11
 
12
+ require 'spec'
5
13
  require 'spidr/version'
6
14
 
7
15
  include Spidr
data/spidr.gemspec CHANGED
@@ -5,112 +5,112 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{spidr}
8
- s.version = "0.2.4"
8
+ s.version = "0.2.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Postmodern"]
12
- s.date = %q{2010-05-05}
12
+ s.date = %q{2010-07-02}
13
13
  s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
14
14
  s.email = %q{postmodern.mod3@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "ChangeLog.md",
17
- "LICENSE.txt",
18
- "README.md"
17
+ "LICENSE.txt",
18
+ "README.md"
19
19
  ]
20
20
  s.files = [
21
21
  ".gitignore",
22
- ".specopts",
23
- ".yardopts",
24
- "ChangeLog.md",
25
- "LICENSE.txt",
26
- "README.md",
27
- "Rakefile",
28
- "lib/spidr.rb",
29
- "lib/spidr/actions.rb",
30
- "lib/spidr/actions/actions.rb",
31
- "lib/spidr/actions/exceptions.rb",
32
- "lib/spidr/actions/exceptions/action.rb",
33
- "lib/spidr/actions/exceptions/paused.rb",
34
- "lib/spidr/actions/exceptions/skip_link.rb",
35
- "lib/spidr/actions/exceptions/skip_page.rb",
36
- "lib/spidr/agent.rb",
37
- "lib/spidr/auth_credential.rb",
38
- "lib/spidr/auth_store.rb",
39
- "lib/spidr/cookie_jar.rb",
40
- "lib/spidr/events.rb",
41
- "lib/spidr/extensions.rb",
42
- "lib/spidr/extensions/uri.rb",
43
- "lib/spidr/filters.rb",
44
- "lib/spidr/page.rb",
45
- "lib/spidr/rules.rb",
46
- "lib/spidr/sanitizers.rb",
47
- "lib/spidr/session_cache.rb",
48
- "lib/spidr/spidr.rb",
49
- "lib/spidr/version.rb",
50
- "spec/actions_spec.rb",
51
- "spec/agent_spec.rb",
52
- "spec/auth_store_spec.rb",
53
- "spec/cookie_jar_spec.rb",
54
- "spec/extensions/uri_spec.rb",
55
- "spec/filters_spec.rb",
56
- "spec/helpers/history.rb",
57
- "spec/helpers/page.rb",
58
- "spec/helpers/wsoc.rb",
59
- "spec/page_examples.rb",
60
- "spec/page_spec.rb",
61
- "spec/rules_spec.rb",
62
- "spec/sanitizers_spec.rb",
63
- "spec/session_cache.rb",
64
- "spec/spec_helper.rb",
65
- "spec/spidr_spec.rb",
66
- "spidr.gemspec"
22
+ ".specopts",
23
+ ".yardopts",
24
+ "ChangeLog.md",
25
+ "Gemfile",
26
+ "LICENSE.txt",
27
+ "README.md",
28
+ "Rakefile",
29
+ "lib/spidr.rb",
30
+ "lib/spidr/actions.rb",
31
+ "lib/spidr/actions/actions.rb",
32
+ "lib/spidr/actions/exceptions.rb",
33
+ "lib/spidr/actions/exceptions/action.rb",
34
+ "lib/spidr/actions/exceptions/paused.rb",
35
+ "lib/spidr/actions/exceptions/skip_link.rb",
36
+ "lib/spidr/actions/exceptions/skip_page.rb",
37
+ "lib/spidr/agent.rb",
38
+ "lib/spidr/auth_credential.rb",
39
+ "lib/spidr/auth_store.rb",
40
+ "lib/spidr/cookie_jar.rb",
41
+ "lib/spidr/events.rb",
42
+ "lib/spidr/extensions.rb",
43
+ "lib/spidr/extensions/uri.rb",
44
+ "lib/spidr/filters.rb",
45
+ "lib/spidr/page.rb",
46
+ "lib/spidr/rules.rb",
47
+ "lib/spidr/sanitizers.rb",
48
+ "lib/spidr/session_cache.rb",
49
+ "lib/spidr/spidr.rb",
50
+ "lib/spidr/version.rb",
51
+ "spec/actions_spec.rb",
52
+ "spec/agent_spec.rb",
53
+ "spec/auth_store_spec.rb",
54
+ "spec/cookie_jar_spec.rb",
55
+ "spec/extensions/uri_spec.rb",
56
+ "spec/filters_spec.rb",
57
+ "spec/helpers/history.rb",
58
+ "spec/helpers/page.rb",
59
+ "spec/helpers/wsoc.rb",
60
+ "spec/page_examples.rb",
61
+ "spec/page_spec.rb",
62
+ "spec/rules_spec.rb",
63
+ "spec/sanitizers_spec.rb",
64
+ "spec/session_cache.rb",
65
+ "spec/spec_helper.rb",
66
+ "spec/spidr_spec.rb",
67
+ "spidr.gemspec"
67
68
  ]
68
69
  s.has_rdoc = %q{yard}
69
70
  s.homepage = %q{http://github.com/postmodern/spidr}
70
71
  s.licenses = ["MIT"]
71
- s.rdoc_options = ["--charset=UTF-8"]
72
72
  s.require_paths = ["lib"]
73
- s.rubygems_version = %q{1.3.6}
73
+ s.rubygems_version = %q{1.3.7}
74
74
  s.summary = %q{A versatile Ruby web spidering library}
75
75
  s.test_files = [
76
+ "spec/actions_spec.rb",
77
+ "spec/agent_spec.rb",
76
78
  "spec/auth_store_spec.rb",
77
- "spec/rules_spec.rb",
78
- "spec/session_cache.rb",
79
- "spec/spec_helper.rb",
80
- "spec/sanitizers_spec.rb",
81
- "spec/filters_spec.rb",
82
- "spec/page_spec.rb",
83
- "spec/spidr_spec.rb",
84
- "spec/agent_spec.rb",
85
- "spec/cookie_jar_spec.rb",
86
- "spec/extensions/uri_spec.rb",
87
- "spec/helpers/history.rb",
88
- "spec/helpers/page.rb",
89
- "spec/helpers/wsoc.rb",
90
- "spec/page_examples.rb",
91
- "spec/actions_spec.rb"
79
+ "spec/cookie_jar_spec.rb",
80
+ "spec/extensions/uri_spec.rb",
81
+ "spec/filters_spec.rb",
82
+ "spec/helpers/history.rb",
83
+ "spec/helpers/page.rb",
84
+ "spec/helpers/wsoc.rb",
85
+ "spec/page_examples.rb",
86
+ "spec/page_spec.rb",
87
+ "spec/rules_spec.rb",
88
+ "spec/sanitizers_spec.rb",
89
+ "spec/session_cache.rb",
90
+ "spec/spec_helper.rb",
91
+ "spec/spidr_spec.rb"
92
92
  ]
93
93
 
94
94
  if s.respond_to? :specification_version then
95
95
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
96
96
  s.specification_version = 3
97
97
 
98
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
98
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
99
99
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.0"])
100
+ s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
101
+ s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
100
102
  s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
101
- s.add_development_dependency(%q<yard>, ["~> 0.5.3"])
102
- s.add_development_dependency(%q<wsoc>, ["~> 0.1.1"])
103
103
  else
104
104
  s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
105
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
106
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
105
107
  s.add_dependency(%q<rspec>, ["~> 1.3.0"])
106
- s.add_dependency(%q<yard>, ["~> 0.5.3"])
107
- s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
108
108
  end
109
109
  else
110
110
  s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
111
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
112
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
111
113
  s.add_dependency(%q<rspec>, ["~> 1.3.0"])
112
- s.add_dependency(%q<yard>, ["~> 0.5.3"])
113
- s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
114
114
  end
115
115
  end
116
116
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 4
9
- version: 0.2.4
8
+ - 5
9
+ version: 0.2.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Postmodern
@@ -14,13 +14,13 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-05-05 00:00:00 -07:00
17
+ date: 2010-07-02 00:00:00 -07:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
22
- prerelease: false
23
22
  requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
@@ -30,48 +30,52 @@ dependencies:
30
30
  - 0
31
31
  version: 1.3.0
32
32
  type: :runtime
33
+ prerelease: false
33
34
  version_requirements: *id001
34
35
  - !ruby/object:Gem::Dependency
35
- name: rspec
36
- prerelease: false
36
+ name: rake
37
37
  requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
38
39
  requirements:
39
40
  - - ~>
40
41
  - !ruby/object:Gem::Version
41
42
  segments:
42
- - 1
43
- - 3
44
43
  - 0
45
- version: 1.3.0
44
+ - 8
45
+ - 7
46
+ version: 0.8.7
46
47
  type: :development
48
+ prerelease: false
47
49
  version_requirements: *id002
48
50
  - !ruby/object:Gem::Dependency
49
- name: yard
50
- prerelease: false
51
+ name: jeweler
51
52
  requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
52
54
  requirements:
53
55
  - - ~>
54
56
  - !ruby/object:Gem::Version
55
57
  segments:
58
+ - 1
59
+ - 4
56
60
  - 0
57
- - 5
58
- - 3
59
- version: 0.5.3
61
+ version: 1.4.0
60
62
  type: :development
63
+ prerelease: false
61
64
  version_requirements: *id003
62
65
  - !ruby/object:Gem::Dependency
63
- name: wsoc
64
- prerelease: false
66
+ name: rspec
65
67
  requirement: &id004 !ruby/object:Gem::Requirement
68
+ none: false
66
69
  requirements:
67
70
  - - ~>
68
71
  - !ruby/object:Gem::Version
69
72
  segments:
70
- - 0
71
73
  - 1
72
- - 1
73
- version: 0.1.1
74
+ - 3
75
+ - 0
76
+ version: 1.3.0
74
77
  type: :development
78
+ prerelease: false
75
79
  version_requirements: *id004
76
80
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
77
81
  email: postmodern.mod3@gmail.com
@@ -88,6 +92,7 @@ files:
88
92
  - .specopts
89
93
  - .yardopts
90
94
  - ChangeLog.md
95
+ - Gemfile
91
96
  - LICENSE.txt
92
97
  - README.md
93
98
  - Rakefile
@@ -135,18 +140,21 @@ homepage: http://github.com/postmodern/spidr
135
140
  licenses:
136
141
  - MIT
137
142
  post_install_message:
138
- rdoc_options:
139
- - --charset=UTF-8
143
+ rdoc_options: []
144
+
140
145
  require_paths:
141
146
  - lib
142
147
  required_ruby_version: !ruby/object:Gem::Requirement
148
+ none: false
143
149
  requirements:
144
150
  - - ">="
145
151
  - !ruby/object:Gem::Version
152
+ hash: 740918287
146
153
  segments:
147
154
  - 0
148
155
  version: "0"
149
156
  required_rubygems_version: !ruby/object:Gem::Requirement
157
+ none: false
150
158
  requirements:
151
159
  - - ">="
152
160
  - !ruby/object:Gem::Version
@@ -156,24 +164,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
156
164
  requirements: []
157
165
 
158
166
  rubyforge_project:
159
- rubygems_version: 1.3.6
167
+ rubygems_version: 1.3.7
160
168
  signing_key:
161
169
  specification_version: 3
162
170
  summary: A versatile Ruby web spidering library
163
171
  test_files:
164
- - spec/auth_store_spec.rb
165
- - spec/rules_spec.rb
166
- - spec/session_cache.rb
167
- - spec/spec_helper.rb
168
- - spec/sanitizers_spec.rb
169
- - spec/filters_spec.rb
170
- - spec/page_spec.rb
171
- - spec/spidr_spec.rb
172
+ - spec/actions_spec.rb
172
173
  - spec/agent_spec.rb
174
+ - spec/auth_store_spec.rb
173
175
  - spec/cookie_jar_spec.rb
174
176
  - spec/extensions/uri_spec.rb
177
+ - spec/filters_spec.rb
175
178
  - spec/helpers/history.rb
176
179
  - spec/helpers/page.rb
177
180
  - spec/helpers/wsoc.rb
178
181
  - spec/page_examples.rb
179
- - spec/actions_spec.rb
182
+ - spec/page_spec.rb
183
+ - spec/rules_spec.rb
184
+ - spec/sanitizers_spec.rb
185
+ - spec/session_cache.rb
186
+ - spec/spec_helper.rb
187
+ - spec/spidr_spec.rb