spidr 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -3,6 +3,7 @@ doc
3
3
  web
4
4
  tmp
5
5
  .DS_Store
6
+ .bundle
6
7
  .yardoc
7
8
  *.swp
8
9
  *~
data/ChangeLog.md CHANGED
@@ -1,3 +1,13 @@
1
+ ### 0.2.5 / 2010-07-02
2
+
3
+ * Added {Spidr::Page#meta_redirect}.
4
+ * Added {Spidr::Page#meta_redirect?}.
5
+ * Manage development dependencies with Bundler.
6
+ * Support following "old-school" meta-refresh redirects (thanks zapnap).
7
+ * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
8
+ * Fixed a constant lookup issue in {Spidr::Agent}.
9
+ * Use `yield` instead of `block.call` when necessary.
10
+
1
11
  ### 0.2.4 / 2010-05-05
2
12
 
3
13
  * Added {Spidr::Filters#visit_urls}.
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
+ source 'https://rubygems.org'
2
+
3
+ group(:runtime) do
4
+ gem 'nokogiri', '>= 1.3.0'
5
+ end
6
+
7
+ group(:development) do
8
+ gem 'rake', '~> 0.8.7'
9
+ gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
10
+ end
11
+
12
+ group(:doc) do
13
+ case RUBY_PLATFORM
14
+ when 'java'
15
+ gem 'maruku', '~> 0.6.0'
16
+ else
17
+ gem 'rdiscount', '~> 1.6.3'
18
+ end
19
+
20
+ gem 'yard', '~> 0.5.3'
21
+ end
22
+
23
+ group(:test) do
24
+ gem 'wsoc', '~> 0.1.3'
25
+ end
26
+
27
+ gem 'rspec', '~> 1.3.0', :group => [:development, :test]
data/README.md CHANGED
@@ -20,6 +20,7 @@ and easy to use.
20
20
  * frame tags.
21
21
  * Cookie protected links.
22
22
  * HTTP 300, 301, 302, 303 and 307 Redirects.
23
+ * Meta-Refresh Redirects.
23
24
  * HTTP Basic Auth protected links.
24
25
  * Black-list or white-list URLs based upon:
25
26
  * URL scheme.
data/Rakefile CHANGED
@@ -1,27 +1,28 @@
1
1
  require 'rubygems'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:development, :doc)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
11
+
2
12
  require 'rake'
13
+ require 'jeweler'
3
14
  require './lib/spidr/version.rb'
4
15
 
5
- begin
6
- require 'jeweler'
7
- Jeweler::Tasks.new do |gem|
8
- gem.name = 'spidr'
9
- gem.version = Spidr::VERSION
10
- gem.license = 'MIT'
11
- gem.summary = %Q{A versatile Ruby web spidering library}
12
- gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
13
- gem.email = 'postmodern.mod3@gmail.com'
14
- gem.homepage = 'http://github.com/postmodern/spidr'
15
- gem.authors = ['Postmodern']
16
- gem.add_dependency 'nokogiri', '>= 1.3.0'
17
- gem.add_development_dependency 'rspec', '~> 1.3.0'
18
- gem.add_development_dependency 'yard', '~> 0.5.3'
19
- gem.add_development_dependency 'wsoc', '~> 0.1.1'
20
- gem.has_rdoc = 'yard'
21
- end
22
- Jeweler::GemcutterTasks.new
23
- rescue LoadError
24
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
16
+ Jeweler::Tasks.new do |gem|
17
+ gem.name = 'spidr'
18
+ gem.version = Spidr::VERSION
19
+ gem.license = 'MIT'
20
+ gem.summary = %Q{A versatile Ruby web spidering library}
21
+ gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
22
+ gem.email = 'postmodern.mod3@gmail.com'
23
+ gem.homepage = 'http://github.com/postmodern/spidr'
24
+ gem.authors = ['Postmodern']
25
+ gem.has_rdoc = 'yard'
25
26
  end
26
27
 
27
28
  require 'spec/rake/spectask'
@@ -31,15 +32,7 @@ Spec::Rake::SpecTask.new(:spec) do |spec|
31
32
  spec.spec_opts = ['--options', '.specopts']
32
33
  end
33
34
 
34
- task :spec => :check_dependencies
35
35
  task :default => :spec
36
36
 
37
- begin
38
- require 'yard'
39
-
40
- YARD::Rake::YardocTask.new
41
- rescue LoadError
42
- task :yard do
43
- abort "YARD is not available. In order to run yard, you must: gem install yard"
44
- end
45
- end
37
+ require 'yard'
38
+ YARD::Rake::YardocTask.new
data/lib/spidr/agent.rb CHANGED
@@ -98,7 +98,7 @@ module Spidr
98
98
  # @yieldparam [Agent] agent
99
99
  # The newly created agent.
100
100
  #
101
- def initialize(options={},&block)
101
+ def initialize(options={})
102
102
  @host_header = options[:host_header]
103
103
  @host_headers = {}
104
104
 
@@ -121,7 +121,7 @@ module Spidr
121
121
 
122
122
  super(options)
123
123
 
124
- block.call(self) if block
124
+ yield self if block_given?
125
125
  end
126
126
 
127
127
  #
@@ -140,9 +140,9 @@ module Spidr
140
140
  # @yieldparam [Agent] agent
141
141
  # The newly created agent.
142
142
  #
143
- def self.start_at(url,options={},&block)
143
+ def self.start_at(url,options={})
144
144
  self.new(options) do |spider|
145
- block.call(spider) if block
145
+ yield spider if block_given?
146
146
 
147
147
  spider.start_at(url)
148
148
  end
@@ -164,9 +164,9 @@ module Spidr
164
164
  # @yieldparam [Agent] agent
165
165
  # The newly created agent.
166
166
  #
167
- def self.host(name,options={},&block)
167
+ def self.host(name,options={})
168
168
  self.new(options.merge(:host => name)) do |spider|
169
- block.call(spider) if block
169
+ yield spider if block_given?
170
170
 
171
171
  spider.start_at("http://#{name}/")
172
172
  end
@@ -188,11 +188,11 @@ module Spidr
188
188
  # @yieldparam [Agent] agent
189
189
  # The newly created agent.
190
190
  #
191
- def self.site(url,options={},&block)
191
+ def self.site(url,options={})
192
192
  url = URI(url.to_s)
193
193
 
194
194
  return self.new(options.merge(:host => url.host)) do |spider|
195
- block.call(spider) if block
195
+ yield spider if block_given?
196
196
 
197
197
  spider.start_at(url)
198
198
  end
@@ -457,11 +457,18 @@ module Spidr
457
457
  link = url.to_s
458
458
 
459
459
  begin
460
- @every_url_blocks.each { |block| block.call(url) }
460
+ @every_url_blocks.each { |url_block| url_block.call(url) }
461
461
 
462
- @urls_like_blocks.each do |pattern,blocks|
463
- if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
464
- blocks.each { |url_block| url_block.call(url) }
462
+ @urls_like_blocks.each do |pattern,url_blocks|
463
+ match = case pattern
464
+ when Regexp
465
+ link =~ pattern
466
+ else
467
+ (pattern == link) || (pattern == url)
468
+ end
469
+
470
+ if match
471
+ url_blocks.each { |url_block| url_block.call(url) }
465
472
  end
466
473
  end
467
474
  rescue Actions::Paused => action
@@ -494,7 +501,7 @@ module Spidr
494
501
  # @return [Page, nil]
495
502
  # The page for the response, or `nil` if the request failed.
496
503
  #
497
- def get_page(url,&block)
504
+ def get_page(url)
498
505
  url = URI(url.to_s)
499
506
 
500
507
  prepare_request(url) do |session,path,headers|
@@ -503,7 +510,7 @@ module Spidr
503
510
  # save any new cookies
504
511
  @cookies.from_page(new_page)
505
512
 
506
- block.call(new_page) if block
513
+ yield new_page if block_given?
507
514
  return new_page
508
515
  end
509
516
  end
@@ -529,7 +536,7 @@ module Spidr
529
536
  #
530
537
  # @since 0.2.2
531
538
  #
532
- def post_page(url,post_data='',&block)
539
+ def post_page(url,post_data='')
533
540
  url = URI(url.to_s)
534
541
 
535
542
  prepare_request(url) do |session,path,headers|
@@ -538,7 +545,7 @@ module Spidr
538
545
  # save any new cookies
539
546
  @cookies.from_page(new_page)
540
547
 
541
- block.call(new_page) if block
548
+ yield new_page if block_given?
542
549
  return new_page
543
550
  end
544
551
  end
@@ -560,7 +567,7 @@ module Spidr
560
567
  # The page that was visited. If `nil` is returned, either the request
561
568
  # for the page failed, or the page was skipped.
562
569
  #
563
- def visit_page(url,&block)
570
+ def visit_page(url)
564
571
  url = URI(url.to_s) unless url.kind_of?(URI)
565
572
 
566
573
  get_page(url) do |page|
@@ -569,7 +576,7 @@ module Spidr
569
576
  begin
570
577
  @every_page_blocks.each { |page_block| page_block.call(page) }
571
578
 
572
- block.call(page) if block
579
+ yield page if block_given?
573
580
  rescue Actions::Paused => action
574
581
  raise(action)
575
582
  rescue Actions::SkipPage
@@ -668,7 +675,7 @@ module Spidr
668
675
  begin
669
676
  sleep(@delay) if @delay > 0
670
677
 
671
- block.call(@sessions[url],path,headers)
678
+ yield @sessions[url], path, headers
672
679
  rescue SystemCallError,
673
680
  Timeout::Error,
674
681
  SocketError,
@@ -719,7 +726,7 @@ module Spidr
719
726
  #
720
727
  def failed(url)
721
728
  @failures << url
722
- @every_failed_url_blocks.each { |block| block.call(url) }
729
+ @every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
723
730
  return true
724
731
  end
725
732
 
data/lib/spidr/cookie_jar.rb CHANGED
@@ -130,6 +130,22 @@ module Spidr
130
130
  @dirty.delete(host)
131
131
  end
132
132
 
133
+ hdomain = host.split('.')
134
+
135
+ if hdomain.length > 2
136
+ parent_cookies = for_host(hdomain[1..-1].join('.'))
137
+
138
+ unless (parent_cookies.nil? || parent_cookies.empty?)
139
+ @cookies[host] = if @cookies[host].nil?
140
+ # inherit the parent cookies
141
+ parent_cookies
142
+ else
143
+ # merge the parent cookies with any host-specific cookies
144
+ "#{parent_cookies}; #{@cookies[host]}"
145
+ end
146
+ end
147
+ end
148
+
133
149
  return @cookies[host]
134
150
  end
135
151
 
data/lib/spidr/events.rb CHANGED
@@ -72,8 +72,8 @@ module Spidr
72
72
  # @yieldparam [Hash] headers
73
73
  # The headers from a response.
74
74
  #
75
- def all_headers(&block)
76
- every_page { |page| block.call(page.headers) }
75
+ def all_headers
76
+ every_page { |page| yield page.headers }
77
77
  end
78
78
 
79
79
  #
@@ -99,9 +99,9 @@ module Spidr
99
99
  # @yieldparam [Page] page
100
100
  # A visited page.
101
101
  #
102
- def every_ok_page(&block)
102
+ def every_ok_page
103
103
  every_page do |page|
104
- block.call(page) if (block && page.ok?)
104
+ yield page if (block_given? && page.ok?)
105
105
  end
106
106
  end
107
107
 
@@ -114,9 +114,9 @@ module Spidr
114
114
  # @yieldparam [Page] page
115
115
  # A visited page.
116
116
  #
117
- def every_redirect_page(&block)
117
+ def every_redirect_page
118
118
  every_page do |page|
119
- block.call(page) if (block && page.redirect?)
119
+ yield page if (block_given? && page.redirect?)
120
120
  end
121
121
  end
122
122
 
@@ -129,9 +129,9 @@ module Spidr
129
129
  # @yieldparam [Page] page
130
130
  # A visited page.
131
131
  #
132
- def every_timedout_page(&block)
132
+ def every_timedout_page
133
133
  every_page do |page|
134
- block.call(page) if (block && page.timedout?)
134
+ yield page if (block_given? && page.timedout?)
135
135
  end
136
136
  end
137
137
 
@@ -144,9 +144,9 @@ module Spidr
144
144
  # @yieldparam [Page] page
145
145
  # A visited page.
146
146
  #
147
- def every_bad_request_page(&block)
147
+ def every_bad_request_page
148
148
  every_page do |page|
149
- block.call(page) if (block && page.bad_request?)
149
+ yield page if (block_given? && page.bad_request?)
150
150
  end
151
151
  end
152
152
 
@@ -159,9 +159,9 @@ module Spidr
159
159
  # @yieldparam [Page] page
160
160
  # A visited page.
161
161
  #
162
- def every_unauthorized_page(&block)
162
+ def every_unauthorized_page
163
163
  every_page do |page|
164
- block.call(page) if (block && page.unauthorized?)
164
+ yield page if (block_given? && page.unauthorized?)
165
165
  end
166
166
  end
167
167
 
@@ -174,9 +174,9 @@ module Spidr
174
174
  # @yieldparam [Page] page
175
175
  # A visited page.
176
176
  #
177
- def every_forbidden_page(&block)
177
+ def every_forbidden_page
178
178
  every_page do |page|
179
- block.call(page) if (block && page.forbidden?)
179
+ yield page if (block_given? && page.forbidden?)
180
180
  end
181
181
  end
182
182
 
@@ -189,9 +189,9 @@ module Spidr
189
189
  # @yieldparam [Page] page
190
190
  # A visited page.
191
191
  #
192
- def every_missing_page(&block)
192
+ def every_missing_page
193
193
  every_page do |page|
194
- block.call(page) if (block && page.missing?)
194
+ yield page if (block_given? && page.missing?)
195
195
  end
196
196
  end
197
197
 
@@ -205,9 +205,9 @@ module Spidr
205
205
  # @yieldparam [Page] page
206
206
  # A visited page.
207
207
  #
208
- def every_internal_server_error_page(&block)
208
+ def every_internal_server_error_page
209
209
  every_page do |page|
210
- block.call(page) if (block && page.had_internal_server_error?)
210
+ yield page if (block_given? && page.had_internal_server_error?)
211
211
  end
212
212
  end
213
213
 
@@ -220,9 +220,9 @@ module Spidr
220
220
  # @yieldparam [Page] page
221
221
  # A visited page.
222
222
  #
223
- def every_txt_page(&block)
223
+ def every_txt_page
224
224
  every_page do |page|
225
- block.call(page) if (block && page.txt?)
225
+ yield page if (block_given? && page.txt?)
226
226
  end
227
227
  end
228
228
 
@@ -235,9 +235,9 @@ module Spidr
235
235
  # @yieldparam [Page] page
236
236
  # A visited page.
237
237
  #
238
- def every_html_page(&block)
238
+ def every_html_page
239
239
  every_page do |page|
240
- block.call(page) if (block && page.html?)
240
+ yield page if (block_given? && page.html?)
241
241
  end
242
242
  end
243
243
 
@@ -250,9 +250,9 @@ module Spidr
250
250
  # @yieldparam [Page] page
251
251
  # A visited page.
252
252
  #
253
- def every_xml_page(&block)
253
+ def every_xml_page
254
254
  every_page do |page|
255
- block.call(page) if (block && page.xml?)
255
+ yield page if (block_given? && page.xml?)
256
256
  end
257
257
  end
258
258
 
@@ -266,9 +266,9 @@ module Spidr
266
266
  # @yieldparam [Page] page
267
267
  # A visited page.
268
268
  #
269
- def every_xsl_page(&block)
269
+ def every_xsl_page
270
270
  every_page do |page|
271
- block.call(page) if (block && page.xsl?)
271
+ yield page if (block_given? && page.xsl?)
272
272
  end
273
273
  end
274
274
 
@@ -285,11 +285,11 @@ module Spidr
285
285
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
286
286
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
287
287
  #
288
- def every_doc(&block)
288
+ def every_doc
289
289
  every_page do |page|
290
- if block
290
+ if block_given?
291
291
  if (doc = page.doc)
292
- block.call(doc)
292
+ yield doc
293
293
  end
294
294
  end
295
295
  end
@@ -306,11 +306,11 @@ module Spidr
306
306
  #
307
307
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
308
308
  #
309
- def every_html_doc(&block)
309
+ def every_html_doc
310
310
  every_page do |page|
311
- if (block && page.html?)
311
+ if (block_given? && page.html?)
312
312
  if (doc = page.doc)
313
- block.call(doc)
313
+ yield doc
314
314
  end
315
315
  end
316
316
  end
@@ -327,11 +327,11 @@ module Spidr
327
327
  #
328
328
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
329
329
  #
330
- def every_xml_doc(&block)
330
+ def every_xml_doc
331
331
  every_page do |page|
332
- if (block && page.xml?)
332
+ if (block_given? && page.xml?)
333
333
  if (doc = page.doc)
334
- block.call(doc)
334
+ yield doc
335
335
  end
336
336
  end
337
337
  end
@@ -349,11 +349,11 @@ module Spidr
349
349
  #
350
350
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
351
351
  #
352
- def every_xsl_doc(&block)
352
+ def every_xsl_doc
353
353
  every_page do |page|
354
- if (block && page.xsl?)
354
+ if (block_given? && page.xsl?)
355
355
  if (doc = page.doc)
356
- block.call(doc)
356
+ yield doc
357
357
  end
358
358
  end
359
359
  end
@@ -370,11 +370,11 @@ module Spidr
370
370
  #
371
371
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
372
372
  #
373
- def every_rss_doc(&block)
373
+ def every_rss_doc
374
374
  every_page do |page|
375
- if (block && page.rss?)
375
+ if (block_given? && page.rss?)
376
376
  if (doc = page.doc)
377
- block.call(doc)
377
+ yield doc
378
378
  end
379
379
  end
380
380
  end
@@ -391,11 +391,11 @@ module Spidr
391
391
  #
392
392
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
393
393
  #
394
- def every_atom_doc(&block)
394
+ def every_atom_doc
395
395
  every_page do |page|
396
- if (block && page.atom?)
396
+ if (block_given? && page.atom?)
397
397
  if (doc = page.doc)
398
- block.call(doc)
398
+ yield doc
399
399
  end
400
400
  end
401
401
  end
@@ -410,9 +410,9 @@ module Spidr
410
410
  # @yieldparam [Page] page
411
411
  # A visited page.
412
412
  #
413
- def every_javascript_page(&block)
413
+ def every_javascript_page
414
414
  every_page do |page|
415
- block.call(page) if (block && page.javascript?)
415
+ yield page if (block_given? && page.javascript?)
416
416
  end
417
417
  end
418
418
 
@@ -425,9 +425,9 @@ module Spidr
425
425
  # @yieldparam [Page] page
426
426
  # A visited page.
427
427
  #
428
- def every_css_page(&block)
428
+ def every_css_page
429
429
  every_page do |page|
430
- block.call(page) if (block && page.css?)
430
+ yield page if (block_given? && page.css?)
431
431
  end
432
432
  end
433
433
 
@@ -440,9 +440,9 @@ module Spidr
440
440
  # @yieldparam [Page] feed
441
441
  # A visited page.
442
442
  #
443
- def every_rss_page(&block)
443
+ def every_rss_page
444
444
  every_page do |page|
445
- block.call(page) if (block && page.rss?)
445
+ yield page if (block_given? && page.rss?)
446
446
  end
447
447
  end
448
448
 
@@ -455,9 +455,9 @@ module Spidr
455
455
  # @yieldparam [Page] feed
456
456
  # A visited page.
457
457
  #
458
- def every_atom_page(&block)
458
+ def every_atom_page
459
459
  every_page do |page|
460
- block.call(page) if (block && page.atom?)
460
+ yield page if (block_given? && page.atom?)
461
461
  end
462
462
  end
463
463
 
@@ -470,9 +470,9 @@ module Spidr
470
470
  # @yieldparam [Page] page
471
471
  # A visited page.
472
472
  #
473
- def every_ms_word_page(&block)
473
+ def every_ms_word_page
474
474
  every_page do |page|
475
- block.call(page) if (block && page.ms_word?)
475
+ yield page if (block_given? && page.ms_word?)
476
476
  end
477
477
  end
478
478
 
@@ -485,9 +485,9 @@ module Spidr
485
485
  # @yieldparam [Page] page
486
486
  # A visited page.
487
487
  #
488
- def every_pdf_page(&block)
488
+ def every_pdf_page
489
489
  every_page do |page|
490
- block.call(page) if (block && page.pdf?)
490
+ yield page if (block_given? && page.pdf?)
491
491
  end
492
492
  end
493
493
 
@@ -500,9 +500,9 @@ module Spidr
500
500
  # @yieldparam [Page] page
501
501
  # A visited page.
502
502
  #
503
- def every_zip_page(&block)
503
+ def every_zip_page
504
504
  every_page do |page|
505
- block.call(page) if (block && page.zip?)
505
+ yield page if (block_given? && page.zip?)
506
506
  end
507
507
  end
508
508
 
data/lib/spidr/page.rb CHANGED
@@ -62,7 +62,8 @@ module Spidr
62
62
 
63
63
  #
64
64
  # Determines if the response code is `300`, `301`, `302`, `303`
65
- # or `307`.
65
+ # or `307`. Also checks for "soft" redirects added at the page
66
+ # level by a meta refresh tag.
66
67
  #
67
68
  # @return [Boolean]
68
69
  # Specifies whether the response code is a HTTP Redirect code.
@@ -71,6 +72,8 @@ module Spidr
71
72
  case code
72
73
  when 300..303, 307
73
74
  true
75
+ when 200
76
+ meta_redirect?
74
77
  else
75
78
  false
76
79
  end
@@ -434,17 +437,7 @@ module Spidr
434
437
  urls << url unless (url.nil? || url.empty?)
435
438
  }
436
439
 
437
- if self.is_redirect?
438
- location = @headers['location']
439
-
440
- if location.kind_of?(Array)
441
- # handle multiple location URLs
442
- location.each(&add_url)
443
- else
444
- # usually the location header contains a single String
445
- add_url.call(location)
446
- end
447
- end
440
+ self.redirects_to.each(&add_url) if self.is_redirect?
448
441
 
449
442
  if (html? && doc)
450
443
  doc.search('a[@href]').each do |a|
@@ -471,6 +464,27 @@ module Spidr
471
464
  return urls
472
465
  end
473
466
 
467
+ #
468
+ # URL(s) that this document redirects to.
469
+ #
470
+ # @return [Array<String>]
471
+ # The links that this page redirects to (usually found in a
472
+ # location header or by way of a page-level meta redirect).
473
+ #
474
+ def redirects_to
475
+ location = @headers['location']
476
+
477
+ if location.nil?
478
+ # check page-level meta redirects if there isn't a location header
479
+ meta_redirect
480
+ elsif location.kind_of?(Array)
481
+ location
482
+ else
483
+ # usually the location header contains a single String
484
+ [location]
485
+ end
486
+ end
487
+
474
488
  #
475
489
  # Absolute URIs from within the page.
476
490
  #
@@ -507,6 +521,43 @@ module Spidr
507
521
  return url
508
522
  end
509
523
 
524
+ #
525
+ # Determines if a page-level "soft" redirect is present. If yes,
526
+ # returns an array of those redirects (usually a single URL).
527
+ # Otherwise, returns an empty array.
528
+ #
529
+ # @return [Array<String>]
530
+ # An array of redirect URLs
531
+ #
532
+ def meta_redirect
533
+ redirects = []
534
+
535
+ if (html? && doc)
536
+ search('//meta[@http-equiv and @content]').each do |node|
537
+ if node.attr('http-equiv') =~ /refresh/i
538
+ content = node.attr('content')
539
+
540
+ if (redirect = content.match(/url=(\S+)$/))
541
+ redirects << redirect[1]
542
+ end
543
+ end
544
+ end
545
+ end
546
+
547
+ return redirects.uniq
548
+ end
549
+
550
+ #
551
+ # Returns a boolean indicating whether or not page-level meta
552
+ # redirects are present in this page.
553
+ #
554
+ # @return [Boolean]
555
+ # Specifies whether the page includes page-level redirects.
556
+ #
557
+ def meta_redirect?
558
+ !meta_redirect.empty?
559
+ end
560
+
510
561
  protected
511
562
 
512
563
  #
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.4'
3
+ VERSION = '0.2.5'
4
4
  end
data/spec/cookie_jar_spec.rb CHANGED
@@ -101,8 +101,21 @@ describe CookieJar do
101
101
  it "should encode multiple cookie params" do
102
102
  @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
103
103
  @cookie_jar['zerosum.org'] = {'other' => '1'}
104
+ cookie = @cookie_jar.for_host('zerosum.org')
104
105
 
105
- @cookie_jar.for_host('zerosum.org').should == 'admin=ofcourseiam; other=1'
106
+ cookie.should include('admin=ofcourseiam')
107
+ cookie.should include('; ')
108
+ cookie.should include('other=1')
109
+ end
110
+
111
+ it "should include cookies for the parent domain" do
112
+ @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
113
+ @cookie_jar['sub.zerosum.org'] = {'other' => '1'}
114
+ cookie = @cookie_jar.for_host('sub.zerosum.org')
115
+
116
+ cookie.should include('admin=ofcourseiam')
117
+ cookie.should include('; ')
118
+ cookie.should include('other=1')
106
119
  end
107
120
  end
108
121
  end
data/spec/helpers/wsoc.rb CHANGED
@@ -66,7 +66,7 @@ module Helpers
66
66
  end
67
67
 
68
68
  def run_course
69
- Agent.start_at(COURSE_URL) do |agent|
69
+ Spidr::Agent.start_at(COURSE_URL) do |agent|
70
70
  course_auth_store.each do |path,auth|
71
71
  agent.authorized.add(
72
72
  COURSE_URL.merge(path),
data/spec/page_spec.rb CHANGED
@@ -79,6 +79,21 @@ describe Page do
79
79
  end
80
80
  end
81
81
 
82
+ describe "redirects" do
83
+ before(:all) do
84
+ @page = get_page('http://spidr.rubyforge.org/course/start.html')
85
+ @page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
86
+ end
87
+
88
+ it "should provide access to page-level redirects" do
89
+ @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
90
+ end
91
+
92
+ it "should include meta refresh redirects in the list of links" do
93
+ @page.links.should include('http://spidr.rubyforge.org/redirected')
94
+ end
95
+ end
96
+
82
97
  describe "cookies" do
83
98
  before(:all) do
84
99
  @page = get_page('http://twitter.com/login')
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,15 @@
1
1
  require 'rubygems'
2
- gem 'rspec', '>=1.2.8'
3
- require 'spec'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:runtime, :test)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
4
11
 
12
+ require 'spec'
5
13
  require 'spidr/version'
6
14
 
7
15
  include Spidr
data/spidr.gemspec CHANGED
@@ -5,112 +5,112 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{spidr}
8
- s.version = "0.2.4"
8
+ s.version = "0.2.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Postmodern"]
12
- s.date = %q{2010-05-05}
12
+ s.date = %q{2010-07-02}
13
13
  s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
14
14
  s.email = %q{postmodern.mod3@gmail.com}
15
15
  s.extra_rdoc_files = [
16
16
  "ChangeLog.md",
17
- "LICENSE.txt",
18
- "README.md"
17
+ "LICENSE.txt",
18
+ "README.md"
19
19
  ]
20
20
  s.files = [
21
21
  ".gitignore",
22
- ".specopts",
23
- ".yardopts",
24
- "ChangeLog.md",
25
- "LICENSE.txt",
26
- "README.md",
27
- "Rakefile",
28
- "lib/spidr.rb",
29
- "lib/spidr/actions.rb",
30
- "lib/spidr/actions/actions.rb",
31
- "lib/spidr/actions/exceptions.rb",
32
- "lib/spidr/actions/exceptions/action.rb",
33
- "lib/spidr/actions/exceptions/paused.rb",
34
- "lib/spidr/actions/exceptions/skip_link.rb",
35
- "lib/spidr/actions/exceptions/skip_page.rb",
36
- "lib/spidr/agent.rb",
37
- "lib/spidr/auth_credential.rb",
38
- "lib/spidr/auth_store.rb",
39
- "lib/spidr/cookie_jar.rb",
40
- "lib/spidr/events.rb",
41
- "lib/spidr/extensions.rb",
42
- "lib/spidr/extensions/uri.rb",
43
- "lib/spidr/filters.rb",
44
- "lib/spidr/page.rb",
45
- "lib/spidr/rules.rb",
46
- "lib/spidr/sanitizers.rb",
47
- "lib/spidr/session_cache.rb",
48
- "lib/spidr/spidr.rb",
49
- "lib/spidr/version.rb",
50
- "spec/actions_spec.rb",
51
- "spec/agent_spec.rb",
52
- "spec/auth_store_spec.rb",
53
- "spec/cookie_jar_spec.rb",
54
- "spec/extensions/uri_spec.rb",
55
- "spec/filters_spec.rb",
56
- "spec/helpers/history.rb",
57
- "spec/helpers/page.rb",
58
- "spec/helpers/wsoc.rb",
59
- "spec/page_examples.rb",
60
- "spec/page_spec.rb",
61
- "spec/rules_spec.rb",
62
- "spec/sanitizers_spec.rb",
63
- "spec/session_cache.rb",
64
- "spec/spec_helper.rb",
65
- "spec/spidr_spec.rb",
66
- "spidr.gemspec"
22
+ ".specopts",
23
+ ".yardopts",
24
+ "ChangeLog.md",
25
+ "Gemfile",
26
+ "LICENSE.txt",
27
+ "README.md",
28
+ "Rakefile",
29
+ "lib/spidr.rb",
30
+ "lib/spidr/actions.rb",
31
+ "lib/spidr/actions/actions.rb",
32
+ "lib/spidr/actions/exceptions.rb",
33
+ "lib/spidr/actions/exceptions/action.rb",
34
+ "lib/spidr/actions/exceptions/paused.rb",
35
+ "lib/spidr/actions/exceptions/skip_link.rb",
36
+ "lib/spidr/actions/exceptions/skip_page.rb",
37
+ "lib/spidr/agent.rb",
38
+ "lib/spidr/auth_credential.rb",
39
+ "lib/spidr/auth_store.rb",
40
+ "lib/spidr/cookie_jar.rb",
41
+ "lib/spidr/events.rb",
42
+ "lib/spidr/extensions.rb",
43
+ "lib/spidr/extensions/uri.rb",
44
+ "lib/spidr/filters.rb",
45
+ "lib/spidr/page.rb",
46
+ "lib/spidr/rules.rb",
47
+ "lib/spidr/sanitizers.rb",
48
+ "lib/spidr/session_cache.rb",
49
+ "lib/spidr/spidr.rb",
50
+ "lib/spidr/version.rb",
51
+ "spec/actions_spec.rb",
52
+ "spec/agent_spec.rb",
53
+ "spec/auth_store_spec.rb",
54
+ "spec/cookie_jar_spec.rb",
55
+ "spec/extensions/uri_spec.rb",
56
+ "spec/filters_spec.rb",
57
+ "spec/helpers/history.rb",
58
+ "spec/helpers/page.rb",
59
+ "spec/helpers/wsoc.rb",
60
+ "spec/page_examples.rb",
61
+ "spec/page_spec.rb",
62
+ "spec/rules_spec.rb",
63
+ "spec/sanitizers_spec.rb",
64
+ "spec/session_cache.rb",
65
+ "spec/spec_helper.rb",
66
+ "spec/spidr_spec.rb",
67
+ "spidr.gemspec"
67
68
  ]
68
69
  s.has_rdoc = %q{yard}
69
70
  s.homepage = %q{http://github.com/postmodern/spidr}
70
71
  s.licenses = ["MIT"]
71
- s.rdoc_options = ["--charset=UTF-8"]
72
72
  s.require_paths = ["lib"]
73
- s.rubygems_version = %q{1.3.6}
73
+ s.rubygems_version = %q{1.3.7}
74
74
  s.summary = %q{A versatile Ruby web spidering library}
75
75
  s.test_files = [
76
+ "spec/actions_spec.rb",
77
+ "spec/agent_spec.rb",
76
78
  "spec/auth_store_spec.rb",
77
- "spec/rules_spec.rb",
78
- "spec/session_cache.rb",
79
- "spec/spec_helper.rb",
80
- "spec/sanitizers_spec.rb",
81
- "spec/filters_spec.rb",
82
- "spec/page_spec.rb",
83
- "spec/spidr_spec.rb",
84
- "spec/agent_spec.rb",
85
- "spec/cookie_jar_spec.rb",
86
- "spec/extensions/uri_spec.rb",
87
- "spec/helpers/history.rb",
88
- "spec/helpers/page.rb",
89
- "spec/helpers/wsoc.rb",
90
- "spec/page_examples.rb",
91
- "spec/actions_spec.rb"
79
+ "spec/cookie_jar_spec.rb",
80
+ "spec/extensions/uri_spec.rb",
81
+ "spec/filters_spec.rb",
82
+ "spec/helpers/history.rb",
83
+ "spec/helpers/page.rb",
84
+ "spec/helpers/wsoc.rb",
85
+ "spec/page_examples.rb",
86
+ "spec/page_spec.rb",
87
+ "spec/rules_spec.rb",
88
+ "spec/sanitizers_spec.rb",
89
+ "spec/session_cache.rb",
90
+ "spec/spec_helper.rb",
91
+ "spec/spidr_spec.rb"
92
92
  ]
93
93
 
94
94
  if s.respond_to? :specification_version then
95
95
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
96
96
  s.specification_version = 3
97
97
 
98
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
98
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
99
99
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.0"])
100
+ s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
101
+ s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
100
102
  s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
101
- s.add_development_dependency(%q<yard>, ["~> 0.5.3"])
102
- s.add_development_dependency(%q<wsoc>, ["~> 0.1.1"])
103
103
  else
104
104
  s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
105
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
106
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
105
107
  s.add_dependency(%q<rspec>, ["~> 1.3.0"])
106
- s.add_dependency(%q<yard>, ["~> 0.5.3"])
107
- s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
108
108
  end
109
109
  else
110
110
  s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
111
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
112
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
111
113
  s.add_dependency(%q<rspec>, ["~> 1.3.0"])
112
- s.add_dependency(%q<yard>, ["~> 0.5.3"])
113
- s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
114
114
  end
115
115
  end
116
116
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 4
9
- version: 0.2.4
8
+ - 5
9
+ version: 0.2.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Postmodern
@@ -14,13 +14,13 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-05-05 00:00:00 -07:00
17
+ date: 2010-07-02 00:00:00 -07:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
22
- prerelease: false
23
22
  requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
@@ -30,48 +30,52 @@ dependencies:
30
30
  - 0
31
31
  version: 1.3.0
32
32
  type: :runtime
33
+ prerelease: false
33
34
  version_requirements: *id001
34
35
  - !ruby/object:Gem::Dependency
35
- name: rspec
36
- prerelease: false
36
+ name: rake
37
37
  requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
38
39
  requirements:
39
40
  - - ~>
40
41
  - !ruby/object:Gem::Version
41
42
  segments:
42
- - 1
43
- - 3
44
43
  - 0
45
- version: 1.3.0
44
+ - 8
45
+ - 7
46
+ version: 0.8.7
46
47
  type: :development
48
+ prerelease: false
47
49
  version_requirements: *id002
48
50
  - !ruby/object:Gem::Dependency
49
- name: yard
50
- prerelease: false
51
+ name: jeweler
51
52
  requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
52
54
  requirements:
53
55
  - - ~>
54
56
  - !ruby/object:Gem::Version
55
57
  segments:
58
+ - 1
59
+ - 4
56
60
  - 0
57
- - 5
58
- - 3
59
- version: 0.5.3
61
+ version: 1.4.0
60
62
  type: :development
63
+ prerelease: false
61
64
  version_requirements: *id003
62
65
  - !ruby/object:Gem::Dependency
63
- name: wsoc
64
- prerelease: false
66
+ name: rspec
65
67
  requirement: &id004 !ruby/object:Gem::Requirement
68
+ none: false
66
69
  requirements:
67
70
  - - ~>
68
71
  - !ruby/object:Gem::Version
69
72
  segments:
70
- - 0
71
73
  - 1
72
- - 1
73
- version: 0.1.1
74
+ - 3
75
+ - 0
76
+ version: 1.3.0
74
77
  type: :development
78
+ prerelease: false
75
79
  version_requirements: *id004
76
80
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
77
81
  email: postmodern.mod3@gmail.com
@@ -88,6 +92,7 @@ files:
88
92
  - .specopts
89
93
  - .yardopts
90
94
  - ChangeLog.md
95
+ - Gemfile
91
96
  - LICENSE.txt
92
97
  - README.md
93
98
  - Rakefile
@@ -135,18 +140,21 @@ homepage: http://github.com/postmodern/spidr
135
140
  licenses:
136
141
  - MIT
137
142
  post_install_message:
138
- rdoc_options:
139
- - --charset=UTF-8
143
+ rdoc_options: []
144
+
140
145
  require_paths:
141
146
  - lib
142
147
  required_ruby_version: !ruby/object:Gem::Requirement
148
+ none: false
143
149
  requirements:
144
150
  - - ">="
145
151
  - !ruby/object:Gem::Version
152
+ hash: 740918287
146
153
  segments:
147
154
  - 0
148
155
  version: "0"
149
156
  required_rubygems_version: !ruby/object:Gem::Requirement
157
+ none: false
150
158
  requirements:
151
159
  - - ">="
152
160
  - !ruby/object:Gem::Version
@@ -156,24 +164,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
156
164
  requirements: []
157
165
 
158
166
  rubyforge_project:
159
- rubygems_version: 1.3.6
167
+ rubygems_version: 1.3.7
160
168
  signing_key:
161
169
  specification_version: 3
162
170
  summary: A versatile Ruby web spidering library
163
171
  test_files:
164
- - spec/auth_store_spec.rb
165
- - spec/rules_spec.rb
166
- - spec/session_cache.rb
167
- - spec/spec_helper.rb
168
- - spec/sanitizers_spec.rb
169
- - spec/filters_spec.rb
170
- - spec/page_spec.rb
171
- - spec/spidr_spec.rb
172
+ - spec/actions_spec.rb
172
173
  - spec/agent_spec.rb
174
+ - spec/auth_store_spec.rb
173
175
  - spec/cookie_jar_spec.rb
174
176
  - spec/extensions/uri_spec.rb
177
+ - spec/filters_spec.rb
175
178
  - spec/helpers/history.rb
176
179
  - spec/helpers/page.rb
177
180
  - spec/helpers/wsoc.rb
178
181
  - spec/page_examples.rb
179
- - spec/actions_spec.rb
182
+ - spec/page_spec.rb
183
+ - spec/rules_spec.rb
184
+ - spec/sanitizers_spec.rb
185
+ - spec/session_cache.rb
186
+ - spec/spec_helper.rb
187
+ - spec/spidr_spec.rb