spidr 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/ChangeLog.md +10 -0
- data/Gemfile +27 -0
- data/README.md +1 -0
- data/Rakefile +23 -30
- data/lib/spidr/agent.rb +27 -20
- data/lib/spidr/cookie_jar.rb +16 -0
- data/lib/spidr/events.rb +58 -58
- data/lib/spidr/page.rb +63 -12
- data/lib/spidr/version.rb +1 -1
- data/spec/cookie_jar_spec.rb +14 -1
- data/spec/helpers/wsoc.rb +1 -1
- data/spec/page_spec.rb +15 -0
- data/spec/spec_helper.rb +10 -2
- data/spidr.gemspec +73 -73
- metadata +39 -31
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
### 0.2.5 / 2010-07-02
|
2
|
+
|
3
|
+
* Added {Spidr::Page#meta_redirect}.
|
4
|
+
* Added {Spidr::Page#meta_redirect?}.
|
5
|
+
* Manage development dependencies with Bundler.
|
6
|
+
* Support following "old-school" meta-refresh redirects (thanks zapnap).
|
7
|
+
* Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
|
8
|
+
* Fixed a constant lookup issue in {Spidr::Agent}.
|
9
|
+
* Use `yield` instead of `block.call` when necessary.
|
10
|
+
|
1
11
|
### 0.2.4 / 2010-05-05
|
2
12
|
|
3
13
|
* Added {Spidr::Filters#visit_urls}.
|
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
group(:runtime) do
|
4
|
+
gem 'nokogiri', '>= 1.3.0'
|
5
|
+
end
|
6
|
+
|
7
|
+
group(:development) do
|
8
|
+
gem 'rake', '~> 0.8.7'
|
9
|
+
gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
|
10
|
+
end
|
11
|
+
|
12
|
+
group(:doc) do
|
13
|
+
case RUBY_PLATFORM
|
14
|
+
when 'java'
|
15
|
+
gem 'maruku', '~> 0.6.0'
|
16
|
+
else
|
17
|
+
gem 'rdiscount', '~> 1.6.3'
|
18
|
+
end
|
19
|
+
|
20
|
+
gem 'yard', '~> 0.5.3'
|
21
|
+
end
|
22
|
+
|
23
|
+
group(:test) do
|
24
|
+
gem 'wsoc', '~> 0.1.3'
|
25
|
+
end
|
26
|
+
|
27
|
+
gem 'rspec', '~> 1.3.0', :group => [:development, :test]
|
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
|
4
|
+
begin
|
5
|
+
Bundler.setup(:development, :doc)
|
6
|
+
rescue Bundler::BundlerError => e
|
7
|
+
STDERR.puts e.message
|
8
|
+
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
+
exit e.status_code
|
10
|
+
end
|
11
|
+
|
2
12
|
require 'rake'
|
13
|
+
require 'jeweler'
|
3
14
|
require './lib/spidr/version.rb'
|
4
15
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
gem.authors = ['Postmodern']
|
16
|
-
gem.add_dependency 'nokogiri', '>= 1.3.0'
|
17
|
-
gem.add_development_dependency 'rspec', '~> 1.3.0'
|
18
|
-
gem.add_development_dependency 'yard', '~> 0.5.3'
|
19
|
-
gem.add_development_dependency 'wsoc', '~> 0.1.1'
|
20
|
-
gem.has_rdoc = 'yard'
|
21
|
-
end
|
22
|
-
Jeweler::GemcutterTasks.new
|
23
|
-
rescue LoadError
|
24
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
16
|
+
Jeweler::Tasks.new do |gem|
|
17
|
+
gem.name = 'spidr'
|
18
|
+
gem.version = Spidr::VERSION
|
19
|
+
gem.license = 'MIT'
|
20
|
+
gem.summary = %Q{A versatile Ruby web spidering library}
|
21
|
+
gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
22
|
+
gem.email = 'postmodern.mod3@gmail.com'
|
23
|
+
gem.homepage = 'http://github.com/postmodern/spidr'
|
24
|
+
gem.authors = ['Postmodern']
|
25
|
+
gem.has_rdoc = 'yard'
|
25
26
|
end
|
26
27
|
|
27
28
|
require 'spec/rake/spectask'
|
@@ -31,15 +32,7 @@ Spec::Rake::SpecTask.new(:spec) do |spec|
|
|
31
32
|
spec.spec_opts = ['--options', '.specopts']
|
32
33
|
end
|
33
34
|
|
34
|
-
task :spec => :check_dependencies
|
35
35
|
task :default => :spec
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
YARD::Rake::YardocTask.new
|
41
|
-
rescue LoadError
|
42
|
-
task :yard do
|
43
|
-
abort "YARD is not available. In order to run yard, you must: gem install yard"
|
44
|
-
end
|
45
|
-
end
|
37
|
+
require 'yard'
|
38
|
+
YARD::Rake::YardocTask.new
|
data/lib/spidr/agent.rb
CHANGED
@@ -98,7 +98,7 @@ module Spidr
|
|
98
98
|
# @yieldparam [Agent] agent
|
99
99
|
# The newly created agent.
|
100
100
|
#
|
101
|
-
def initialize(options={}
|
101
|
+
def initialize(options={})
|
102
102
|
@host_header = options[:host_header]
|
103
103
|
@host_headers = {}
|
104
104
|
|
@@ -121,7 +121,7 @@ module Spidr
|
|
121
121
|
|
122
122
|
super(options)
|
123
123
|
|
124
|
-
|
124
|
+
yield self if block_given?
|
125
125
|
end
|
126
126
|
|
127
127
|
#
|
@@ -140,9 +140,9 @@ module Spidr
|
|
140
140
|
# @yieldparam [Agent] agent
|
141
141
|
# The newly created agent.
|
142
142
|
#
|
143
|
-
def self.start_at(url,options={}
|
143
|
+
def self.start_at(url,options={})
|
144
144
|
self.new(options) do |spider|
|
145
|
-
|
145
|
+
yield spider if block_given?
|
146
146
|
|
147
147
|
spider.start_at(url)
|
148
148
|
end
|
@@ -164,9 +164,9 @@ module Spidr
|
|
164
164
|
# @yieldparam [Agent] agent
|
165
165
|
# The newly created agent.
|
166
166
|
#
|
167
|
-
def self.host(name,options={}
|
167
|
+
def self.host(name,options={})
|
168
168
|
self.new(options.merge(:host => name)) do |spider|
|
169
|
-
|
169
|
+
yield spider if block_given?
|
170
170
|
|
171
171
|
spider.start_at("http://#{name}/")
|
172
172
|
end
|
@@ -188,11 +188,11 @@ module Spidr
|
|
188
188
|
# @yieldparam [Agent] agent
|
189
189
|
# The newly created agent.
|
190
190
|
#
|
191
|
-
def self.site(url,options={}
|
191
|
+
def self.site(url,options={})
|
192
192
|
url = URI(url.to_s)
|
193
193
|
|
194
194
|
return self.new(options.merge(:host => url.host)) do |spider|
|
195
|
-
|
195
|
+
yield spider if block_given?
|
196
196
|
|
197
197
|
spider.start_at(url)
|
198
198
|
end
|
@@ -457,11 +457,18 @@ module Spidr
|
|
457
457
|
link = url.to_s
|
458
458
|
|
459
459
|
begin
|
460
|
-
@every_url_blocks.each { |
|
460
|
+
@every_url_blocks.each { |url_block| url_block.call(url) }
|
461
461
|
|
462
|
-
@urls_like_blocks.each do |pattern,
|
463
|
-
|
464
|
-
|
462
|
+
@urls_like_blocks.each do |pattern,url_blocks|
|
463
|
+
match = case pattern
|
464
|
+
when Regexp
|
465
|
+
link =~ pattern
|
466
|
+
else
|
467
|
+
(pattern == link) || (pattern == url)
|
468
|
+
end
|
469
|
+
|
470
|
+
if match
|
471
|
+
url_blocks.each { |url_block| url_block.call(url) }
|
465
472
|
end
|
466
473
|
end
|
467
474
|
rescue Actions::Paused => action
|
@@ -494,7 +501,7 @@ module Spidr
|
|
494
501
|
# @return [Page, nil]
|
495
502
|
# The page for the response, or `nil` if the request failed.
|
496
503
|
#
|
497
|
-
def get_page(url
|
504
|
+
def get_page(url)
|
498
505
|
url = URI(url.to_s)
|
499
506
|
|
500
507
|
prepare_request(url) do |session,path,headers|
|
@@ -503,7 +510,7 @@ module Spidr
|
|
503
510
|
# save any new cookies
|
504
511
|
@cookies.from_page(new_page)
|
505
512
|
|
506
|
-
|
513
|
+
yield new_page if block_given?
|
507
514
|
return new_page
|
508
515
|
end
|
509
516
|
end
|
@@ -529,7 +536,7 @@ module Spidr
|
|
529
536
|
#
|
530
537
|
# @since 0.2.2
|
531
538
|
#
|
532
|
-
def post_page(url,post_data=''
|
539
|
+
def post_page(url,post_data='')
|
533
540
|
url = URI(url.to_s)
|
534
541
|
|
535
542
|
prepare_request(url) do |session,path,headers|
|
@@ -538,7 +545,7 @@ module Spidr
|
|
538
545
|
# save any new cookies
|
539
546
|
@cookies.from_page(new_page)
|
540
547
|
|
541
|
-
|
548
|
+
yield new_page if block_given?
|
542
549
|
return new_page
|
543
550
|
end
|
544
551
|
end
|
@@ -560,7 +567,7 @@ module Spidr
|
|
560
567
|
# The page that was visited. If `nil` is returned, either the request
|
561
568
|
# for the page failed, or the page was skipped.
|
562
569
|
#
|
563
|
-
def visit_page(url
|
570
|
+
def visit_page(url)
|
564
571
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
565
572
|
|
566
573
|
get_page(url) do |page|
|
@@ -569,7 +576,7 @@ module Spidr
|
|
569
576
|
begin
|
570
577
|
@every_page_blocks.each { |page_block| page_block.call(page) }
|
571
578
|
|
572
|
-
|
579
|
+
yield page if block_given?
|
573
580
|
rescue Actions::Paused => action
|
574
581
|
raise(action)
|
575
582
|
rescue Actions::SkipPage
|
@@ -668,7 +675,7 @@ module Spidr
|
|
668
675
|
begin
|
669
676
|
sleep(@delay) if @delay > 0
|
670
677
|
|
671
|
-
|
678
|
+
yield @sessions[url], path, headers
|
672
679
|
rescue SystemCallError,
|
673
680
|
Timeout::Error,
|
674
681
|
SocketError,
|
@@ -719,7 +726,7 @@ module Spidr
|
|
719
726
|
#
|
720
727
|
def failed(url)
|
721
728
|
@failures << url
|
722
|
-
@every_failed_url_blocks.each { |
|
729
|
+
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
|
723
730
|
return true
|
724
731
|
end
|
725
732
|
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -130,6 +130,22 @@ module Spidr
|
|
130
130
|
@dirty.delete(host)
|
131
131
|
end
|
132
132
|
|
133
|
+
hdomain = host.split('.')
|
134
|
+
|
135
|
+
if hdomain.length > 2
|
136
|
+
parent_cookies = for_host(hdomain[1..-1].join('.'))
|
137
|
+
|
138
|
+
unless (parent_cookies.nil? || parent_cookies.empty?)
|
139
|
+
@cookies[host] = if @cookies[host].nil?
|
140
|
+
# inherit the parent cookies
|
141
|
+
parent_cookies
|
142
|
+
else
|
143
|
+
# merge the parent cookies with any host-specific cookies
|
144
|
+
"#{parent_cookies}; #{@cookies[host]}"
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
133
149
|
return @cookies[host]
|
134
150
|
end
|
135
151
|
|
data/lib/spidr/events.rb
CHANGED
@@ -72,8 +72,8 @@ module Spidr
|
|
72
72
|
# @yieldparam [Hash] headers
|
73
73
|
# The headers from a response.
|
74
74
|
#
|
75
|
-
def all_headers
|
76
|
-
every_page { |page|
|
75
|
+
def all_headers
|
76
|
+
every_page { |page| yield page.headers }
|
77
77
|
end
|
78
78
|
|
79
79
|
#
|
@@ -99,9 +99,9 @@ module Spidr
|
|
99
99
|
# @yieldparam [Page] page
|
100
100
|
# A visited page.
|
101
101
|
#
|
102
|
-
def every_ok_page
|
102
|
+
def every_ok_page
|
103
103
|
every_page do |page|
|
104
|
-
|
104
|
+
yield page if (block_given? && page.ok?)
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
@@ -114,9 +114,9 @@ module Spidr
|
|
114
114
|
# @yieldparam [Page] page
|
115
115
|
# A visited page.
|
116
116
|
#
|
117
|
-
def every_redirect_page
|
117
|
+
def every_redirect_page
|
118
118
|
every_page do |page|
|
119
|
-
|
119
|
+
yield page if (block_given? && page.redirect?)
|
120
120
|
end
|
121
121
|
end
|
122
122
|
|
@@ -129,9 +129,9 @@ module Spidr
|
|
129
129
|
# @yieldparam [Page] page
|
130
130
|
# A visited page.
|
131
131
|
#
|
132
|
-
def every_timedout_page
|
132
|
+
def every_timedout_page
|
133
133
|
every_page do |page|
|
134
|
-
|
134
|
+
yield page if (block_given? && page.timedout?)
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
@@ -144,9 +144,9 @@ module Spidr
|
|
144
144
|
# @yieldparam [Page] page
|
145
145
|
# A visited page.
|
146
146
|
#
|
147
|
-
def every_bad_request_page
|
147
|
+
def every_bad_request_page
|
148
148
|
every_page do |page|
|
149
|
-
|
149
|
+
yield page if (block_given? && page.bad_request?)
|
150
150
|
end
|
151
151
|
end
|
152
152
|
|
@@ -159,9 +159,9 @@ module Spidr
|
|
159
159
|
# @yieldparam [Page] page
|
160
160
|
# A visited page.
|
161
161
|
#
|
162
|
-
def every_unauthorized_page
|
162
|
+
def every_unauthorized_page
|
163
163
|
every_page do |page|
|
164
|
-
|
164
|
+
yield page if (block_given? && page.unauthorized?)
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
@@ -174,9 +174,9 @@ module Spidr
|
|
174
174
|
# @yieldparam [Page] page
|
175
175
|
# A visited page.
|
176
176
|
#
|
177
|
-
def every_forbidden_page
|
177
|
+
def every_forbidden_page
|
178
178
|
every_page do |page|
|
179
|
-
|
179
|
+
yield page if (block_given? && page.forbidden?)
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
@@ -189,9 +189,9 @@ module Spidr
|
|
189
189
|
# @yieldparam [Page] page
|
190
190
|
# A visited page.
|
191
191
|
#
|
192
|
-
def every_missing_page
|
192
|
+
def every_missing_page
|
193
193
|
every_page do |page|
|
194
|
-
|
194
|
+
yield page if (block_given? && page.missing?)
|
195
195
|
end
|
196
196
|
end
|
197
197
|
|
@@ -205,9 +205,9 @@ module Spidr
|
|
205
205
|
# @yieldparam [Page] page
|
206
206
|
# A visited page.
|
207
207
|
#
|
208
|
-
def every_internal_server_error_page
|
208
|
+
def every_internal_server_error_page
|
209
209
|
every_page do |page|
|
210
|
-
|
210
|
+
yield page if (block_given? && page.had_internal_server_error?)
|
211
211
|
end
|
212
212
|
end
|
213
213
|
|
@@ -220,9 +220,9 @@ module Spidr
|
|
220
220
|
# @yieldparam [Page] page
|
221
221
|
# A visited page.
|
222
222
|
#
|
223
|
-
def every_txt_page
|
223
|
+
def every_txt_page
|
224
224
|
every_page do |page|
|
225
|
-
|
225
|
+
yield page if (block_given? && page.txt?)
|
226
226
|
end
|
227
227
|
end
|
228
228
|
|
@@ -235,9 +235,9 @@ module Spidr
|
|
235
235
|
# @yieldparam [Page] page
|
236
236
|
# A visited page.
|
237
237
|
#
|
238
|
-
def every_html_page
|
238
|
+
def every_html_page
|
239
239
|
every_page do |page|
|
240
|
-
|
240
|
+
yield page if (block_given? && page.html?)
|
241
241
|
end
|
242
242
|
end
|
243
243
|
|
@@ -250,9 +250,9 @@ module Spidr
|
|
250
250
|
# @yieldparam [Page] page
|
251
251
|
# A visited page.
|
252
252
|
#
|
253
|
-
def every_xml_page
|
253
|
+
def every_xml_page
|
254
254
|
every_page do |page|
|
255
|
-
|
255
|
+
yield page if (block_given? && page.xml?)
|
256
256
|
end
|
257
257
|
end
|
258
258
|
|
@@ -266,9 +266,9 @@ module Spidr
|
|
266
266
|
# @yieldparam [Page] page
|
267
267
|
# A visited page.
|
268
268
|
#
|
269
|
-
def every_xsl_page
|
269
|
+
def every_xsl_page
|
270
270
|
every_page do |page|
|
271
|
-
|
271
|
+
yield page if (block_given? && page.xsl?)
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
@@ -285,11 +285,11 @@ module Spidr
|
|
285
285
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
286
286
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
287
287
|
#
|
288
|
-
def every_doc
|
288
|
+
def every_doc
|
289
289
|
every_page do |page|
|
290
|
-
if
|
290
|
+
if block_given?
|
291
291
|
if (doc = page.doc)
|
292
|
-
|
292
|
+
yield doc
|
293
293
|
end
|
294
294
|
end
|
295
295
|
end
|
@@ -306,11 +306,11 @@ module Spidr
|
|
306
306
|
#
|
307
307
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
308
308
|
#
|
309
|
-
def every_html_doc
|
309
|
+
def every_html_doc
|
310
310
|
every_page do |page|
|
311
|
-
if (
|
311
|
+
if (block_given? && page.html?)
|
312
312
|
if (doc = page.doc)
|
313
|
-
|
313
|
+
yield doc
|
314
314
|
end
|
315
315
|
end
|
316
316
|
end
|
@@ -327,11 +327,11 @@ module Spidr
|
|
327
327
|
#
|
328
328
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
329
329
|
#
|
330
|
-
def every_xml_doc
|
330
|
+
def every_xml_doc
|
331
331
|
every_page do |page|
|
332
|
-
if (
|
332
|
+
if (block_given? && page.xml?)
|
333
333
|
if (doc = page.doc)
|
334
|
-
|
334
|
+
yield doc
|
335
335
|
end
|
336
336
|
end
|
337
337
|
end
|
@@ -349,11 +349,11 @@ module Spidr
|
|
349
349
|
#
|
350
350
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
351
351
|
#
|
352
|
-
def every_xsl_doc
|
352
|
+
def every_xsl_doc
|
353
353
|
every_page do |page|
|
354
|
-
if (
|
354
|
+
if (block_given? && page.xsl?)
|
355
355
|
if (doc = page.doc)
|
356
|
-
|
356
|
+
yield doc
|
357
357
|
end
|
358
358
|
end
|
359
359
|
end
|
@@ -370,11 +370,11 @@ module Spidr
|
|
370
370
|
#
|
371
371
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
372
372
|
#
|
373
|
-
def every_rss_doc
|
373
|
+
def every_rss_doc
|
374
374
|
every_page do |page|
|
375
|
-
if (
|
375
|
+
if (block_given? && page.rss?)
|
376
376
|
if (doc = page.doc)
|
377
|
-
|
377
|
+
yield doc
|
378
378
|
end
|
379
379
|
end
|
380
380
|
end
|
@@ -391,11 +391,11 @@ module Spidr
|
|
391
391
|
#
|
392
392
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
393
393
|
#
|
394
|
-
def every_atom_doc
|
394
|
+
def every_atom_doc
|
395
395
|
every_page do |page|
|
396
|
-
if (
|
396
|
+
if (block_given? && page.atom?)
|
397
397
|
if (doc = page.doc)
|
398
|
-
|
398
|
+
yield doc
|
399
399
|
end
|
400
400
|
end
|
401
401
|
end
|
@@ -410,9 +410,9 @@ module Spidr
|
|
410
410
|
# @yieldparam [Page] page
|
411
411
|
# A visited page.
|
412
412
|
#
|
413
|
-
def every_javascript_page
|
413
|
+
def every_javascript_page
|
414
414
|
every_page do |page|
|
415
|
-
|
415
|
+
yield page if (block_given? && page.javascript?)
|
416
416
|
end
|
417
417
|
end
|
418
418
|
|
@@ -425,9 +425,9 @@ module Spidr
|
|
425
425
|
# @yieldparam [Page] page
|
426
426
|
# A visited page.
|
427
427
|
#
|
428
|
-
def every_css_page
|
428
|
+
def every_css_page
|
429
429
|
every_page do |page|
|
430
|
-
|
430
|
+
yield page if (block_given? && page.css?)
|
431
431
|
end
|
432
432
|
end
|
433
433
|
|
@@ -440,9 +440,9 @@ module Spidr
|
|
440
440
|
# @yieldparam [Page] feed
|
441
441
|
# A visited page.
|
442
442
|
#
|
443
|
-
def every_rss_page
|
443
|
+
def every_rss_page
|
444
444
|
every_page do |page|
|
445
|
-
|
445
|
+
yield page if (block_given? && page.rss?)
|
446
446
|
end
|
447
447
|
end
|
448
448
|
|
@@ -455,9 +455,9 @@ module Spidr
|
|
455
455
|
# @yieldparam [Page] feed
|
456
456
|
# A visited page.
|
457
457
|
#
|
458
|
-
def every_atom_page
|
458
|
+
def every_atom_page
|
459
459
|
every_page do |page|
|
460
|
-
|
460
|
+
yield page if (block_given? && page.atom?)
|
461
461
|
end
|
462
462
|
end
|
463
463
|
|
@@ -470,9 +470,9 @@ module Spidr
|
|
470
470
|
# @yieldparam [Page] page
|
471
471
|
# A visited page.
|
472
472
|
#
|
473
|
-
def every_ms_word_page
|
473
|
+
def every_ms_word_page
|
474
474
|
every_page do |page|
|
475
|
-
|
475
|
+
yield page if (block_given? && page.ms_word?)
|
476
476
|
end
|
477
477
|
end
|
478
478
|
|
@@ -485,9 +485,9 @@ module Spidr
|
|
485
485
|
# @yieldparam [Page] page
|
486
486
|
# A visited page.
|
487
487
|
#
|
488
|
-
def every_pdf_page
|
488
|
+
def every_pdf_page
|
489
489
|
every_page do |page|
|
490
|
-
|
490
|
+
yield page if (block_given? && page.pdf?)
|
491
491
|
end
|
492
492
|
end
|
493
493
|
|
@@ -500,9 +500,9 @@ module Spidr
|
|
500
500
|
# @yieldparam [Page] page
|
501
501
|
# A visited page.
|
502
502
|
#
|
503
|
-
def every_zip_page
|
503
|
+
def every_zip_page
|
504
504
|
every_page do |page|
|
505
|
-
|
505
|
+
yield page if (block_given? && page.zip?)
|
506
506
|
end
|
507
507
|
end
|
508
508
|
|
data/lib/spidr/page.rb
CHANGED
@@ -62,7 +62,8 @@ module Spidr
|
|
62
62
|
|
63
63
|
#
|
64
64
|
# Determines if the response code is `300`, `301`, `302`, `303`
|
65
|
-
# or `307`.
|
65
|
+
# or `307`. Also checks for "soft" redirects added at the page
|
66
|
+
# level by a meta refresh tag.
|
66
67
|
#
|
67
68
|
# @return [Boolean]
|
68
69
|
# Specifies whether the response code is a HTTP Redirect code.
|
@@ -71,6 +72,8 @@ module Spidr
|
|
71
72
|
case code
|
72
73
|
when 300..303, 307
|
73
74
|
true
|
75
|
+
when 200
|
76
|
+
meta_redirect?
|
74
77
|
else
|
75
78
|
false
|
76
79
|
end
|
@@ -434,17 +437,7 @@ module Spidr
|
|
434
437
|
urls << url unless (url.nil? || url.empty?)
|
435
438
|
}
|
436
439
|
|
437
|
-
if self.is_redirect?
|
438
|
-
location = @headers['location']
|
439
|
-
|
440
|
-
if location.kind_of?(Array)
|
441
|
-
# handle multiple location URLs
|
442
|
-
location.each(&add_url)
|
443
|
-
else
|
444
|
-
# usually the location header contains a single String
|
445
|
-
add_url.call(location)
|
446
|
-
end
|
447
|
-
end
|
440
|
+
self.redirects_to.each(&add_url) if self.is_redirect?
|
448
441
|
|
449
442
|
if (html? && doc)
|
450
443
|
doc.search('a[@href]').each do |a|
|
@@ -471,6 +464,27 @@ module Spidr
|
|
471
464
|
return urls
|
472
465
|
end
|
473
466
|
|
467
|
+
#
|
468
|
+
# URL(s) that this document redirects to.
|
469
|
+
#
|
470
|
+
# @return [Array<String>]
|
471
|
+
# The links that this page redirects to (usually found in a
|
472
|
+
# location header or by way of a page-level meta redirect).
|
473
|
+
#
|
474
|
+
def redirects_to
|
475
|
+
location = @headers['location']
|
476
|
+
|
477
|
+
if location.nil?
|
478
|
+
# check page-level meta redirects if there isn't a location header
|
479
|
+
meta_redirect
|
480
|
+
elsif location.kind_of?(Array)
|
481
|
+
location
|
482
|
+
else
|
483
|
+
# usually the location header contains a single String
|
484
|
+
[location]
|
485
|
+
end
|
486
|
+
end
|
487
|
+
|
474
488
|
#
|
475
489
|
# Absolute URIs from within the page.
|
476
490
|
#
|
@@ -507,6 +521,43 @@ module Spidr
|
|
507
521
|
return url
|
508
522
|
end
|
509
523
|
|
524
|
+
#
|
525
|
+
# Determines if a page-level "soft" redirect is present. If yes,
|
526
|
+
# returns an array of those redirects (usually a single URL).
|
527
|
+
# Otherwise, returns an empty array.
|
528
|
+
#
|
529
|
+
# @return [Array<String>]
|
530
|
+
# An array of redirect URLs
|
531
|
+
#
|
532
|
+
def meta_redirect
|
533
|
+
redirects = []
|
534
|
+
|
535
|
+
if (html? && doc)
|
536
|
+
search('//meta[@http-equiv and @content]').each do |node|
|
537
|
+
if node.attr('http-equiv') =~ /refresh/i
|
538
|
+
content = node.attr('content')
|
539
|
+
|
540
|
+
if (redirect = content.match(/url=(\S+)$/))
|
541
|
+
redirects << redirect[1]
|
542
|
+
end
|
543
|
+
end
|
544
|
+
end
|
545
|
+
end
|
546
|
+
|
547
|
+
return redirects.uniq
|
548
|
+
end
|
549
|
+
|
550
|
+
#
|
551
|
+
# Returns a boolean indicating whether or not page-level meta
|
552
|
+
# redirects are present in this page.
|
553
|
+
#
|
554
|
+
# @return [Boolean]
|
555
|
+
# Specifies whether the page includes page-level redirects.
|
556
|
+
#
|
557
|
+
def meta_redirect?
|
558
|
+
!meta_redirect.empty?
|
559
|
+
end
|
560
|
+
|
510
561
|
protected
|
511
562
|
|
512
563
|
#
|
data/lib/spidr/version.rb
CHANGED
data/spec/cookie_jar_spec.rb
CHANGED
@@ -101,8 +101,21 @@ describe CookieJar do
|
|
101
101
|
it "should encode multiple cookie params" do
|
102
102
|
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
103
103
|
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
104
|
+
cookie = @cookie_jar.for_host('zerosum.org')
|
104
105
|
|
105
|
-
|
106
|
+
cookie.should include('admin=ofcourseiam')
|
107
|
+
cookie.should include('; ')
|
108
|
+
cookie.should include('other=1')
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should include cookies for the parent domain" do
|
112
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
113
|
+
@cookie_jar['sub.zerosum.org'] = {'other' => '1'}
|
114
|
+
cookie = @cookie_jar.for_host('sub.zerosum.org')
|
115
|
+
|
116
|
+
cookie.should include('admin=ofcourseiam')
|
117
|
+
cookie.should include('; ')
|
118
|
+
cookie.should include('other=1')
|
106
119
|
end
|
107
120
|
end
|
108
121
|
end
|
data/spec/helpers/wsoc.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -79,6 +79,21 @@ describe Page do
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
82
|
+
describe "redirects" do
|
83
|
+
before(:all) do
|
84
|
+
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
85
|
+
@page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should provide access to page-level redirects" do
|
89
|
+
@page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should include meta refresh redirects in the list of links" do
|
93
|
+
@page.links.should include('http://spidr.rubyforge.org/redirected')
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
82
97
|
describe "cookies" do
|
83
98
|
before(:all) do
|
84
99
|
@page = get_page('http://twitter.com/login')
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
|
3
|
-
|
2
|
+
require 'bundler'
|
3
|
+
|
4
|
+
begin
|
5
|
+
Bundler.setup(:runtime, :test)
|
6
|
+
rescue Bundler::BundlerError => e
|
7
|
+
STDERR.puts e.message
|
8
|
+
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
+
exit e.status_code
|
10
|
+
end
|
4
11
|
|
12
|
+
require 'spec'
|
5
13
|
require 'spidr/version'
|
6
14
|
|
7
15
|
include Spidr
|
data/spidr.gemspec
CHANGED
@@ -5,112 +5,112 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{spidr}
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Postmodern"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-02}
|
13
13
|
s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
14
14
|
s.email = %q{postmodern.mod3@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"ChangeLog.md",
|
17
|
-
|
18
|
-
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
21
|
".gitignore",
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
22
|
+
".specopts",
|
23
|
+
".yardopts",
|
24
|
+
"ChangeLog.md",
|
25
|
+
"Gemfile",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.md",
|
28
|
+
"Rakefile",
|
29
|
+
"lib/spidr.rb",
|
30
|
+
"lib/spidr/actions.rb",
|
31
|
+
"lib/spidr/actions/actions.rb",
|
32
|
+
"lib/spidr/actions/exceptions.rb",
|
33
|
+
"lib/spidr/actions/exceptions/action.rb",
|
34
|
+
"lib/spidr/actions/exceptions/paused.rb",
|
35
|
+
"lib/spidr/actions/exceptions/skip_link.rb",
|
36
|
+
"lib/spidr/actions/exceptions/skip_page.rb",
|
37
|
+
"lib/spidr/agent.rb",
|
38
|
+
"lib/spidr/auth_credential.rb",
|
39
|
+
"lib/spidr/auth_store.rb",
|
40
|
+
"lib/spidr/cookie_jar.rb",
|
41
|
+
"lib/spidr/events.rb",
|
42
|
+
"lib/spidr/extensions.rb",
|
43
|
+
"lib/spidr/extensions/uri.rb",
|
44
|
+
"lib/spidr/filters.rb",
|
45
|
+
"lib/spidr/page.rb",
|
46
|
+
"lib/spidr/rules.rb",
|
47
|
+
"lib/spidr/sanitizers.rb",
|
48
|
+
"lib/spidr/session_cache.rb",
|
49
|
+
"lib/spidr/spidr.rb",
|
50
|
+
"lib/spidr/version.rb",
|
51
|
+
"spec/actions_spec.rb",
|
52
|
+
"spec/agent_spec.rb",
|
53
|
+
"spec/auth_store_spec.rb",
|
54
|
+
"spec/cookie_jar_spec.rb",
|
55
|
+
"spec/extensions/uri_spec.rb",
|
56
|
+
"spec/filters_spec.rb",
|
57
|
+
"spec/helpers/history.rb",
|
58
|
+
"spec/helpers/page.rb",
|
59
|
+
"spec/helpers/wsoc.rb",
|
60
|
+
"spec/page_examples.rb",
|
61
|
+
"spec/page_spec.rb",
|
62
|
+
"spec/rules_spec.rb",
|
63
|
+
"spec/sanitizers_spec.rb",
|
64
|
+
"spec/session_cache.rb",
|
65
|
+
"spec/spec_helper.rb",
|
66
|
+
"spec/spidr_spec.rb",
|
67
|
+
"spidr.gemspec"
|
67
68
|
]
|
68
69
|
s.has_rdoc = %q{yard}
|
69
70
|
s.homepage = %q{http://github.com/postmodern/spidr}
|
70
71
|
s.licenses = ["MIT"]
|
71
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
72
72
|
s.require_paths = ["lib"]
|
73
|
-
s.rubygems_version = %q{1.3.
|
73
|
+
s.rubygems_version = %q{1.3.7}
|
74
74
|
s.summary = %q{A versatile Ruby web spidering library}
|
75
75
|
s.test_files = [
|
76
|
+
"spec/actions_spec.rb",
|
77
|
+
"spec/agent_spec.rb",
|
76
78
|
"spec/auth_store_spec.rb",
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
"spec/page_examples.rb",
|
91
|
-
"spec/actions_spec.rb"
|
79
|
+
"spec/cookie_jar_spec.rb",
|
80
|
+
"spec/extensions/uri_spec.rb",
|
81
|
+
"spec/filters_spec.rb",
|
82
|
+
"spec/helpers/history.rb",
|
83
|
+
"spec/helpers/page.rb",
|
84
|
+
"spec/helpers/wsoc.rb",
|
85
|
+
"spec/page_examples.rb",
|
86
|
+
"spec/page_spec.rb",
|
87
|
+
"spec/rules_spec.rb",
|
88
|
+
"spec/sanitizers_spec.rb",
|
89
|
+
"spec/session_cache.rb",
|
90
|
+
"spec/spec_helper.rb",
|
91
|
+
"spec/spidr_spec.rb"
|
92
92
|
]
|
93
93
|
|
94
94
|
if s.respond_to? :specification_version then
|
95
95
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
96
96
|
s.specification_version = 3
|
97
97
|
|
98
|
-
if Gem::Version.new(Gem::
|
98
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
99
99
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.0"])
|
100
|
+
s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
|
101
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
|
100
102
|
s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
|
101
|
-
s.add_development_dependency(%q<yard>, ["~> 0.5.3"])
|
102
|
-
s.add_development_dependency(%q<wsoc>, ["~> 0.1.1"])
|
103
103
|
else
|
104
104
|
s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
|
105
|
+
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
106
|
+
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
105
107
|
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
106
|
-
s.add_dependency(%q<yard>, ["~> 0.5.3"])
|
107
|
-
s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
|
108
108
|
end
|
109
109
|
else
|
110
110
|
s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
|
111
|
+
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
112
|
+
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
111
113
|
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
112
|
-
s.add_dependency(%q<yard>, ["~> 0.5.3"])
|
113
|
-
s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
8
|
+
- 5
|
9
|
+
version: 0.2.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Postmodern
|
@@ -14,13 +14,13 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-07-02 00:00:00 -07:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
22
|
-
prerelease: false
|
23
22
|
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
24
|
requirements:
|
25
25
|
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
@@ -30,48 +30,52 @@ dependencies:
|
|
30
30
|
- 0
|
31
31
|
version: 1.3.0
|
32
32
|
type: :runtime
|
33
|
+
prerelease: false
|
33
34
|
version_requirements: *id001
|
34
35
|
- !ruby/object:Gem::Dependency
|
35
|
-
name:
|
36
|
-
prerelease: false
|
36
|
+
name: rake
|
37
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
38
39
|
requirements:
|
39
40
|
- - ~>
|
40
41
|
- !ruby/object:Gem::Version
|
41
42
|
segments:
|
42
|
-
- 1
|
43
|
-
- 3
|
44
43
|
- 0
|
45
|
-
|
44
|
+
- 8
|
45
|
+
- 7
|
46
|
+
version: 0.8.7
|
46
47
|
type: :development
|
48
|
+
prerelease: false
|
47
49
|
version_requirements: *id002
|
48
50
|
- !ruby/object:Gem::Dependency
|
49
|
-
name:
|
50
|
-
prerelease: false
|
51
|
+
name: jeweler
|
51
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
52
54
|
requirements:
|
53
55
|
- - ~>
|
54
56
|
- !ruby/object:Gem::Version
|
55
57
|
segments:
|
58
|
+
- 1
|
59
|
+
- 4
|
56
60
|
- 0
|
57
|
-
|
58
|
-
- 3
|
59
|
-
version: 0.5.3
|
61
|
+
version: 1.4.0
|
60
62
|
type: :development
|
63
|
+
prerelease: false
|
61
64
|
version_requirements: *id003
|
62
65
|
- !ruby/object:Gem::Dependency
|
63
|
-
name:
|
64
|
-
prerelease: false
|
66
|
+
name: rspec
|
65
67
|
requirement: &id004 !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
66
69
|
requirements:
|
67
70
|
- - ~>
|
68
71
|
- !ruby/object:Gem::Version
|
69
72
|
segments:
|
70
|
-
- 0
|
71
73
|
- 1
|
72
|
-
-
|
73
|
-
|
74
|
+
- 3
|
75
|
+
- 0
|
76
|
+
version: 1.3.0
|
74
77
|
type: :development
|
78
|
+
prerelease: false
|
75
79
|
version_requirements: *id004
|
76
80
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
77
81
|
email: postmodern.mod3@gmail.com
|
@@ -88,6 +92,7 @@ files:
|
|
88
92
|
- .specopts
|
89
93
|
- .yardopts
|
90
94
|
- ChangeLog.md
|
95
|
+
- Gemfile
|
91
96
|
- LICENSE.txt
|
92
97
|
- README.md
|
93
98
|
- Rakefile
|
@@ -135,18 +140,21 @@ homepage: http://github.com/postmodern/spidr
|
|
135
140
|
licenses:
|
136
141
|
- MIT
|
137
142
|
post_install_message:
|
138
|
-
rdoc_options:
|
139
|
-
|
143
|
+
rdoc_options: []
|
144
|
+
|
140
145
|
require_paths:
|
141
146
|
- lib
|
142
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
148
|
+
none: false
|
143
149
|
requirements:
|
144
150
|
- - ">="
|
145
151
|
- !ruby/object:Gem::Version
|
152
|
+
hash: 740918287
|
146
153
|
segments:
|
147
154
|
- 0
|
148
155
|
version: "0"
|
149
156
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
157
|
+
none: false
|
150
158
|
requirements:
|
151
159
|
- - ">="
|
152
160
|
- !ruby/object:Gem::Version
|
@@ -156,24 +164,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
164
|
requirements: []
|
157
165
|
|
158
166
|
rubyforge_project:
|
159
|
-
rubygems_version: 1.3.
|
167
|
+
rubygems_version: 1.3.7
|
160
168
|
signing_key:
|
161
169
|
specification_version: 3
|
162
170
|
summary: A versatile Ruby web spidering library
|
163
171
|
test_files:
|
164
|
-
- spec/
|
165
|
-
- spec/rules_spec.rb
|
166
|
-
- spec/session_cache.rb
|
167
|
-
- spec/spec_helper.rb
|
168
|
-
- spec/sanitizers_spec.rb
|
169
|
-
- spec/filters_spec.rb
|
170
|
-
- spec/page_spec.rb
|
171
|
-
- spec/spidr_spec.rb
|
172
|
+
- spec/actions_spec.rb
|
172
173
|
- spec/agent_spec.rb
|
174
|
+
- spec/auth_store_spec.rb
|
173
175
|
- spec/cookie_jar_spec.rb
|
174
176
|
- spec/extensions/uri_spec.rb
|
177
|
+
- spec/filters_spec.rb
|
175
178
|
- spec/helpers/history.rb
|
176
179
|
- spec/helpers/page.rb
|
177
180
|
- spec/helpers/wsoc.rb
|
178
181
|
- spec/page_examples.rb
|
179
|
-
- spec/
|
182
|
+
- spec/page_spec.rb
|
183
|
+
- spec/rules_spec.rb
|
184
|
+
- spec/sanitizers_spec.rb
|
185
|
+
- spec/session_cache.rb
|
186
|
+
- spec/spec_helper.rb
|
187
|
+
- spec/spidr_spec.rb
|