spidr 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/ChangeLog.md +10 -0
- data/Gemfile +27 -0
- data/README.md +1 -0
- data/Rakefile +23 -30
- data/lib/spidr/agent.rb +27 -20
- data/lib/spidr/cookie_jar.rb +16 -0
- data/lib/spidr/events.rb +58 -58
- data/lib/spidr/page.rb +63 -12
- data/lib/spidr/version.rb +1 -1
- data/spec/cookie_jar_spec.rb +14 -1
- data/spec/helpers/wsoc.rb +1 -1
- data/spec/page_spec.rb +15 -0
- data/spec/spec_helper.rb +10 -2
- data/spidr.gemspec +73 -73
- metadata +39 -31
data/.gitignore
CHANGED
data/ChangeLog.md
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
### 0.2.5 / 2010-07-02
|
2
|
+
|
3
|
+
* Added {Spidr::Page#meta_redirect}.
|
4
|
+
* Added {Spidr::Page#meta_redirect?}.
|
5
|
+
* Manage development dependencies with Bundler.
|
6
|
+
* Support following "old-school" meta-refresh redirects (thanks zapnap).
|
7
|
+
* Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
|
8
|
+
* Fixed a constant lookup issue in {Spidr::Agent}.
|
9
|
+
* Use `yield` instead of `block.call` when necessary.
|
10
|
+
|
1
11
|
### 0.2.4 / 2010-05-05
|
2
12
|
|
3
13
|
* Added {Spidr::Filters#visit_urls}.
|
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
group(:runtime) do
|
4
|
+
gem 'nokogiri', '>= 1.3.0'
|
5
|
+
end
|
6
|
+
|
7
|
+
group(:development) do
|
8
|
+
gem 'rake', '~> 0.8.7'
|
9
|
+
gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
|
10
|
+
end
|
11
|
+
|
12
|
+
group(:doc) do
|
13
|
+
case RUBY_PLATFORM
|
14
|
+
when 'java'
|
15
|
+
gem 'maruku', '~> 0.6.0'
|
16
|
+
else
|
17
|
+
gem 'rdiscount', '~> 1.6.3'
|
18
|
+
end
|
19
|
+
|
20
|
+
gem 'yard', '~> 0.5.3'
|
21
|
+
end
|
22
|
+
|
23
|
+
group(:test) do
|
24
|
+
gem 'wsoc', '~> 0.1.3'
|
25
|
+
end
|
26
|
+
|
27
|
+
gem 'rspec', '~> 1.3.0', :group => [:development, :test]
|
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
|
4
|
+
begin
|
5
|
+
Bundler.setup(:development, :doc)
|
6
|
+
rescue Bundler::BundlerError => e
|
7
|
+
STDERR.puts e.message
|
8
|
+
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
+
exit e.status_code
|
10
|
+
end
|
11
|
+
|
2
12
|
require 'rake'
|
13
|
+
require 'jeweler'
|
3
14
|
require './lib/spidr/version.rb'
|
4
15
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
gem.authors = ['Postmodern']
|
16
|
-
gem.add_dependency 'nokogiri', '>= 1.3.0'
|
17
|
-
gem.add_development_dependency 'rspec', '~> 1.3.0'
|
18
|
-
gem.add_development_dependency 'yard', '~> 0.5.3'
|
19
|
-
gem.add_development_dependency 'wsoc', '~> 0.1.1'
|
20
|
-
gem.has_rdoc = 'yard'
|
21
|
-
end
|
22
|
-
Jeweler::GemcutterTasks.new
|
23
|
-
rescue LoadError
|
24
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
16
|
+
Jeweler::Tasks.new do |gem|
|
17
|
+
gem.name = 'spidr'
|
18
|
+
gem.version = Spidr::VERSION
|
19
|
+
gem.license = 'MIT'
|
20
|
+
gem.summary = %Q{A versatile Ruby web spidering library}
|
21
|
+
gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
22
|
+
gem.email = 'postmodern.mod3@gmail.com'
|
23
|
+
gem.homepage = 'http://github.com/postmodern/spidr'
|
24
|
+
gem.authors = ['Postmodern']
|
25
|
+
gem.has_rdoc = 'yard'
|
25
26
|
end
|
26
27
|
|
27
28
|
require 'spec/rake/spectask'
|
@@ -31,15 +32,7 @@ Spec::Rake::SpecTask.new(:spec) do |spec|
|
|
31
32
|
spec.spec_opts = ['--options', '.specopts']
|
32
33
|
end
|
33
34
|
|
34
|
-
task :spec => :check_dependencies
|
35
35
|
task :default => :spec
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
YARD::Rake::YardocTask.new
|
41
|
-
rescue LoadError
|
42
|
-
task :yard do
|
43
|
-
abort "YARD is not available. In order to run yard, you must: gem install yard"
|
44
|
-
end
|
45
|
-
end
|
37
|
+
require 'yard'
|
38
|
+
YARD::Rake::YardocTask.new
|
data/lib/spidr/agent.rb
CHANGED
@@ -98,7 +98,7 @@ module Spidr
|
|
98
98
|
# @yieldparam [Agent] agent
|
99
99
|
# The newly created agent.
|
100
100
|
#
|
101
|
-
def initialize(options={}
|
101
|
+
def initialize(options={})
|
102
102
|
@host_header = options[:host_header]
|
103
103
|
@host_headers = {}
|
104
104
|
|
@@ -121,7 +121,7 @@ module Spidr
|
|
121
121
|
|
122
122
|
super(options)
|
123
123
|
|
124
|
-
|
124
|
+
yield self if block_given?
|
125
125
|
end
|
126
126
|
|
127
127
|
#
|
@@ -140,9 +140,9 @@ module Spidr
|
|
140
140
|
# @yieldparam [Agent] agent
|
141
141
|
# The newly created agent.
|
142
142
|
#
|
143
|
-
def self.start_at(url,options={}
|
143
|
+
def self.start_at(url,options={})
|
144
144
|
self.new(options) do |spider|
|
145
|
-
|
145
|
+
yield spider if block_given?
|
146
146
|
|
147
147
|
spider.start_at(url)
|
148
148
|
end
|
@@ -164,9 +164,9 @@ module Spidr
|
|
164
164
|
# @yieldparam [Agent] agent
|
165
165
|
# The newly created agent.
|
166
166
|
#
|
167
|
-
def self.host(name,options={}
|
167
|
+
def self.host(name,options={})
|
168
168
|
self.new(options.merge(:host => name)) do |spider|
|
169
|
-
|
169
|
+
yield spider if block_given?
|
170
170
|
|
171
171
|
spider.start_at("http://#{name}/")
|
172
172
|
end
|
@@ -188,11 +188,11 @@ module Spidr
|
|
188
188
|
# @yieldparam [Agent] agent
|
189
189
|
# The newly created agent.
|
190
190
|
#
|
191
|
-
def self.site(url,options={}
|
191
|
+
def self.site(url,options={})
|
192
192
|
url = URI(url.to_s)
|
193
193
|
|
194
194
|
return self.new(options.merge(:host => url.host)) do |spider|
|
195
|
-
|
195
|
+
yield spider if block_given?
|
196
196
|
|
197
197
|
spider.start_at(url)
|
198
198
|
end
|
@@ -457,11 +457,18 @@ module Spidr
|
|
457
457
|
link = url.to_s
|
458
458
|
|
459
459
|
begin
|
460
|
-
@every_url_blocks.each { |
|
460
|
+
@every_url_blocks.each { |url_block| url_block.call(url) }
|
461
461
|
|
462
|
-
@urls_like_blocks.each do |pattern,
|
463
|
-
|
464
|
-
|
462
|
+
@urls_like_blocks.each do |pattern,url_blocks|
|
463
|
+
match = case pattern
|
464
|
+
when Regexp
|
465
|
+
link =~ pattern
|
466
|
+
else
|
467
|
+
(pattern == link) || (pattern == url)
|
468
|
+
end
|
469
|
+
|
470
|
+
if match
|
471
|
+
url_blocks.each { |url_block| url_block.call(url) }
|
465
472
|
end
|
466
473
|
end
|
467
474
|
rescue Actions::Paused => action
|
@@ -494,7 +501,7 @@ module Spidr
|
|
494
501
|
# @return [Page, nil]
|
495
502
|
# The page for the response, or `nil` if the request failed.
|
496
503
|
#
|
497
|
-
def get_page(url
|
504
|
+
def get_page(url)
|
498
505
|
url = URI(url.to_s)
|
499
506
|
|
500
507
|
prepare_request(url) do |session,path,headers|
|
@@ -503,7 +510,7 @@ module Spidr
|
|
503
510
|
# save any new cookies
|
504
511
|
@cookies.from_page(new_page)
|
505
512
|
|
506
|
-
|
513
|
+
yield new_page if block_given?
|
507
514
|
return new_page
|
508
515
|
end
|
509
516
|
end
|
@@ -529,7 +536,7 @@ module Spidr
|
|
529
536
|
#
|
530
537
|
# @since 0.2.2
|
531
538
|
#
|
532
|
-
def post_page(url,post_data=''
|
539
|
+
def post_page(url,post_data='')
|
533
540
|
url = URI(url.to_s)
|
534
541
|
|
535
542
|
prepare_request(url) do |session,path,headers|
|
@@ -538,7 +545,7 @@ module Spidr
|
|
538
545
|
# save any new cookies
|
539
546
|
@cookies.from_page(new_page)
|
540
547
|
|
541
|
-
|
548
|
+
yield new_page if block_given?
|
542
549
|
return new_page
|
543
550
|
end
|
544
551
|
end
|
@@ -560,7 +567,7 @@ module Spidr
|
|
560
567
|
# The page that was visited. If `nil` is returned, either the request
|
561
568
|
# for the page failed, or the page was skipped.
|
562
569
|
#
|
563
|
-
def visit_page(url
|
570
|
+
def visit_page(url)
|
564
571
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
565
572
|
|
566
573
|
get_page(url) do |page|
|
@@ -569,7 +576,7 @@ module Spidr
|
|
569
576
|
begin
|
570
577
|
@every_page_blocks.each { |page_block| page_block.call(page) }
|
571
578
|
|
572
|
-
|
579
|
+
yield page if block_given?
|
573
580
|
rescue Actions::Paused => action
|
574
581
|
raise(action)
|
575
582
|
rescue Actions::SkipPage
|
@@ -668,7 +675,7 @@ module Spidr
|
|
668
675
|
begin
|
669
676
|
sleep(@delay) if @delay > 0
|
670
677
|
|
671
|
-
|
678
|
+
yield @sessions[url], path, headers
|
672
679
|
rescue SystemCallError,
|
673
680
|
Timeout::Error,
|
674
681
|
SocketError,
|
@@ -719,7 +726,7 @@ module Spidr
|
|
719
726
|
#
|
720
727
|
def failed(url)
|
721
728
|
@failures << url
|
722
|
-
@every_failed_url_blocks.each { |
|
729
|
+
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
|
723
730
|
return true
|
724
731
|
end
|
725
732
|
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -130,6 +130,22 @@ module Spidr
|
|
130
130
|
@dirty.delete(host)
|
131
131
|
end
|
132
132
|
|
133
|
+
hdomain = host.split('.')
|
134
|
+
|
135
|
+
if hdomain.length > 2
|
136
|
+
parent_cookies = for_host(hdomain[1..-1].join('.'))
|
137
|
+
|
138
|
+
unless (parent_cookies.nil? || parent_cookies.empty?)
|
139
|
+
@cookies[host] = if @cookies[host].nil?
|
140
|
+
# inherit the parent cookies
|
141
|
+
parent_cookies
|
142
|
+
else
|
143
|
+
# merge the parent cookies with any host-specific cookies
|
144
|
+
"#{parent_cookies}; #{@cookies[host]}"
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
133
149
|
return @cookies[host]
|
134
150
|
end
|
135
151
|
|
data/lib/spidr/events.rb
CHANGED
@@ -72,8 +72,8 @@ module Spidr
|
|
72
72
|
# @yieldparam [Hash] headers
|
73
73
|
# The headers from a response.
|
74
74
|
#
|
75
|
-
def all_headers
|
76
|
-
every_page { |page|
|
75
|
+
def all_headers
|
76
|
+
every_page { |page| yield page.headers }
|
77
77
|
end
|
78
78
|
|
79
79
|
#
|
@@ -99,9 +99,9 @@ module Spidr
|
|
99
99
|
# @yieldparam [Page] page
|
100
100
|
# A visited page.
|
101
101
|
#
|
102
|
-
def every_ok_page
|
102
|
+
def every_ok_page
|
103
103
|
every_page do |page|
|
104
|
-
|
104
|
+
yield page if (block_given? && page.ok?)
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
@@ -114,9 +114,9 @@ module Spidr
|
|
114
114
|
# @yieldparam [Page] page
|
115
115
|
# A visited page.
|
116
116
|
#
|
117
|
-
def every_redirect_page
|
117
|
+
def every_redirect_page
|
118
118
|
every_page do |page|
|
119
|
-
|
119
|
+
yield page if (block_given? && page.redirect?)
|
120
120
|
end
|
121
121
|
end
|
122
122
|
|
@@ -129,9 +129,9 @@ module Spidr
|
|
129
129
|
# @yieldparam [Page] page
|
130
130
|
# A visited page.
|
131
131
|
#
|
132
|
-
def every_timedout_page
|
132
|
+
def every_timedout_page
|
133
133
|
every_page do |page|
|
134
|
-
|
134
|
+
yield page if (block_given? && page.timedout?)
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
@@ -144,9 +144,9 @@ module Spidr
|
|
144
144
|
# @yieldparam [Page] page
|
145
145
|
# A visited page.
|
146
146
|
#
|
147
|
-
def every_bad_request_page
|
147
|
+
def every_bad_request_page
|
148
148
|
every_page do |page|
|
149
|
-
|
149
|
+
yield page if (block_given? && page.bad_request?)
|
150
150
|
end
|
151
151
|
end
|
152
152
|
|
@@ -159,9 +159,9 @@ module Spidr
|
|
159
159
|
# @yieldparam [Page] page
|
160
160
|
# A visited page.
|
161
161
|
#
|
162
|
-
def every_unauthorized_page
|
162
|
+
def every_unauthorized_page
|
163
163
|
every_page do |page|
|
164
|
-
|
164
|
+
yield page if (block_given? && page.unauthorized?)
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
@@ -174,9 +174,9 @@ module Spidr
|
|
174
174
|
# @yieldparam [Page] page
|
175
175
|
# A visited page.
|
176
176
|
#
|
177
|
-
def every_forbidden_page
|
177
|
+
def every_forbidden_page
|
178
178
|
every_page do |page|
|
179
|
-
|
179
|
+
yield page if (block_given? && page.forbidden?)
|
180
180
|
end
|
181
181
|
end
|
182
182
|
|
@@ -189,9 +189,9 @@ module Spidr
|
|
189
189
|
# @yieldparam [Page] page
|
190
190
|
# A visited page.
|
191
191
|
#
|
192
|
-
def every_missing_page
|
192
|
+
def every_missing_page
|
193
193
|
every_page do |page|
|
194
|
-
|
194
|
+
yield page if (block_given? && page.missing?)
|
195
195
|
end
|
196
196
|
end
|
197
197
|
|
@@ -205,9 +205,9 @@ module Spidr
|
|
205
205
|
# @yieldparam [Page] page
|
206
206
|
# A visited page.
|
207
207
|
#
|
208
|
-
def every_internal_server_error_page
|
208
|
+
def every_internal_server_error_page
|
209
209
|
every_page do |page|
|
210
|
-
|
210
|
+
yield page if (block_given? && page.had_internal_server_error?)
|
211
211
|
end
|
212
212
|
end
|
213
213
|
|
@@ -220,9 +220,9 @@ module Spidr
|
|
220
220
|
# @yieldparam [Page] page
|
221
221
|
# A visited page.
|
222
222
|
#
|
223
|
-
def every_txt_page
|
223
|
+
def every_txt_page
|
224
224
|
every_page do |page|
|
225
|
-
|
225
|
+
yield page if (block_given? && page.txt?)
|
226
226
|
end
|
227
227
|
end
|
228
228
|
|
@@ -235,9 +235,9 @@ module Spidr
|
|
235
235
|
# @yieldparam [Page] page
|
236
236
|
# A visited page.
|
237
237
|
#
|
238
|
-
def every_html_page
|
238
|
+
def every_html_page
|
239
239
|
every_page do |page|
|
240
|
-
|
240
|
+
yield page if (block_given? && page.html?)
|
241
241
|
end
|
242
242
|
end
|
243
243
|
|
@@ -250,9 +250,9 @@ module Spidr
|
|
250
250
|
# @yieldparam [Page] page
|
251
251
|
# A visited page.
|
252
252
|
#
|
253
|
-
def every_xml_page
|
253
|
+
def every_xml_page
|
254
254
|
every_page do |page|
|
255
|
-
|
255
|
+
yield page if (block_given? && page.xml?)
|
256
256
|
end
|
257
257
|
end
|
258
258
|
|
@@ -266,9 +266,9 @@ module Spidr
|
|
266
266
|
# @yieldparam [Page] page
|
267
267
|
# A visited page.
|
268
268
|
#
|
269
|
-
def every_xsl_page
|
269
|
+
def every_xsl_page
|
270
270
|
every_page do |page|
|
271
|
-
|
271
|
+
yield page if (block_given? && page.xsl?)
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
@@ -285,11 +285,11 @@ module Spidr
|
|
285
285
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
286
286
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
287
287
|
#
|
288
|
-
def every_doc
|
288
|
+
def every_doc
|
289
289
|
every_page do |page|
|
290
|
-
if
|
290
|
+
if block_given?
|
291
291
|
if (doc = page.doc)
|
292
|
-
|
292
|
+
yield doc
|
293
293
|
end
|
294
294
|
end
|
295
295
|
end
|
@@ -306,11 +306,11 @@ module Spidr
|
|
306
306
|
#
|
307
307
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
308
308
|
#
|
309
|
-
def every_html_doc
|
309
|
+
def every_html_doc
|
310
310
|
every_page do |page|
|
311
|
-
if (
|
311
|
+
if (block_given? && page.html?)
|
312
312
|
if (doc = page.doc)
|
313
|
-
|
313
|
+
yield doc
|
314
314
|
end
|
315
315
|
end
|
316
316
|
end
|
@@ -327,11 +327,11 @@ module Spidr
|
|
327
327
|
#
|
328
328
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
329
329
|
#
|
330
|
-
def every_xml_doc
|
330
|
+
def every_xml_doc
|
331
331
|
every_page do |page|
|
332
|
-
if (
|
332
|
+
if (block_given? && page.xml?)
|
333
333
|
if (doc = page.doc)
|
334
|
-
|
334
|
+
yield doc
|
335
335
|
end
|
336
336
|
end
|
337
337
|
end
|
@@ -349,11 +349,11 @@ module Spidr
|
|
349
349
|
#
|
350
350
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
351
351
|
#
|
352
|
-
def every_xsl_doc
|
352
|
+
def every_xsl_doc
|
353
353
|
every_page do |page|
|
354
|
-
if (
|
354
|
+
if (block_given? && page.xsl?)
|
355
355
|
if (doc = page.doc)
|
356
|
-
|
356
|
+
yield doc
|
357
357
|
end
|
358
358
|
end
|
359
359
|
end
|
@@ -370,11 +370,11 @@ module Spidr
|
|
370
370
|
#
|
371
371
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
372
372
|
#
|
373
|
-
def every_rss_doc
|
373
|
+
def every_rss_doc
|
374
374
|
every_page do |page|
|
375
|
-
if (
|
375
|
+
if (block_given? && page.rss?)
|
376
376
|
if (doc = page.doc)
|
377
|
-
|
377
|
+
yield doc
|
378
378
|
end
|
379
379
|
end
|
380
380
|
end
|
@@ -391,11 +391,11 @@ module Spidr
|
|
391
391
|
#
|
392
392
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
393
393
|
#
|
394
|
-
def every_atom_doc
|
394
|
+
def every_atom_doc
|
395
395
|
every_page do |page|
|
396
|
-
if (
|
396
|
+
if (block_given? && page.atom?)
|
397
397
|
if (doc = page.doc)
|
398
|
-
|
398
|
+
yield doc
|
399
399
|
end
|
400
400
|
end
|
401
401
|
end
|
@@ -410,9 +410,9 @@ module Spidr
|
|
410
410
|
# @yieldparam [Page] page
|
411
411
|
# A visited page.
|
412
412
|
#
|
413
|
-
def every_javascript_page
|
413
|
+
def every_javascript_page
|
414
414
|
every_page do |page|
|
415
|
-
|
415
|
+
yield page if (block_given? && page.javascript?)
|
416
416
|
end
|
417
417
|
end
|
418
418
|
|
@@ -425,9 +425,9 @@ module Spidr
|
|
425
425
|
# @yieldparam [Page] page
|
426
426
|
# A visited page.
|
427
427
|
#
|
428
|
-
def every_css_page
|
428
|
+
def every_css_page
|
429
429
|
every_page do |page|
|
430
|
-
|
430
|
+
yield page if (block_given? && page.css?)
|
431
431
|
end
|
432
432
|
end
|
433
433
|
|
@@ -440,9 +440,9 @@ module Spidr
|
|
440
440
|
# @yieldparam [Page] feed
|
441
441
|
# A visited page.
|
442
442
|
#
|
443
|
-
def every_rss_page
|
443
|
+
def every_rss_page
|
444
444
|
every_page do |page|
|
445
|
-
|
445
|
+
yield page if (block_given? && page.rss?)
|
446
446
|
end
|
447
447
|
end
|
448
448
|
|
@@ -455,9 +455,9 @@ module Spidr
|
|
455
455
|
# @yieldparam [Page] feed
|
456
456
|
# A visited page.
|
457
457
|
#
|
458
|
-
def every_atom_page
|
458
|
+
def every_atom_page
|
459
459
|
every_page do |page|
|
460
|
-
|
460
|
+
yield page if (block_given? && page.atom?)
|
461
461
|
end
|
462
462
|
end
|
463
463
|
|
@@ -470,9 +470,9 @@ module Spidr
|
|
470
470
|
# @yieldparam [Page] page
|
471
471
|
# A visited page.
|
472
472
|
#
|
473
|
-
def every_ms_word_page
|
473
|
+
def every_ms_word_page
|
474
474
|
every_page do |page|
|
475
|
-
|
475
|
+
yield page if (block_given? && page.ms_word?)
|
476
476
|
end
|
477
477
|
end
|
478
478
|
|
@@ -485,9 +485,9 @@ module Spidr
|
|
485
485
|
# @yieldparam [Page] page
|
486
486
|
# A visited page.
|
487
487
|
#
|
488
|
-
def every_pdf_page
|
488
|
+
def every_pdf_page
|
489
489
|
every_page do |page|
|
490
|
-
|
490
|
+
yield page if (block_given? && page.pdf?)
|
491
491
|
end
|
492
492
|
end
|
493
493
|
|
@@ -500,9 +500,9 @@ module Spidr
|
|
500
500
|
# @yieldparam [Page] page
|
501
501
|
# A visited page.
|
502
502
|
#
|
503
|
-
def every_zip_page
|
503
|
+
def every_zip_page
|
504
504
|
every_page do |page|
|
505
|
-
|
505
|
+
yield page if (block_given? && page.zip?)
|
506
506
|
end
|
507
507
|
end
|
508
508
|
|
data/lib/spidr/page.rb
CHANGED
@@ -62,7 +62,8 @@ module Spidr
|
|
62
62
|
|
63
63
|
#
|
64
64
|
# Determines if the response code is `300`, `301`, `302`, `303`
|
65
|
-
# or `307`.
|
65
|
+
# or `307`. Also checks for "soft" redirects added at the page
|
66
|
+
# level by a meta refresh tag.
|
66
67
|
#
|
67
68
|
# @return [Boolean]
|
68
69
|
# Specifies whether the response code is a HTTP Redirect code.
|
@@ -71,6 +72,8 @@ module Spidr
|
|
71
72
|
case code
|
72
73
|
when 300..303, 307
|
73
74
|
true
|
75
|
+
when 200
|
76
|
+
meta_redirect?
|
74
77
|
else
|
75
78
|
false
|
76
79
|
end
|
@@ -434,17 +437,7 @@ module Spidr
|
|
434
437
|
urls << url unless (url.nil? || url.empty?)
|
435
438
|
}
|
436
439
|
|
437
|
-
if self.is_redirect?
|
438
|
-
location = @headers['location']
|
439
|
-
|
440
|
-
if location.kind_of?(Array)
|
441
|
-
# handle multiple location URLs
|
442
|
-
location.each(&add_url)
|
443
|
-
else
|
444
|
-
# usually the location header contains a single String
|
445
|
-
add_url.call(location)
|
446
|
-
end
|
447
|
-
end
|
440
|
+
self.redirects_to.each(&add_url) if self.is_redirect?
|
448
441
|
|
449
442
|
if (html? && doc)
|
450
443
|
doc.search('a[@href]').each do |a|
|
@@ -471,6 +464,27 @@ module Spidr
|
|
471
464
|
return urls
|
472
465
|
end
|
473
466
|
|
467
|
+
#
|
468
|
+
# URL(s) that this document redirects to.
|
469
|
+
#
|
470
|
+
# @return [Array<String>]
|
471
|
+
# The links that this page redirects to (usually found in a
|
472
|
+
# location header or by way of a page-level meta redirect).
|
473
|
+
#
|
474
|
+
def redirects_to
|
475
|
+
location = @headers['location']
|
476
|
+
|
477
|
+
if location.nil?
|
478
|
+
# check page-level meta redirects if there isn't a location header
|
479
|
+
meta_redirect
|
480
|
+
elsif location.kind_of?(Array)
|
481
|
+
location
|
482
|
+
else
|
483
|
+
# usually the location header contains a single String
|
484
|
+
[location]
|
485
|
+
end
|
486
|
+
end
|
487
|
+
|
474
488
|
#
|
475
489
|
# Absolute URIs from within the page.
|
476
490
|
#
|
@@ -507,6 +521,43 @@ module Spidr
|
|
507
521
|
return url
|
508
522
|
end
|
509
523
|
|
524
|
+
#
|
525
|
+
# Determines if a page-level "soft" redirect is present. If yes,
|
526
|
+
# returns an array of those redirects (usually a single URL).
|
527
|
+
# Otherwise, returns an empty array.
|
528
|
+
#
|
529
|
+
# @return [Array<String>]
|
530
|
+
# An array of redirect URLs
|
531
|
+
#
|
532
|
+
def meta_redirect
|
533
|
+
redirects = []
|
534
|
+
|
535
|
+
if (html? && doc)
|
536
|
+
search('//meta[@http-equiv and @content]').each do |node|
|
537
|
+
if node.attr('http-equiv') =~ /refresh/i
|
538
|
+
content = node.attr('content')
|
539
|
+
|
540
|
+
if (redirect = content.match(/url=(\S+)$/))
|
541
|
+
redirects << redirect[1]
|
542
|
+
end
|
543
|
+
end
|
544
|
+
end
|
545
|
+
end
|
546
|
+
|
547
|
+
return redirects.uniq
|
548
|
+
end
|
549
|
+
|
550
|
+
#
|
551
|
+
# Returns a boolean indicating whether or not page-level meta
|
552
|
+
# redirects are present in this page.
|
553
|
+
#
|
554
|
+
# @return [Boolean]
|
555
|
+
# Specifies whether the page includes page-level redirects.
|
556
|
+
#
|
557
|
+
def meta_redirect?
|
558
|
+
!meta_redirect.empty?
|
559
|
+
end
|
560
|
+
|
510
561
|
protected
|
511
562
|
|
512
563
|
#
|
data/lib/spidr/version.rb
CHANGED
data/spec/cookie_jar_spec.rb
CHANGED
@@ -101,8 +101,21 @@ describe CookieJar do
|
|
101
101
|
it "should encode multiple cookie params" do
|
102
102
|
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
103
103
|
@cookie_jar['zerosum.org'] = {'other' => '1'}
|
104
|
+
cookie = @cookie_jar.for_host('zerosum.org')
|
104
105
|
|
105
|
-
|
106
|
+
cookie.should include('admin=ofcourseiam')
|
107
|
+
cookie.should include('; ')
|
108
|
+
cookie.should include('other=1')
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should include cookies for the parent domain" do
|
112
|
+
@cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
|
113
|
+
@cookie_jar['sub.zerosum.org'] = {'other' => '1'}
|
114
|
+
cookie = @cookie_jar.for_host('sub.zerosum.org')
|
115
|
+
|
116
|
+
cookie.should include('admin=ofcourseiam')
|
117
|
+
cookie.should include('; ')
|
118
|
+
cookie.should include('other=1')
|
106
119
|
end
|
107
120
|
end
|
108
121
|
end
|
data/spec/helpers/wsoc.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -79,6 +79,21 @@ describe Page do
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
82
|
+
describe "redirects" do
|
83
|
+
before(:all) do
|
84
|
+
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
85
|
+
@page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should provide access to page-level redirects" do
|
89
|
+
@page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should include meta refresh redirects in the list of links" do
|
93
|
+
@page.links.should include('http://spidr.rubyforge.org/redirected')
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
82
97
|
describe "cookies" do
|
83
98
|
before(:all) do
|
84
99
|
@page = get_page('http://twitter.com/login')
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
|
3
|
-
|
2
|
+
require 'bundler'
|
3
|
+
|
4
|
+
begin
|
5
|
+
Bundler.setup(:runtime, :test)
|
6
|
+
rescue Bundler::BundlerError => e
|
7
|
+
STDERR.puts e.message
|
8
|
+
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
+
exit e.status_code
|
10
|
+
end
|
4
11
|
|
12
|
+
require 'spec'
|
5
13
|
require 'spidr/version'
|
6
14
|
|
7
15
|
include Spidr
|
data/spidr.gemspec
CHANGED
@@ -5,112 +5,112 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{spidr}
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Postmodern"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-07-02}
|
13
13
|
s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
|
14
14
|
s.email = %q{postmodern.mod3@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"ChangeLog.md",
|
17
|
-
|
18
|
-
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
21
|
".gitignore",
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
22
|
+
".specopts",
|
23
|
+
".yardopts",
|
24
|
+
"ChangeLog.md",
|
25
|
+
"Gemfile",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.md",
|
28
|
+
"Rakefile",
|
29
|
+
"lib/spidr.rb",
|
30
|
+
"lib/spidr/actions.rb",
|
31
|
+
"lib/spidr/actions/actions.rb",
|
32
|
+
"lib/spidr/actions/exceptions.rb",
|
33
|
+
"lib/spidr/actions/exceptions/action.rb",
|
34
|
+
"lib/spidr/actions/exceptions/paused.rb",
|
35
|
+
"lib/spidr/actions/exceptions/skip_link.rb",
|
36
|
+
"lib/spidr/actions/exceptions/skip_page.rb",
|
37
|
+
"lib/spidr/agent.rb",
|
38
|
+
"lib/spidr/auth_credential.rb",
|
39
|
+
"lib/spidr/auth_store.rb",
|
40
|
+
"lib/spidr/cookie_jar.rb",
|
41
|
+
"lib/spidr/events.rb",
|
42
|
+
"lib/spidr/extensions.rb",
|
43
|
+
"lib/spidr/extensions/uri.rb",
|
44
|
+
"lib/spidr/filters.rb",
|
45
|
+
"lib/spidr/page.rb",
|
46
|
+
"lib/spidr/rules.rb",
|
47
|
+
"lib/spidr/sanitizers.rb",
|
48
|
+
"lib/spidr/session_cache.rb",
|
49
|
+
"lib/spidr/spidr.rb",
|
50
|
+
"lib/spidr/version.rb",
|
51
|
+
"spec/actions_spec.rb",
|
52
|
+
"spec/agent_spec.rb",
|
53
|
+
"spec/auth_store_spec.rb",
|
54
|
+
"spec/cookie_jar_spec.rb",
|
55
|
+
"spec/extensions/uri_spec.rb",
|
56
|
+
"spec/filters_spec.rb",
|
57
|
+
"spec/helpers/history.rb",
|
58
|
+
"spec/helpers/page.rb",
|
59
|
+
"spec/helpers/wsoc.rb",
|
60
|
+
"spec/page_examples.rb",
|
61
|
+
"spec/page_spec.rb",
|
62
|
+
"spec/rules_spec.rb",
|
63
|
+
"spec/sanitizers_spec.rb",
|
64
|
+
"spec/session_cache.rb",
|
65
|
+
"spec/spec_helper.rb",
|
66
|
+
"spec/spidr_spec.rb",
|
67
|
+
"spidr.gemspec"
|
67
68
|
]
|
68
69
|
s.has_rdoc = %q{yard}
|
69
70
|
s.homepage = %q{http://github.com/postmodern/spidr}
|
70
71
|
s.licenses = ["MIT"]
|
71
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
72
72
|
s.require_paths = ["lib"]
|
73
|
-
s.rubygems_version = %q{1.3.
|
73
|
+
s.rubygems_version = %q{1.3.7}
|
74
74
|
s.summary = %q{A versatile Ruby web spidering library}
|
75
75
|
s.test_files = [
|
76
|
+
"spec/actions_spec.rb",
|
77
|
+
"spec/agent_spec.rb",
|
76
78
|
"spec/auth_store_spec.rb",
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
"spec/page_examples.rb",
|
91
|
-
"spec/actions_spec.rb"
|
79
|
+
"spec/cookie_jar_spec.rb",
|
80
|
+
"spec/extensions/uri_spec.rb",
|
81
|
+
"spec/filters_spec.rb",
|
82
|
+
"spec/helpers/history.rb",
|
83
|
+
"spec/helpers/page.rb",
|
84
|
+
"spec/helpers/wsoc.rb",
|
85
|
+
"spec/page_examples.rb",
|
86
|
+
"spec/page_spec.rb",
|
87
|
+
"spec/rules_spec.rb",
|
88
|
+
"spec/sanitizers_spec.rb",
|
89
|
+
"spec/session_cache.rb",
|
90
|
+
"spec/spec_helper.rb",
|
91
|
+
"spec/spidr_spec.rb"
|
92
92
|
]
|
93
93
|
|
94
94
|
if s.respond_to? :specification_version then
|
95
95
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
96
96
|
s.specification_version = 3
|
97
97
|
|
98
|
-
if Gem::Version.new(Gem::
|
98
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
99
99
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.0"])
|
100
|
+
s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
|
101
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
|
100
102
|
s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
|
101
|
-
s.add_development_dependency(%q<yard>, ["~> 0.5.3"])
|
102
|
-
s.add_development_dependency(%q<wsoc>, ["~> 0.1.1"])
|
103
103
|
else
|
104
104
|
s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
|
105
|
+
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
106
|
+
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
105
107
|
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
106
|
-
s.add_dependency(%q<yard>, ["~> 0.5.3"])
|
107
|
-
s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
|
108
108
|
end
|
109
109
|
else
|
110
110
|
s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
|
111
|
+
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
112
|
+
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
111
113
|
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
112
|
-
s.add_dependency(%q<yard>, ["~> 0.5.3"])
|
113
|
-
s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
8
|
+
- 5
|
9
|
+
version: 0.2.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Postmodern
|
@@ -14,13 +14,13 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-07-02 00:00:00 -07:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
22
|
-
prerelease: false
|
23
22
|
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
24
|
requirements:
|
25
25
|
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
@@ -30,48 +30,52 @@ dependencies:
|
|
30
30
|
- 0
|
31
31
|
version: 1.3.0
|
32
32
|
type: :runtime
|
33
|
+
prerelease: false
|
33
34
|
version_requirements: *id001
|
34
35
|
- !ruby/object:Gem::Dependency
|
35
|
-
name:
|
36
|
-
prerelease: false
|
36
|
+
name: rake
|
37
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
38
39
|
requirements:
|
39
40
|
- - ~>
|
40
41
|
- !ruby/object:Gem::Version
|
41
42
|
segments:
|
42
|
-
- 1
|
43
|
-
- 3
|
44
43
|
- 0
|
45
|
-
|
44
|
+
- 8
|
45
|
+
- 7
|
46
|
+
version: 0.8.7
|
46
47
|
type: :development
|
48
|
+
prerelease: false
|
47
49
|
version_requirements: *id002
|
48
50
|
- !ruby/object:Gem::Dependency
|
49
|
-
name:
|
50
|
-
prerelease: false
|
51
|
+
name: jeweler
|
51
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
52
54
|
requirements:
|
53
55
|
- - ~>
|
54
56
|
- !ruby/object:Gem::Version
|
55
57
|
segments:
|
58
|
+
- 1
|
59
|
+
- 4
|
56
60
|
- 0
|
57
|
-
|
58
|
-
- 3
|
59
|
-
version: 0.5.3
|
61
|
+
version: 1.4.0
|
60
62
|
type: :development
|
63
|
+
prerelease: false
|
61
64
|
version_requirements: *id003
|
62
65
|
- !ruby/object:Gem::Dependency
|
63
|
-
name:
|
64
|
-
prerelease: false
|
66
|
+
name: rspec
|
65
67
|
requirement: &id004 !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
66
69
|
requirements:
|
67
70
|
- - ~>
|
68
71
|
- !ruby/object:Gem::Version
|
69
72
|
segments:
|
70
|
-
- 0
|
71
73
|
- 1
|
72
|
-
-
|
73
|
-
|
74
|
+
- 3
|
75
|
+
- 0
|
76
|
+
version: 1.3.0
|
74
77
|
type: :development
|
78
|
+
prerelease: false
|
75
79
|
version_requirements: *id004
|
76
80
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
77
81
|
email: postmodern.mod3@gmail.com
|
@@ -88,6 +92,7 @@ files:
|
|
88
92
|
- .specopts
|
89
93
|
- .yardopts
|
90
94
|
- ChangeLog.md
|
95
|
+
- Gemfile
|
91
96
|
- LICENSE.txt
|
92
97
|
- README.md
|
93
98
|
- Rakefile
|
@@ -135,18 +140,21 @@ homepage: http://github.com/postmodern/spidr
|
|
135
140
|
licenses:
|
136
141
|
- MIT
|
137
142
|
post_install_message:
|
138
|
-
rdoc_options:
|
139
|
-
|
143
|
+
rdoc_options: []
|
144
|
+
|
140
145
|
require_paths:
|
141
146
|
- lib
|
142
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
148
|
+
none: false
|
143
149
|
requirements:
|
144
150
|
- - ">="
|
145
151
|
- !ruby/object:Gem::Version
|
152
|
+
hash: 740918287
|
146
153
|
segments:
|
147
154
|
- 0
|
148
155
|
version: "0"
|
149
156
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
157
|
+
none: false
|
150
158
|
requirements:
|
151
159
|
- - ">="
|
152
160
|
- !ruby/object:Gem::Version
|
@@ -156,24 +164,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
164
|
requirements: []
|
157
165
|
|
158
166
|
rubyforge_project:
|
159
|
-
rubygems_version: 1.3.
|
167
|
+
rubygems_version: 1.3.7
|
160
168
|
signing_key:
|
161
169
|
specification_version: 3
|
162
170
|
summary: A versatile Ruby web spidering library
|
163
171
|
test_files:
|
164
|
-
- spec/
|
165
|
-
- spec/rules_spec.rb
|
166
|
-
- spec/session_cache.rb
|
167
|
-
- spec/spec_helper.rb
|
168
|
-
- spec/sanitizers_spec.rb
|
169
|
-
- spec/filters_spec.rb
|
170
|
-
- spec/page_spec.rb
|
171
|
-
- spec/spidr_spec.rb
|
172
|
+
- spec/actions_spec.rb
|
172
173
|
- spec/agent_spec.rb
|
174
|
+
- spec/auth_store_spec.rb
|
173
175
|
- spec/cookie_jar_spec.rb
|
174
176
|
- spec/extensions/uri_spec.rb
|
177
|
+
- spec/filters_spec.rb
|
175
178
|
- spec/helpers/history.rb
|
176
179
|
- spec/helpers/page.rb
|
177
180
|
- spec/helpers/wsoc.rb
|
178
181
|
- spec/page_examples.rb
|
179
|
-
- spec/
|
182
|
+
- spec/page_spec.rb
|
183
|
+
- spec/rules_spec.rb
|
184
|
+
- spec/sanitizers_spec.rb
|
185
|
+
- spec/session_cache.rb
|
186
|
+
- spec/spec_helper.rb
|
187
|
+
- spec/spidr_spec.rb
|