relevance-tarantula 0.1.7 → 0.1.8

data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
+v0.1.8 Add timeouts for crawls to help really long builds
+
 v0.1.7 Minor clean up [Rob Sanheim]
 
 v0.1.6
data/README.rdoc CHANGED
@@ -134,6 +134,19 @@ This example adds custom attacks for both SQL injection and XSS. It also tells T
 app 2 times. This is important for XSS attacks because the results won't appear until the second time
 Tarantula performs the crawl.
 
+== Timeout
+
+You can specify a timeout for each specific crawl that Tarantula runs. For example:
+
+  def test_tarantula
+    t = tarantula_crawler(self)
+    t.times_to_crawl = 2
+    t.crawl_timeout = 5.minutes
+    t.crawl "/"
+  end
+
+The above will crawl your app twice, and each crawl will time out if it takes longer than 5 minutes. You may need a timeout to keep the Tarantula test time reasonable if your app is large, or if it happens to have a large number of 'never-ending' links, such as any sort of "auto-admin" interface.
+
 == Bugs/Requests
 
 Please submit your bug reports, patches, or feature requests at Lighthouse:
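
As a complement to the new crawl_timeout, the crawler's existing skip_uri_patterns (visible in the crawler source further down) can keep 'never-ending' links out of the queue entirely. A minimal sketch, assuming a hypothetical /admin prefix for the auto-admin pages:

  def test_tarantula
    t = tarantula_crawler(self)
    t.crawl_timeout = 5.minutes         # hard upper bound on each crawl pass
    t.skip_uri_patterns << %r{^/admin}  # hypothetical prefix: exclude auto-admin links up front
    t.crawl "/"
  end

Skipping avoids spending the crawl budget on pages that can never be exhausted, while the timeout still bounds the pass if anything else runs long.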
data/Rakefile CHANGED
@@ -19,6 +19,8 @@ begin
     s.authors = ["Relevance, Inc."]
     s.require_paths = ["lib"]
     s.files = files.flatten
+    s.add_dependency 'htmlentities'
+    s.add_dependency 'hpricot'
   end
 rescue LoadError
   puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
@@ -46,7 +48,7 @@ namespace :examples do
     t.rcov_opts = %[--exclude "gems/*,/Library/Ruby/*,config/*" --text-summary --sort coverage --no-validator-links]
   end
 
-  RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1]
+  RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1 2.3.2]
 
   desc "Run examples with multiple versions of rails"
   task :multi_rails do
data/VERSION.yml CHANGED
@@ -1,4 +1,4 @@
 ---
+:patch: 8
 :major: 0
 :minor: 1
-:patch: 7
@@ -1,204 +1,246 @@
 require File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb"))
 
-describe 'Relevance::Tarantula::Crawler#transform_url' do
-  before {@crawler = Relevance::Tarantula::Crawler.new}
-  it "de-obfuscates unicode obfuscated urls" do
-    obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;"
-    @crawler.transform_url(obfuscated_mailto).should == "mailto:"
-  end
+describe Relevance::Tarantula::Crawler do
 
-  it "strips the trailing name portion of a link" do
-    @crawler.transform_url('http://host/path#name').should == 'http://host/path'
-  end
-end
+  describe "transform_url" do
 
-describe 'Relevance::Tarantula::Crawler log grabbing' do
-  it "returns nil if no grabber is specified" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.grab_log!.should == nil
-  end
+    before { @crawler = Relevance::Tarantula::Crawler.new }
+
+    it "de-obfuscates unicode obfuscated urls" do
+      obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;"
+      @crawler.transform_url(obfuscated_mailto).should == "mailto:"
+    end
 
-  it "returns grabber.grab if grabber is specified" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.log_grabber = stub(:grab! => "fake log entry")
-    crawler.grab_log!.should == "fake log entry"
+    it "strips the trailing name portion of a link" do
+      @crawler.transform_url('http://host/path#name').should == 'http://host/path'
+    end
   end
-end
+
+
+  describe "log grabbing" do
 
-describe 'Relevance::Tarantula::Crawler interruption' do
-  it 'catches interruption and writes the partial report' do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.stubs(:queue_link)
-    crawler.stubs(:do_crawl).raises(Interrupt)
-    crawler.expects(:report_results)
-    $stderr.expects(:puts).with("CTRL-C")
-    crawler.crawl
-  end
-end
+    it "returns nil if no grabber is specified" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.grab_log!.should == nil
+    end
 
-describe 'Relevance::Tarantula::Crawler handle_form_results' do
-  it 'captures the result values (bugfix)' do
-    response = stub_everything
-    result_args = {:url => :action_stub,
-                   :data => 'nil',
-                   :response => response,
-                   :referrer => :action_stub,
-                   :log => nil,
-                   :method => :stub_method,
-                   :test_name => nil}
-    result = Relevance::Tarantula::Result.new(result_args)
-    Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
-                                response)
-  end
-end
-
-describe 'Relevance::Tarantula::Crawler#crawl' do
-  it 'queues the first url, does crawl, and then reports results' do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.expects(:queue_link).with("/foobar")
-    crawler.expects(:do_crawl)
-    crawler.expects(:report_results)
-    crawler.crawl("/foobar")
+    it "returns grabber.grab if grabber is specified" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.log_grabber = stub(:grab! => "fake log entry")
+      crawler.grab_log!.should == "fake log entry"
+    end
+
   end
 
-  it 'reports results even if the crawl fails' do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.expects(:do_crawl).raises(RuntimeError)
-    crawler.expects(:report_results)
-    lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
-  end
-end
-
-describe 'Relevance::Tarantula::Crawler queuing' do
-  it 'queues and remembers links' do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.expects(:transform_url).with("/url").returns("/transformed")
-    crawler.queue_link("/url")
-    crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
-    crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
+  describe "interrupt" do
+
+    it 'catches interruption and writes the partial report' do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.stubs(:queue_link)
+      crawler.stubs(:do_crawl).raises(Interrupt)
+      crawler.expects(:report_results)
+      $stderr.expects(:puts).with("CTRL-C")
+      crawler.crawl
+    end
+
   end
 
-  it 'queues and remembers forms' do
-    crawler = Relevance::Tarantula::Crawler.new
-    form = Hpricot('<form action="/action" method="post"/>').at('form')
-    signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
-    crawler.queue_form(form)
-    crawler.forms_to_crawl.size.should == 1
-    crawler.form_signatures_queued.should == Set.new([signature])
+  describe 'handle_form_results' do
+
+    it 'captures the result values (bugfix)' do
+      response = stub_everything
+      result_args = {:url => :action_stub,
+                     :data => 'nil',
+                     :response => response,
+                     :referrer => :action_stub,
+                     :log => nil,
+                     :method => :stub_method,
+                     :test_name => nil}
+      result = Relevance::Tarantula::Result.new(result_args)
+      Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
                                  response)
+    end
+
   end
 
-  it 'remembers link referrer if there is one' do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.queue_link("/url", "/some-referrer")
-    crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
+  describe "crawl" do
+
+    it 'queues the first url, does crawl, and then reports results' do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.expects(:queue_link).with("/foobar")
+      crawler.expects(:do_crawl)
+      crawler.expects(:report_results)
+      crawler.crawl("/foobar")
+    end
+
+    it 'reports results even if the crawl fails' do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.expects(:do_crawl).raises(RuntimeError)
+      crawler.expects(:report_results)
+      lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
+    end
+
   end
 
-end
+  describe "queueing" do
 
-describe 'Relevance::Tarantula::Crawler#report_results' do
-  it "delegates to generate_reports" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.expects(:generate_reports)
-    crawler.report_results
-  end
-end
+    it 'queues and remembers links' do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.expects(:transform_url).with("/url").returns("/transformed")
+      crawler.queue_link("/url")
+      crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
+      crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
+    end
 
-describe 'Relevance::Tarantula::Crawler#crawling' do
+    it 'queues and remembers forms' do
+      crawler = Relevance::Tarantula::Crawler.new
+      form = Hpricot('<form action="/action" method="post"/>').at('form')
+      signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
+      crawler.queue_form(form)
+      crawler.forms_to_crawl.size.should == 1
+      crawler.form_signatures_queued.should == Set.new([signature])
+    end
 
-  it "converts ActiveRecord::RecordNotFound into a 404" do
-    (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.proxy = proxy
-    response = crawler.crawl_form stub_everything(:method => nil)
-    response.code.should == "404"
-    response.content_type.should == "text/plain"
-    response.body.should == "ActiveRecord::RecordNotFound"
+    it 'remembers link referrer if there is one' do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.queue_link("/url", "/some-referrer")
+      crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
+    end
+
   end
+
+  describe "crawling" do
+
+    it "converts ActiveRecord::RecordNotFound into a 404" do
+      (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.proxy = proxy
+      response = crawler.crawl_form stub_everything(:method => nil)
+      response.code.should == "404"
+      response.content_type.should == "text/plain"
+      response.body.should == "ActiveRecord::RecordNotFound"
+    end
 
-  it "does four things with each link: get, log, handle, and blip" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.proxy = stub
-    response = stub(:code => "200")
-    crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
-    crawler.proxy.expects(:get).returns(response).times(2)
-    crawler.expects(:log).times(2)
-    crawler.expects(:handle_link_results).times(2)
-    crawler.expects(:blip).times(2)
-    crawler.crawl_queued_links
-    crawler.links_to_crawl.should == []
-  end
+    it "does four things with each link: get, log, handle, and blip" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.proxy = stub
+      response = stub(:code => "200")
+      crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
+      crawler.proxy.expects(:get).returns(response).times(2)
+      crawler.expects(:log).times(2)
+      crawler.expects(:handle_link_results).times(2)
+      crawler.expects(:blip).times(2)
+      crawler.crawl_queued_links
+      crawler.links_to_crawl.should == []
+    end
+
+    it "invokes queued forms, logs responses, and calls handlers" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.forms_to_crawl << stub_everything(:method => "get",
+                                                :action => "/foo",
+                                                :data => "some data",
+                                                :to_s => "stub")
+      crawler.proxy = stub_everything(:send => stub(:code => "200" ))
+      crawler.expects(:log).with("Response 200 for stub")
+      crawler.expects(:blip)
+      crawler.crawl_queued_forms
+    end
+
+    it "breaks out early if a timeout is set" do
+      crawler = Relevance::Tarantula::Crawler.new
+      stub_puts_and_print(crawler)
+      crawler.proxy = stub
+      response = stub(:code => "200")
+      crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
+      crawler.proxy.expects(:get).returns(response).times(4)
+      crawler.forms_to_crawl << stub_everything(:method => "post",
+                                                :action => "/foo",
+                                                :data => "some data",
+                                                :to_s => "stub")
+      crawler.proxy.expects(:post).returns(response).times(2)
+      crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
+      crawler.times_to_crawl = 2
+      crawler.crawl
+
+    end
+
+    it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
+      crawler = Relevance::Tarantula::Crawler.new
+      stub_puts_and_print(crawler)
+      crawler.proxy = stub
+      response = stub(:code => "200")
+      crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
+      crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
+      crawler.forms_to_crawl << stub_everything(:method => "post",
+                                                :action => "/foo",
+                                                :data => "some data",
+                                                :to_s => "stub")
+      crawler.proxy.expects(:post).returns(response).times(2)
+      crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
+      crawler.times_to_crawl = 2
+      crawler.crawl
+    end
 
-  it "invokes queued forms, logs responses, and calls handlers" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.forms_to_crawl << stub_everything(:method => "get",
-                                              :action => "/foo",
-                                              :data => "some data",
-                                              :to_s => "stub")
-    crawler.proxy = stub_everything(:send => stub(:code => "200" ))
-    crawler.expects(:log).with("Response 200 for stub")
-    crawler.expects(:blip)
-    crawler.crawl_queued_forms
   end
 
-  it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
-    crawler = Relevance::Tarantula::Crawler.new
-    stub_puts_and_print(crawler)
-    crawler.proxy = stub
-    response = stub(:code => "200")
-    crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
-    crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
-    crawler.forms_to_crawl << stub_everything(:method => "post",
-                                              :action => "/foo",
-                                              :data => "some data",
-                                              :to_s => "stub")
-    crawler.proxy.expects(:post).returns(response).times(2)
-    crawler.expects(:links_completed_count).returns(*(0..6).to_a).times(6)
-    crawler.times_to_crawl = 2
-    crawler.crawl
-  end
-end
+  describe "report_results" do
 
-describe 'Crawler blip' do
-  it "blips the current progress if !verbose" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.stubs(:verbose).returns false
-    crawler.expects(:print).with("\r 0 of 0 links completed ")
-    crawler.blip
-  end
-  it "blips nothing if verbose" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.stubs(:verbose).returns true
-    crawler.expects(:print).never
-    crawler.blip
+    it "delegates to generate_reports" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.expects(:generate_reports)
+      crawler.report_results
+    end
+
   end
-end
+
+  describe "blip" do
 
-describe 'Relevance::Tarantula::Crawler' do
-  it "is finished when the links and forms are crawled" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.finished?.should == true
+    it "blips the current progress if !verbose" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.stubs(:verbose).returns false
+      crawler.stubs(:timeout_if_too_long)
+      crawler.expects(:print).with("\r 0 of 0 links completed ")
+      crawler.blip
+    end
+
+    it "blips nothing if verbose" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.stubs(:verbose).returns true
+      crawler.expects(:print).never
+      crawler.blip
+    end
+
   end
+
+  describe "finished?" do
 
-  it "isn't finished when links remain" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.links_to_crawl = [:stub_link]
-    crawler.finished?.should == false
-  end
+    it "is finished when the links and forms are crawled" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.finished?.should == true
+    end
 
-  it "isn't finished when links remain" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.forms_to_crawl = [:stub_form]
-    crawler.finished?.should == false
+    it "isn't finished when links remain" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.links_to_crawl = [:stub_link]
+      crawler.finished?.should == false
+    end
+
+    it "isn't finished when links remain" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.forms_to_crawl = [:stub_form]
+      crawler.finished?.should == false
+    end
+
   end
-
+
   it "crawls links and forms again and again until finished?==true" do
     crawler = Relevance::Tarantula::Crawler.new
     crawler.expects(:finished?).times(3).returns(false, false, true)
     crawler.expects(:crawl_queued_links).times(2)
     crawler.expects(:crawl_queued_forms).times(2)
-    crawler.do_crawl
+    crawler.do_crawl(1)
   end
 
   it "asks each reporter to write its report in report_dir" do
@@ -225,72 +267,120 @@ describe 'Relevance::Tarantula::Crawler' do
     crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
   end
 
-end
-
-describe "Crawler link skipping" do
-  before do
-    @crawler = Relevance::Tarantula::Crawler.new
-  end
-
-  it "skips links that are too long" do
-    @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
-    @crawler.max_url_length = 2
-    @crawler.expects(:log).with("Skipping long url /foo")
-    @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
-  end
+  describe "link skipping" do
+
+    before { @crawler = Relevance::Tarantula::Crawler.new }
+
+    it "skips links that are too long" do
+      @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
+      @crawler.max_url_length = 2
+      @crawler.expects(:log).with("Skipping long url /foo")
+      @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
+    end
 
-  it "skips outbound links (those that begin with http)" do
-    @crawler.expects(:log).with("Skipping http-anything")
-    @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
-  end
+    it "skips outbound links (those that begin with http)" do
+      @crawler.expects(:log).with("Skipping http-anything")
+      @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
+    end
 
-  it "skips javascript links (those that begin with javascript)" do
-    @crawler.expects(:log).with("Skipping javascript-anything")
-    @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
-  end
+    it "skips javascript links (those that begin with javascript)" do
+      @crawler.expects(:log).with("Skipping javascript-anything")
+      @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
+    end
 
-  it "skips mailto links (those that begin with http)" do
-    @crawler.expects(:log).with("Skipping mailto-anything")
-    @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
-  end
+    it "skips mailto links (those that begin with http)" do
+      @crawler.expects(:log).with("Skipping mailto-anything")
+      @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
+    end
 
-  it 'skips blank links' do
-    @crawler.queue_link(nil)
-    @crawler.links_to_crawl.should == []
-    @crawler.queue_link("")
-    @crawler.links_to_crawl.should == []
-  end
+    it 'skips blank links' do
+      @crawler.queue_link(nil)
+      @crawler.links_to_crawl.should == []
+      @crawler.queue_link("")
+      @crawler.links_to_crawl.should == []
+    end
 
-  it "logs and skips links that match a pattern" do
-    @crawler.expects(:log).with("Skipping /the-red-button")
-    @crawler.skip_uri_patterns << /red-button/
-    @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
-    @crawler.queue_link("/the-red-button").should == nil
-  end
+    it "logs and skips links that match a pattern" do
+      @crawler.expects(:log).with("Skipping /the-red-button")
+      @crawler.skip_uri_patterns << /red-button/
+      @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
+      @crawler.queue_link("/the-red-button").should == nil
+    end
 
-  it "logs and skips form submissions that match a pattern" do
-    @crawler.expects(:log).with("Skipping /reset-password-form")
-    @crawler.skip_uri_patterns << /reset-password/
-    fs = stub_everything(:action => "/reset-password-form")
-    @crawler.should_skip_form_submission?(fs).should == true
+    it "logs and skips form submissions that match a pattern" do
+      @crawler.expects(:log).with("Skipping /reset-password-form")
+      @crawler.skip_uri_patterns << /reset-password/
+      fs = stub_everything(:action => "/reset-password-form")
+      @crawler.should_skip_form_submission?(fs).should == true
+    end
   end
-end
+
+  describe "allow_nnn_for" do
 
-describe "allow_nnn_for" do
-  it "installs result as a response_code_handler" do
-    crawler = Relevance::Tarantula::Crawler.new
-    crawler.response_code_handler.should == Relevance::Tarantula::Result
+    it "installs result as a response_code_handler" do
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.response_code_handler.should == Relevance::Tarantula::Result
+    end
+
+    it "delegates to the response_code_handler" do
+      crawler = Relevance::Tarantula::Crawler.new
+      (response_code_handler = mock).expects(:allow_404_for).with(:stub)
+      crawler.response_code_handler = response_code_handler
+      crawler.allow_404_for(:stub)
+    end
+
+    it "chains up to super for method_missing" do
+      crawler = Relevance::Tarantula::Crawler.new
+      lambda{crawler.foo}.should raise_error(NoMethodError)
+    end
+
   end
 
-  it "delegates to the response_code_handler" do
-    crawler = Relevance::Tarantula::Crawler.new
-    (response_code_handler = mock).expects(:allow_404_for).with(:stub)
-    crawler.response_code_handler = response_code_handler
-    crawler.allow_404_for(:stub)
+  describe "timeouts" do
+
+    it "sets start and end times for a single crawl" do
+      start_time = Time.parse("March 1st, 2008 10:00am")
+      end_time = Time.parse("March 1st, 2008 10:10am")
+      Time.stubs(:now).returns(start_time, end_time)
+
+      crawler = Relevance::Tarantula::Crawler.new
+      stub_puts_and_print(crawler)
+      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
+      crawler.crawl
+      crawler.crawl_start_times.first.should == start_time
+      crawler.crawl_end_times.first.should == end_time
+    end
+
+    it "has elasped time for a crawl" do
+      start_time = Time.parse("March 1st, 2008 10:00am")
+      elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am")
+      Time.stubs(:now).returns(start_time, elasped_time_check)
+
+      crawler = Relevance::Tarantula::Crawler.new
+      stub_puts_and_print(crawler)
+      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
+      crawler.crawl
+      crawler.elasped_time_for_pass(0).should == 600.seconds
+    end
+
+    it "raises out of the crawl if elasped time is greater then the crawl timeout" do
+      start_time = Time.parse("March 1st, 2008 10:00am")
+      elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am")
+      Time.stubs(:now).returns(start_time, elasped_time_check)
+
+      crawler = Relevance::Tarantula::Crawler.new
+      crawler.crawl_timeout = 5.minutes
+
+      crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
+      crawler.proxy = stub
+      crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))
+
+      stub_puts_and_print(crawler)
+      lambda {
+        crawler.do_crawl(0)
+      }.should raise_error
+    end
+
   end
 
-  it "chains up to super for method_missing" do
-    crawler = Relevance::Tarantula::Crawler.new
-    lambda{crawler.foo}.should raise_error(NoMethodError)
-  end
-end
+end
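
The timeout examples above freeze time with Mocha: returns(a, b) hands back each value on successive calls, so the crawler reads the start time on its first Time.now and the "later" time when it computes elapsed time. A standalone sketch of just that stubbing pattern, assuming the same test/spec plus Mocha setup the examples use:

  describe "consecutive Time.now stubbing" do
    it "returns each stubbed value in turn" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      later_time = Time.parse("March 1st, 2008 10:10am")
      Time.stubs(:now).returns(start_time, later_time)
      Time.now.should == start_time # first call: recorded as the crawl start
      Time.now.should == later_time # second call: the elapsed-time check, 600 seconds on
    end
  end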
@@ -7,11 +7,13 @@ class Relevance::Tarantula::Crawler
   extend Forwardable
   include Relevance::Tarantula
 
+  class CrawlTimeout < RuntimeError; end
+
   attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
                 :reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
                 :form_signatures_queued, :max_url_length, :response_code_handler,
-                :times_to_crawl, :fuzzers, :test_name
-  attr_reader :transform_url_patterns, :referrers, :failures, :successes
+                :times_to_crawl, :fuzzers, :test_name, :crawl_timeout
+  attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times
 
   def initialize
     @max_url_length = 1024
@@ -22,6 +24,8 @@ class Relevance::Tarantula::Crawler
     @form_signatures_queued = Set.new
     @links_to_crawl = []
     @forms_to_crawl = []
+    @crawl_start_times, @crawl_end_times = [], []
+    @crawl_timeout = 20.minutes
     @referrers = {}
     @skip_uri_patterns = [
       /^javascript/,
@@ -53,13 +57,18 @@ class Relevance::Tarantula::Crawler
     orig_form_signatures_queued = @form_signatures_queued.dup
     orig_links_to_crawl = @links_to_crawl.dup
     orig_forms_to_crawl = @forms_to_crawl.dup
-    @times_to_crawl.times do |i|
+    @times_to_crawl.times do |num|
       queue_link url
-      do_crawl
-
-      puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1
+
+      begin
+        do_crawl num
+      rescue CrawlTimeout => e
+        puts e.message
+      end
+
+      puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
 
-      if i + 1 < @times_to_crawl
+      if num + 1 < @times_to_crawl
         @links_queued = orig_links_queued
         @form_signatures_queued = orig_form_signatures_queued
         @links_to_crawl = orig_links_to_crawl
@@ -77,19 +86,21 @@ class Relevance::Tarantula::Crawler
     @links_to_crawl.empty? && @forms_to_crawl.empty?
   end
 
-  def do_crawl
+  def do_crawl(number)
     while (!finished?)
-      crawl_queued_links
-      crawl_queued_forms
+      @crawl_start_times << Time.now
+      crawl_queued_links(number)
+      crawl_queued_forms(number)
+      @crawl_end_times << Time.now
     end
   end
 
-  def crawl_queued_links
+  def crawl_queued_links(number = 0)
     while (link = @links_to_crawl.pop)
       response = proxy.send(link.method, link.href)
       log "Response #{response.code} for #{link}"
       handle_link_results(link, response)
-      blip
+      blip(number)
     end
   end
 
@@ -124,13 +135,17 @@ class Relevance::Tarantula::Crawler
     Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
   end
 
-  def crawl_queued_forms
+  def crawl_queued_forms(number = 0)
    while (form = @forms_to_crawl.pop)
      response = crawl_form(form)
      handle_form_results(form, response)
-      blip
+      blip(number)
    end
  end
+
+  def elasped_time_for_pass(num)
+    Time.now - crawl_start_times[num]
+  end
 
   def grab_log!
     @log_grabber && @log_grabber.grab!
@@ -234,9 +249,16 @@ class Relevance::Tarantula::Crawler
     total_links_count - links_remaining_count
   end
 
-  def blip
+  def blip(number = 0)
     unless verbose
       print "\r #{links_completed_count} of #{total_links_count} links completed "
+      timeout_if_too_long(number)
+    end
+  end
+
+  def timeout_if_too_long(number = 0)
+    if elasped_time_for_pass(number) > crawl_timeout
+      raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
     end
   end
 end
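
Putting the crawler changes together: do_crawl stamps crawl_start_times at the top of each pass, blip calls timeout_if_too_long after every link or form, and that raises CrawlTimeout once elasped_time_for_pass(number) exceeds crawl_timeout; crawl rescues the exception, prints its message, and moves on to the next pass. A condensed sketch of that control flow, paraphrasing the diff above:

  crawler = Relevance::Tarantula::Crawler.new
  crawler.crawl_timeout = 5.minutes # the initializer defaults to 20.minutes

  begin
    crawler.do_crawl(0) # the pass number indexes into crawl_start_times
  rescue Relevance::Tarantula::Crawler::CrawlTimeout => e
    puts e.message # "Exceeded crawl timeout of 300 seconds - skipping to the next crawl..."
  end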
@@ -23,9 +23,9 @@
 and lives at <a href="http://github.com/relevance/tarantula">http://github.com/relevance/tarantula</a>.</p>
 <hr/>
 </div>
-<div id="page">
+<div id="page">
 <div id="tabs-container">
-<ul class="tabs"></ul>
+<ul class="tabs"> </ul>
 </div>
 
 <div id="results-container">
@@ -3,7 +3,7 @@ begin
   gem 'tidy'
   require 'tidy'
 rescue Gem::LoadError
-  # tidy not available
+  puts "Tidy gem not available -- 'gem install tidy' to get it."
 end
 
 if defined? Tidy
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: relevance-tarantula
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.1.8
 platform: ruby
 authors:
 - Relevance, Inc.
@@ -9,10 +9,29 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-03-11 00:00:00 -07:00
+date: 2009-04-06 00:00:00 -07:00
 default_executable:
-dependencies: []
-
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
 description: A big hairy fuzzy spider that crawls your site, wreaking havoc
 email: opensource@thinkrelevance.com
 executables: []