relevance-tarantula 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ v0.1.8 Add timeouts for crawls to help really long builds
2
+
1
3
  v0.1.7 Minor clean up [Rob Sanheim]
2
4
 
3
5
  v0.1.6
data/README.rdoc CHANGED
@@ -134,6 +134,19 @@ This example adds custom attacks for both SQL injection and XSS. It also tells T
134
134
  app 2 times. This is important for XSS attacks because the results won't appear until the second time
135
135
  Tarantula performs the crawl.
136
136
 
137
+ == Timeout
138
+
139
+ You can specify a timeout for each specific crawl that Tarantula runs. For example:
140
+
141
+ def test_tarantula
142
+ t = tarantula_crawler(self)
143
+ t.times_to_crawl = 2
144
+ t.crawl_timeout = 5.minutes
145
+ t.crawl "/"
146
+ end
147
+
148
+ The above will crawl your app twice, and each specific crawl will time out if it takes longer than 5 minutes. You may need a timeout to keep the tarantula test time reasonable if your app is large or just happens to have a large number of 'never-ending' links, such as with any sort of "auto-admin" interface.
149
+
137
150
  == Bugs/Requests
138
151
 
139
152
  Please submit your bug reports, patches, or feature requests at Lighthouse:
data/Rakefile CHANGED
@@ -19,6 +19,8 @@ begin
19
19
  s.authors = ["Relevance, Inc."]
20
20
  s.require_paths = ["lib"]
21
21
  s.files = files.flatten
22
+ s.add_dependency 'htmlentities'
23
+ s.add_dependency 'hpricot'
22
24
  end
23
25
  rescue LoadError
24
26
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
@@ -46,7 +48,7 @@ namespace :examples do
46
48
  t.rcov_opts = %[--exclude "gems/*,/Library/Ruby/*,config/*" --text-summary --sort coverage --no-validator-links]
47
49
  end
48
50
 
49
- RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1]
51
+ RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1 2.3.2]
50
52
 
51
53
  desc "Run exmaples with multiple versions of rails"
52
54
  task :multi_rails do
data/VERSION.yml CHANGED
@@ -1,4 +1,4 @@
1
1
  ---
2
+ :patch: 8
2
3
  :major: 0
3
4
  :minor: 1
4
- :patch: 7
@@ -1,204 +1,246 @@
1
1
  require File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb"))
2
2
 
3
- describe 'Relevance::Tarantula::Crawler#transform_url' do
4
- before {@crawler = Relevance::Tarantula::Crawler.new}
5
- it "de-obfuscates unicode obfuscated urls" do
6
- obfuscated_mailto = "mailto:"
7
- @crawler.transform_url(obfuscated_mailto).should == "mailto:"
8
- end
3
+ describe Relevance::Tarantula::Crawler do
9
4
 
10
- it "strips the trailing name portion of a link" do
11
- @crawler.transform_url('http://host/path#name').should == 'http://host/path'
12
- end
13
- end
5
+ describe "transform_url" do
14
6
 
15
- describe 'Relevance::Tarantula::Crawler log grabbing' do
16
- it "returns nil if no grabber is specified" do
17
- crawler = Relevance::Tarantula::Crawler.new
18
- crawler.grab_log!.should == nil
19
- end
7
+ before { @crawler = Relevance::Tarantula::Crawler.new }
8
+
9
+ it "de-obfuscates unicode obfuscated urls" do
10
+ obfuscated_mailto = "mailto:"
11
+ @crawler.transform_url(obfuscated_mailto).should == "mailto:"
12
+ end
20
13
 
21
- it "returns grabber.grab if grabber is specified" do
22
- crawler = Relevance::Tarantula::Crawler.new
23
- crawler.log_grabber = stub(:grab! => "fake log entry")
24
- crawler.grab_log!.should == "fake log entry"
14
+ it "strips the trailing name portion of a link" do
15
+ @crawler.transform_url('http://host/path#name').should == 'http://host/path'
16
+ end
25
17
  end
26
- end
18
+
19
+
20
+ describe "log grabbing" do
27
21
 
28
- describe 'Relevance::Tarantula::Crawler interruption' do
29
- it 'catches interruption and writes the partial report' do
30
- crawler = Relevance::Tarantula::Crawler.new
31
- crawler.stubs(:queue_link)
32
- crawler.stubs(:do_crawl).raises(Interrupt)
33
- crawler.expects(:report_results)
34
- $stderr.expects(:puts).with("CTRL-C")
35
- crawler.crawl
36
- end
37
- end
22
+ it "returns nil if no grabber is specified" do
23
+ crawler = Relevance::Tarantula::Crawler.new
24
+ crawler.grab_log!.should == nil
25
+ end
38
26
 
39
- describe 'Relevance::Tarantula::Crawler handle_form_results' do
40
- it 'captures the result values (bugfix)' do
41
- response = stub_everything
42
- result_args = {:url => :action_stub,
43
- :data => 'nil',
44
- :response => response,
45
- :referrer => :action_stub,
46
- :log => nil,
47
- :method => :stub_method,
48
- :test_name => nil}
49
- result = Relevance::Tarantula::Result.new(result_args)
50
- Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
51
- crawler = Relevance::Tarantula::Crawler.new
52
- crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
53
- response)
54
- end
55
- end
56
-
57
- describe 'Relevance::Tarantula::Crawler#crawl' do
58
- it 'queues the first url, does crawl, and then reports results' do
59
- crawler = Relevance::Tarantula::Crawler.new
60
- crawler.expects(:queue_link).with("/foobar")
61
- crawler.expects(:do_crawl)
62
- crawler.expects(:report_results)
63
- crawler.crawl("/foobar")
27
+ it "returns grabber.grab if grabber is specified" do
28
+ crawler = Relevance::Tarantula::Crawler.new
29
+ crawler.log_grabber = stub(:grab! => "fake log entry")
30
+ crawler.grab_log!.should == "fake log entry"
31
+ end
32
+
64
33
  end
65
34
 
66
- it 'reports results even if the crawl fails' do
67
- crawler = Relevance::Tarantula::Crawler.new
68
- crawler.expects(:do_crawl).raises(RuntimeError)
69
- crawler.expects(:report_results)
70
- lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
71
- end
72
- end
73
-
74
- describe 'Relevance::Tarantula::Crawler queuing' do
75
- it 'queues and remembers links' do
76
- crawler = Relevance::Tarantula::Crawler.new
77
- crawler.expects(:transform_url).with("/url").returns("/transformed")
78
- crawler.queue_link("/url")
79
- crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
80
- crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
35
+ describe "interrupt" do
36
+
37
+ it 'catches interruption and writes the partial report' do
38
+ crawler = Relevance::Tarantula::Crawler.new
39
+ crawler.stubs(:queue_link)
40
+ crawler.stubs(:do_crawl).raises(Interrupt)
41
+ crawler.expects(:report_results)
42
+ $stderr.expects(:puts).with("CTRL-C")
43
+ crawler.crawl
44
+ end
45
+
81
46
  end
82
47
 
83
- it 'queues and remembers forms' do
84
- crawler = Relevance::Tarantula::Crawler.new
85
- form = Hpricot('<form action="/action" method="post"/>').at('form')
86
- signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
87
- crawler.queue_form(form)
88
- crawler.forms_to_crawl.size.should == 1
89
- crawler.form_signatures_queued.should == Set.new([signature])
48
+ describe 'handle_form_results' do
49
+
50
+ it 'captures the result values (bugfix)' do
51
+ response = stub_everything
52
+ result_args = {:url => :action_stub,
53
+ :data => 'nil',
54
+ :response => response,
55
+ :referrer => :action_stub,
56
+ :log => nil,
57
+ :method => :stub_method,
58
+ :test_name => nil}
59
+ result = Relevance::Tarantula::Result.new(result_args)
60
+ Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
61
+ crawler = Relevance::Tarantula::Crawler.new
62
+ crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
63
+ response)
64
+ end
65
+
90
66
  end
91
67
 
92
- it 'remembers link referrer if there is one' do
93
- crawler = Relevance::Tarantula::Crawler.new
94
- crawler.queue_link("/url", "/some-referrer")
95
- crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
68
+ describe "crawl" do
69
+
70
+ it 'queues the first url, does crawl, and then reports results' do
71
+ crawler = Relevance::Tarantula::Crawler.new
72
+ crawler.expects(:queue_link).with("/foobar")
73
+ crawler.expects(:do_crawl)
74
+ crawler.expects(:report_results)
75
+ crawler.crawl("/foobar")
76
+ end
77
+
78
+ it 'reports results even if the crawl fails' do
79
+ crawler = Relevance::Tarantula::Crawler.new
80
+ crawler.expects(:do_crawl).raises(RuntimeError)
81
+ crawler.expects(:report_results)
82
+ lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
83
+ end
84
+
96
85
  end
97
86
 
98
- end
87
+ describe "queueing" do
99
88
 
100
- describe 'Relevance::Tarantula::Crawler#report_results' do
101
- it "delegates to generate_reports" do
102
- crawler = Relevance::Tarantula::Crawler.new
103
- crawler.expects(:generate_reports)
104
- crawler.report_results
105
- end
106
- end
89
+ it 'queues and remembers links' do
90
+ crawler = Relevance::Tarantula::Crawler.new
91
+ crawler.expects(:transform_url).with("/url").returns("/transformed")
92
+ crawler.queue_link("/url")
93
+ crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
94
+ crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
95
+ end
107
96
 
108
- describe 'Relevance::Tarantula::Crawler#crawling' do
97
+ it 'queues and remembers forms' do
98
+ crawler = Relevance::Tarantula::Crawler.new
99
+ form = Hpricot('<form action="/action" method="post"/>').at('form')
100
+ signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
101
+ crawler.queue_form(form)
102
+ crawler.forms_to_crawl.size.should == 1
103
+ crawler.form_signatures_queued.should == Set.new([signature])
104
+ end
109
105
 
110
- it "converts ActiveRecord::RecordNotFound into a 404" do
111
- (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
112
- crawler = Relevance::Tarantula::Crawler.new
113
- crawler.proxy = proxy
114
- response = crawler.crawl_form stub_everything(:method => nil)
115
- response.code.should == "404"
116
- response.content_type.should == "text/plain"
117
- response.body.should == "ActiveRecord::RecordNotFound"
106
+ it 'remembers link referrer if there is one' do
107
+ crawler = Relevance::Tarantula::Crawler.new
108
+ crawler.queue_link("/url", "/some-referrer")
109
+ crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
110
+ end
111
+
118
112
  end
113
+
114
+ describe "crawling" do
115
+
116
+ it "converts ActiveRecord::RecordNotFound into a 404" do
117
+ (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
118
+ crawler = Relevance::Tarantula::Crawler.new
119
+ crawler.proxy = proxy
120
+ response = crawler.crawl_form stub_everything(:method => nil)
121
+ response.code.should == "404"
122
+ response.content_type.should == "text/plain"
123
+ response.body.should == "ActiveRecord::RecordNotFound"
124
+ end
119
125
 
120
- it "does four things with each link: get, log, handle, and blip" do
121
- crawler = Relevance::Tarantula::Crawler.new
122
- crawler.proxy = stub
123
- response = stub(:code => "200")
124
- crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
125
- crawler.proxy.expects(:get).returns(response).times(2)
126
- crawler.expects(:log).times(2)
127
- crawler.expects(:handle_link_results).times(2)
128
- crawler.expects(:blip).times(2)
129
- crawler.crawl_queued_links
130
- crawler.links_to_crawl.should == []
131
- end
126
+ it "does four things with each link: get, log, handle, and blip" do
127
+ crawler = Relevance::Tarantula::Crawler.new
128
+ crawler.proxy = stub
129
+ response = stub(:code => "200")
130
+ crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
131
+ crawler.proxy.expects(:get).returns(response).times(2)
132
+ crawler.expects(:log).times(2)
133
+ crawler.expects(:handle_link_results).times(2)
134
+ crawler.expects(:blip).times(2)
135
+ crawler.crawl_queued_links
136
+ crawler.links_to_crawl.should == []
137
+ end
138
+
139
+ it "invokes queued forms, logs responses, and calls handlers" do
140
+ crawler = Relevance::Tarantula::Crawler.new
141
+ crawler.forms_to_crawl << stub_everything(:method => "get",
142
+ :action => "/foo",
143
+ :data => "some data",
144
+ :to_s => "stub")
145
+ crawler.proxy = stub_everything(:send => stub(:code => "200" ))
146
+ crawler.expects(:log).with("Response 200 for stub")
147
+ crawler.expects(:blip)
148
+ crawler.crawl_queued_forms
149
+ end
150
+
151
+ it "breaks out early if a timeout is set" do
152
+ crawler = Relevance::Tarantula::Crawler.new
153
+ stub_puts_and_print(crawler)
154
+ crawler.proxy = stub
155
+ response = stub(:code => "200")
156
+ crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
157
+ crawler.proxy.expects(:get).returns(response).times(4)
158
+ crawler.forms_to_crawl << stub_everything(:method => "post",
159
+ :action => "/foo",
160
+ :data => "some data",
161
+ :to_s => "stub")
162
+ crawler.proxy.expects(:post).returns(response).times(2)
163
+ crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
164
+ crawler.times_to_crawl = 2
165
+ crawler.crawl
166
+
167
+ end
168
+
169
+ it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
170
+ crawler = Relevance::Tarantula::Crawler.new
171
+ stub_puts_and_print(crawler)
172
+ crawler.proxy = stub
173
+ response = stub(:code => "200")
174
+ crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
175
+ crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
176
+ crawler.forms_to_crawl << stub_everything(:method => "post",
177
+ :action => "/foo",
178
+ :data => "some data",
179
+ :to_s => "stub")
180
+ crawler.proxy.expects(:post).returns(response).times(2)
181
+ crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
182
+ crawler.times_to_crawl = 2
183
+ crawler.crawl
184
+ end
132
185
 
133
- it "invokes queued forms, logs responses, and calls handlers" do
134
- crawler = Relevance::Tarantula::Crawler.new
135
- crawler.forms_to_crawl << stub_everything(:method => "get",
136
- :action => "/foo",
137
- :data => "some data",
138
- :to_s => "stub")
139
- crawler.proxy = stub_everything(:send => stub(:code => "200" ))
140
- crawler.expects(:log).with("Response 200 for stub")
141
- crawler.expects(:blip)
142
- crawler.crawl_queued_forms
143
186
  end
144
187
 
145
- it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
146
- crawler = Relevance::Tarantula::Crawler.new
147
- stub_puts_and_print(crawler)
148
- crawler.proxy = stub
149
- response = stub(:code => "200")
150
- crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
151
- crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
152
- crawler.forms_to_crawl << stub_everything(:method => "post",
153
- :action => "/foo",
154
- :data => "some data",
155
- :to_s => "stub")
156
- crawler.proxy.expects(:post).returns(response).times(2)
157
- crawler.expects(:links_completed_count).returns(*(0..6).to_a).times(6)
158
- crawler.times_to_crawl = 2
159
- crawler.crawl
160
- end
161
- end
188
+ describe "report_results" do
162
189
 
163
- describe 'Crawler blip' do
164
- it "blips the current progress if !verbose" do
165
- crawler = Relevance::Tarantula::Crawler.new
166
- crawler.stubs(:verbose).returns false
167
- crawler.expects(:print).with("\r 0 of 0 links completed ")
168
- crawler.blip
169
- end
170
- it "blips nothing if verbose" do
171
- crawler = Relevance::Tarantula::Crawler.new
172
- crawler.stubs(:verbose).returns true
173
- crawler.expects(:print).never
174
- crawler.blip
190
+ it "delegates to generate_reports" do
191
+ crawler = Relevance::Tarantula::Crawler.new
192
+ crawler.expects(:generate_reports)
193
+ crawler.report_results
194
+ end
195
+
175
196
  end
176
- end
197
+
198
+ describe "blip" do
177
199
 
178
- describe 'Relevance::Tarantula::Crawler' do
179
- it "is finished when the links and forms are crawled" do
180
- crawler = Relevance::Tarantula::Crawler.new
181
- crawler.finished?.should == true
200
+ it "blips the current progress if !verbose" do
201
+ crawler = Relevance::Tarantula::Crawler.new
202
+ crawler.stubs(:verbose).returns false
203
+ crawler.stubs(:timeout_if_too_long)
204
+ crawler.expects(:print).with("\r 0 of 0 links completed ")
205
+ crawler.blip
206
+ end
207
+
208
+ it "blips nothing if verbose" do
209
+ crawler = Relevance::Tarantula::Crawler.new
210
+ crawler.stubs(:verbose).returns true
211
+ crawler.expects(:print).never
212
+ crawler.blip
213
+ end
214
+
182
215
  end
216
+
217
+ describe "finished?" do
183
218
 
184
- it "isn't finished when links remain" do
185
- crawler = Relevance::Tarantula::Crawler.new
186
- crawler.links_to_crawl = [:stub_link]
187
- crawler.finished?.should == false
188
- end
219
+ it "is finished when the links and forms are crawled" do
220
+ crawler = Relevance::Tarantula::Crawler.new
221
+ crawler.finished?.should == true
222
+ end
189
223
 
190
- it "isn't finished when links remain" do
191
- crawler = Relevance::Tarantula::Crawler.new
192
- crawler.forms_to_crawl = [:stub_form]
193
- crawler.finished?.should == false
224
+ it "isn't finished when links remain" do
225
+ crawler = Relevance::Tarantula::Crawler.new
226
+ crawler.links_to_crawl = [:stub_link]
227
+ crawler.finished?.should == false
228
+ end
229
+
230
+ it "isn't finished when links remain" do
231
+ crawler = Relevance::Tarantula::Crawler.new
232
+ crawler.forms_to_crawl = [:stub_form]
233
+ crawler.finished?.should == false
234
+ end
235
+
194
236
  end
195
-
237
+
196
238
  it "crawls links and forms again and again until finished?==true" do
197
239
  crawler = Relevance::Tarantula::Crawler.new
198
240
  crawler.expects(:finished?).times(3).returns(false, false, true)
199
241
  crawler.expects(:crawl_queued_links).times(2)
200
242
  crawler.expects(:crawl_queued_forms).times(2)
201
- crawler.do_crawl
243
+ crawler.do_crawl(1)
202
244
  end
203
245
 
204
246
  it "asks each reporter to write its report in report_dir" do
@@ -225,72 +267,120 @@ describe 'Relevance::Tarantula::Crawler' do
225
267
  crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
226
268
  end
227
269
 
228
- end
229
-
230
- describe "Crawler link skipping" do
231
- before do
232
- @crawler = Relevance::Tarantula::Crawler.new
233
- end
234
-
235
- it "skips links that are too long" do
236
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
237
- @crawler.max_url_length = 2
238
- @crawler.expects(:log).with("Skipping long url /foo")
239
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
240
- end
270
+ describe "link skipping" do
271
+
272
+ before { @crawler = Relevance::Tarantula::Crawler.new }
273
+
274
+ it "skips links that are too long" do
275
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
276
+ @crawler.max_url_length = 2
277
+ @crawler.expects(:log).with("Skipping long url /foo")
278
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
279
+ end
241
280
 
242
- it "skips outbound links (those that begin with http)" do
243
- @crawler.expects(:log).with("Skipping http-anything")
244
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
245
- end
281
+ it "skips outbound links (those that begin with http)" do
282
+ @crawler.expects(:log).with("Skipping http-anything")
283
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
284
+ end
246
285
 
247
- it "skips javascript links (those that begin with javascript)" do
248
- @crawler.expects(:log).with("Skipping javascript-anything")
249
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
250
- end
286
+ it "skips javascript links (those that begin with javascript)" do
287
+ @crawler.expects(:log).with("Skipping javascript-anything")
288
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
289
+ end
251
290
 
252
- it "skips mailto links (those that begin with http)" do
253
- @crawler.expects(:log).with("Skipping mailto-anything")
254
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
255
- end
291
+ it "skips mailto links (those that begin with http)" do
292
+ @crawler.expects(:log).with("Skipping mailto-anything")
293
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
294
+ end
256
295
 
257
- it 'skips blank links' do
258
- @crawler.queue_link(nil)
259
- @crawler.links_to_crawl.should == []
260
- @crawler.queue_link("")
261
- @crawler.links_to_crawl.should == []
262
- end
296
+ it 'skips blank links' do
297
+ @crawler.queue_link(nil)
298
+ @crawler.links_to_crawl.should == []
299
+ @crawler.queue_link("")
300
+ @crawler.links_to_crawl.should == []
301
+ end
263
302
 
264
- it "logs and skips links that match a pattern" do
265
- @crawler.expects(:log).with("Skipping /the-red-button")
266
- @crawler.skip_uri_patterns << /red-button/
267
- @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
268
- @crawler.queue_link("/the-red-button").should == nil
269
- end
303
+ it "logs and skips links that match a pattern" do
304
+ @crawler.expects(:log).with("Skipping /the-red-button")
305
+ @crawler.skip_uri_patterns << /red-button/
306
+ @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
307
+ @crawler.queue_link("/the-red-button").should == nil
308
+ end
270
309
 
271
- it "logs and skips form submissions that match a pattern" do
272
- @crawler.expects(:log).with("Skipping /reset-password-form")
273
- @crawler.skip_uri_patterns << /reset-password/
274
- fs = stub_everything(:action => "/reset-password-form")
275
- @crawler.should_skip_form_submission?(fs).should == true
310
+ it "logs and skips form submissions that match a pattern" do
311
+ @crawler.expects(:log).with("Skipping /reset-password-form")
312
+ @crawler.skip_uri_patterns << /reset-password/
313
+ fs = stub_everything(:action => "/reset-password-form")
314
+ @crawler.should_skip_form_submission?(fs).should == true
315
+ end
276
316
  end
277
- end
317
+
318
+ describe "allow_nnn_for" do
278
319
 
279
- describe "allow_nnn_for" do
280
- it "installs result as a response_code_handler" do
281
- crawler = Relevance::Tarantula::Crawler.new
282
- crawler.response_code_handler.should == Relevance::Tarantula::Result
320
+ it "installs result as a response_code_handler" do
321
+ crawler = Relevance::Tarantula::Crawler.new
322
+ crawler.response_code_handler.should == Relevance::Tarantula::Result
323
+ end
324
+
325
+ it "delegates to the response_code_handler" do
326
+ crawler = Relevance::Tarantula::Crawler.new
327
+ (response_code_handler = mock).expects(:allow_404_for).with(:stub)
328
+ crawler.response_code_handler = response_code_handler
329
+ crawler.allow_404_for(:stub)
330
+ end
331
+
332
+ it "chains up to super for method_missing" do
333
+ crawler = Relevance::Tarantula::Crawler.new
334
+ lambda{crawler.foo}.should raise_error(NoMethodError)
335
+ end
336
+
283
337
  end
284
338
 
285
- it "delegates to the response_code_handler" do
286
- crawler = Relevance::Tarantula::Crawler.new
287
- (response_code_handler = mock).expects(:allow_404_for).with(:stub)
288
- crawler.response_code_handler = response_code_handler
289
- crawler.allow_404_for(:stub)
339
+ describe "timeouts" do
340
+
341
+ it "sets start and end times for a single crawl" do
342
+ start_time = Time.parse("March 1st, 2008 10:00am")
343
+ end_time = Time.parse("March 1st, 2008 10:10am")
344
+ Time.stubs(:now).returns(start_time, end_time)
345
+
346
+ crawler = Relevance::Tarantula::Crawler.new
347
+ stub_puts_and_print(crawler)
348
+ crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
349
+ crawler.crawl
350
+ crawler.crawl_start_times.first.should == start_time
351
+ crawler.crawl_end_times.first.should == end_time
352
+ end
353
+
354
+ it "has elasped time for a crawl" do
355
+ start_time = Time.parse("March 1st, 2008 10:00am")
356
+ elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am")
357
+ Time.stubs(:now).returns(start_time, elasped_time_check)
358
+
359
+ crawler = Relevance::Tarantula::Crawler.new
360
+ stub_puts_and_print(crawler)
361
+ crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
362
+ crawler.crawl
363
+ crawler.elasped_time_for_pass(0).should == 600.seconds
364
+ end
365
+
366
+ it "raises out of the crawl if elasped time is greater then the crawl timeout" do
367
+ start_time = Time.parse("March 1st, 2008 10:00am")
368
+ elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am")
369
+ Time.stubs(:now).returns(start_time, elasped_time_check)
370
+
371
+ crawler = Relevance::Tarantula::Crawler.new
372
+ crawler.crawl_timeout = 5.minutes
373
+
374
+ crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
375
+ crawler.proxy = stub
376
+ crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))
377
+
378
+ stub_puts_and_print(crawler)
379
+ lambda {
380
+ crawler.do_crawl(0)
381
+ }.should raise_error
382
+ end
383
+
290
384
  end
291
385
 
292
- it "chains up to super for method_missing" do
293
- crawler = Relevance::Tarantula::Crawler.new
294
- lambda{crawler.foo}.should raise_error(NoMethodError)
295
- end
296
- end
386
+ end
@@ -7,11 +7,13 @@ class Relevance::Tarantula::Crawler
7
7
  extend Forwardable
8
8
  include Relevance::Tarantula
9
9
 
10
+ class CrawlTimeout < RuntimeError; end
11
+
10
12
  attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
11
13
  :reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
12
14
  :form_signatures_queued, :max_url_length, :response_code_handler,
13
- :times_to_crawl, :fuzzers, :test_name
14
- attr_reader :transform_url_patterns, :referrers, :failures, :successes
15
+ :times_to_crawl, :fuzzers, :test_name, :crawl_timeout
16
+ attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times
15
17
 
16
18
  def initialize
17
19
  @max_url_length = 1024
@@ -22,6 +24,8 @@ class Relevance::Tarantula::Crawler
22
24
  @form_signatures_queued = Set.new
23
25
  @links_to_crawl = []
24
26
  @forms_to_crawl = []
27
+ @crawl_start_times, @crawl_end_times = [], []
28
+ @crawl_timeout = 20.minutes
25
29
  @referrers = {}
26
30
  @skip_uri_patterns = [
27
31
  /^javascript/,
@@ -53,13 +57,18 @@ class Relevance::Tarantula::Crawler
53
57
  orig_form_signatures_queued = @form_signatures_queued.dup
54
58
  orig_links_to_crawl = @links_to_crawl.dup
55
59
  orig_forms_to_crawl = @forms_to_crawl.dup
56
- @times_to_crawl.times do |i|
60
+ @times_to_crawl.times do |num|
57
61
  queue_link url
58
- do_crawl
59
-
60
- puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1
62
+
63
+ begin
64
+ do_crawl num
65
+ rescue CrawlTimeout => e
66
+ puts e.message
67
+ end
68
+
69
+ puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
61
70
 
62
- if i + 1 < @times_to_crawl
71
+ if num + 1 < @times_to_crawl
63
72
  @links_queued = orig_links_queued
64
73
  @form_signatures_queued = orig_form_signatures_queued
65
74
  @links_to_crawl = orig_links_to_crawl
@@ -77,19 +86,21 @@ class Relevance::Tarantula::Crawler
77
86
  @links_to_crawl.empty? && @forms_to_crawl.empty?
78
87
  end
79
88
 
80
- def do_crawl
89
+ def do_crawl(number)
81
90
  while (!finished?)
82
- crawl_queued_links
83
- crawl_queued_forms
91
+ @crawl_start_times << Time.now
92
+ crawl_queued_links(number)
93
+ crawl_queued_forms(number)
94
+ @crawl_end_times << Time.now
84
95
  end
85
96
  end
86
97
 
87
- def crawl_queued_links
98
+ def crawl_queued_links(number = 0)
88
99
  while (link = @links_to_crawl.pop)
89
100
  response = proxy.send(link.method, link.href)
90
101
  log "Response #{response.code} for #{link}"
91
102
  handle_link_results(link, response)
92
- blip
103
+ blip(number)
93
104
  end
94
105
  end
95
106
 
@@ -124,13 +135,17 @@ class Relevance::Tarantula::Crawler
124
135
  Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
125
136
  end
126
137
 
127
- def crawl_queued_forms
138
+ def crawl_queued_forms(number = 0)
128
139
  while (form = @forms_to_crawl.pop)
129
140
  response = crawl_form(form)
130
141
  handle_form_results(form, response)
131
- blip
142
+ blip(number)
132
143
  end
133
144
  end
145
+
146
+ def elasped_time_for_pass(num)
147
+ Time.now - crawl_start_times[num]
148
+ end
134
149
 
135
150
  def grab_log!
136
151
  @log_grabber && @log_grabber.grab!
@@ -234,9 +249,16 @@ class Relevance::Tarantula::Crawler
234
249
  total_links_count - links_remaining_count
235
250
  end
236
251
 
237
- def blip
252
+ def blip(number = 0)
238
253
  unless verbose
239
254
  print "\r #{links_completed_count} of #{total_links_count} links completed "
255
+ timeout_if_too_long(number)
256
+ end
257
+ end
258
+
259
+ def timeout_if_too_long(number = 0)
260
+ if elasped_time_for_pass(number) > crawl_timeout
261
+ raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
240
262
  end
241
263
  end
242
264
  end
@@ -23,9 +23,9 @@
23
23
  and lives at <a href="http://github.com/relevance/tarantula">http://github.com/relevance/tarantula</a>.</p>
24
24
  <hr/>
25
25
  </div>
26
- <div id="page">
26
+ <div id="page">
27
27
  <div id="tabs-container">
28
- <ul class="tabs"></ul>
28
+ <ul class="tabs"> </ul>
29
29
  </div>
30
30
 
31
31
  <div id="results-container">
@@ -3,7 +3,7 @@ begin
3
3
  gem 'tidy'
4
4
  require 'tidy'
5
5
  rescue Gem::LoadError
6
- # tidy not available
6
+ puts "Tidy gem not available -- 'gem install tidy' to get it."
7
7
  end
8
8
 
9
9
  if defined? Tidy
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relevance-tarantula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Relevance, Inc.
@@ -9,10 +9,29 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-03-11 00:00:00 -07:00
12
+ date: 2009-04-06 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: htmlentities
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: hpricot
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
16
35
  description: A big hairy fuzzy spider that crawls your site, wreaking havoc
17
36
  email: opensource@thinkrelevance.com
18
37
  executables: []