codez-tarantula 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. data/.autotest +14 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CHANGELOG +64 -0
  5. data/DSL_EXAMPLES.md +120 -0
  6. data/Gemfile +2 -0
  7. data/LICENSE +20 -0
  8. data/README.rdoc +136 -0
  9. data/Rakefile +36 -0
  10. data/ci/rails2.gemfile +4 -0
  11. data/ci/rails3.gemfile +4 -0
  12. data/laf/images/header_bg.jpg +0 -0
  13. data/laf/images/logo.png +0 -0
  14. data/laf/images/tagline.png +0 -0
  15. data/laf/javascripts/jquery-1.2.3.js +3408 -0
  16. data/laf/javascripts/jquery-ui-tabs.js +890 -0
  17. data/laf/javascripts/jquery.tablesorter.js +861 -0
  18. data/laf/javascripts/tarantula.js +10 -0
  19. data/laf/stylesheets/tarantula.css +346 -0
  20. data/lib/relevance/core_extensions/ellipsize.rb +38 -0
  21. data/lib/relevance/core_extensions/file.rb +15 -0
  22. data/lib/relevance/core_extensions/metaclass.rb +78 -0
  23. data/lib/relevance/core_extensions/response.rb +14 -0
  24. data/lib/relevance/core_extensions/test_case.rb +21 -0
  25. data/lib/relevance/tarantula.rb +55 -0
  26. data/lib/relevance/tarantula/attack.rb +22 -0
  27. data/lib/relevance/tarantula/attack_handler.rb +43 -0
  28. data/lib/relevance/tarantula/basic_attack.rb +44 -0
  29. data/lib/relevance/tarantula/crawler.rb +271 -0
  30. data/lib/relevance/tarantula/detail.html.erb +81 -0
  31. data/lib/relevance/tarantula/form.rb +29 -0
  32. data/lib/relevance/tarantula/form_submission.rb +98 -0
  33. data/lib/relevance/tarantula/html_document_handler.rb +42 -0
  34. data/lib/relevance/tarantula/html_report_helper.rb +46 -0
  35. data/lib/relevance/tarantula/html_reporter.rb +111 -0
  36. data/lib/relevance/tarantula/index.html.erb +37 -0
  37. data/lib/relevance/tarantula/invalid_html_handler.rb +27 -0
  38. data/lib/relevance/tarantula/io_reporter.rb +40 -0
  39. data/lib/relevance/tarantula/link.rb +105 -0
  40. data/lib/relevance/tarantula/log_grabber.rb +22 -0
  41. data/lib/relevance/tarantula/rails_integration_proxy.rb +90 -0
  42. data/lib/relevance/tarantula/recording.rb +12 -0
  43. data/lib/relevance/tarantula/response.rb +19 -0
  44. data/lib/relevance/tarantula/result.rb +83 -0
  45. data/lib/relevance/tarantula/test_report.html.erb +32 -0
  46. data/lib/relevance/tarantula/tidy_handler.rb +35 -0
  47. data/lib/relevance/tarantula/transform.rb +21 -0
  48. data/lib/relevance/tarantula/version.rb +5 -0
  49. data/lib/relevance/tasks/tarantula_tasks.rake +42 -0
  50. data/lib/tarantula-rails3.rb +9 -0
  51. data/spec/relevance/core_extensions/ellipsize_spec.rb +19 -0
  52. data/spec/relevance/core_extensions/file_spec.rb +7 -0
  53. data/spec/relevance/core_extensions/response_spec.rb +48 -0
  54. data/spec/relevance/core_extensions/test_case_spec.rb +19 -0
  55. data/spec/relevance/tarantula/attack_handler_spec.rb +29 -0
  56. data/spec/relevance/tarantula/basic_attack_spec.rb +12 -0
  57. data/spec/relevance/tarantula/crawler_spec.rb +409 -0
  58. data/spec/relevance/tarantula/form_spec.rb +50 -0
  59. data/spec/relevance/tarantula/form_submission_spec.rb +171 -0
  60. data/spec/relevance/tarantula/html_document_handler_spec.rb +43 -0
  61. data/spec/relevance/tarantula/html_report_helper_spec.rb +46 -0
  62. data/spec/relevance/tarantula/html_reporter_spec.rb +82 -0
  63. data/spec/relevance/tarantula/invalid_html_handler_spec.rb +33 -0
  64. data/spec/relevance/tarantula/io_reporter_spec.rb +11 -0
  65. data/spec/relevance/tarantula/link_spec.rb +132 -0
  66. data/spec/relevance/tarantula/log_grabber_spec.rb +26 -0
  67. data/spec/relevance/tarantula/rails_integration_proxy_spec.rb +100 -0
  68. data/spec/relevance/tarantula/result_spec.rb +85 -0
  69. data/spec/relevance/tarantula/tidy_handler_spec.rb +58 -0
  70. data/spec/relevance/tarantula/transform_spec.rb +20 -0
  71. data/spec/relevance/tarantula_spec.rb +23 -0
  72. data/spec/spec_helper.rb +43 -0
  73. data/tarantula.gemspec +25 -0
  74. data/template/tarantula_test.rb +22 -0
  75. data/vendor/xss-shield/MIT-LICENSE +20 -0
  76. data/vendor/xss-shield/README +76 -0
  77. data/vendor/xss-shield/init.rb +16 -0
  78. data/vendor/xss-shield/lib/xss_shield.rb +6 -0
  79. data/vendor/xss-shield/lib/xss_shield/erb_hacks.rb +111 -0
  80. data/vendor/xss-shield/lib/xss_shield/haml_hacks.rb +42 -0
  81. data/vendor/xss-shield/lib/xss_shield/safe_string.rb +47 -0
  82. data/vendor/xss-shield/lib/xss_shield/secure_helpers.rb +40 -0
  83. data/vendor/xss-shield/test/test_actionview_integration.rb +40 -0
  84. data/vendor/xss-shield/test/test_erb.rb +44 -0
  85. data/vendor/xss-shield/test/test_haml.rb +43 -0
  86. data/vendor/xss-shield/test/test_helpers.rb +25 -0
  87. data/vendor/xss-shield/test/test_safe_string.rb +55 -0
  88. metadata +247 -0
@@ -0,0 +1,21 @@
module Relevance
  module Tarantula

    # A Transform pairs a pattern (+from+) with a replacement (+to+) and
    # applies it to strings via #[]. The replacement may be a plain
    # gsub-style replacement value or a Proc, in which case the Proc is
    # invoked for each match (as a gsub block).
    class Transform
      attr_accessor :from, :to

      # from - pattern handed to String#gsub (Regexp or String)
      # to   - replacement value, or a Proc called per match
      def initialize(from, to)
        @from = from
        @to = to
      end

      # Apply the transform to +input+ and return the rewritten string.
      # Callable via txf[some_string], mirroring Hash/Proc access style.
      def [](input)
        if to.is_a?(Proc)
          # Proc replacements must be passed as a block so each match
          # is yielded to them individually.
          input.gsub(from, &to)
        else
          input.gsub(from, to)
        end
      end
    end

  end
end
@@ -0,0 +1,5 @@
module Relevance
  module Tarantula
    # Gem version. Frozen so the shared constant cannot be mutated in
    # place by callers (mutable string constants are a classic Ruby
    # footgun; freezing also lets the interpreter dedupe the literal).
    VERSION = "0.5.0".freeze
  end
end
@@ -0,0 +1,42 @@
require 'rake'
# Rake::TestTask lives in its own file and is NOT loaded by `require 'rake'`
# alone; without this the :test task raises NameError when invoked.
require 'rake/testtask'

# Rake tasks for running tarantula crawls against a host application:
#   tarantula:test   - run the tarantula test suite (output in tmp/tarantula)
#   tarantula:report - run the suite, then open the HTML results
#   tarantula:setup  - copy the template test into test/tarantula
namespace :tarantula do

  desc 'Run tarantula tests.'
  task :test do
    # Clear out results from any previous crawl before running.
    rm_rf "tmp/tarantula"
    Rake::TestTask.new(:tarantula_test) do |t|
      t.libs << 'test'
      t.pattern = 'test/tarantula/**/*_test.rb'
      t.verbose = true
    end

    Rake::Task[:tarantula_test].invoke
  end

  desc 'Run tarantula tests and open results in your browser.'
  task :report do
    begin
      Rake::Task['tarantula:test'].invoke
    rescue RuntimeError => e
      # Test failures still produce a report worth opening; log and continue.
      puts e.message
    end

    Dir.glob("tmp/tarantula/**/index.html") do |file|
      # The bare PLATFORM constant was removed in Ruby 1.9; RUBY_PLATFORM
      # is the portable equivalent. Multi-arg system avoids passing the
      # path through a shell.
      if RUBY_PLATFORM =~ /darwin/
        system("open", file)
      elsif RUBY_PLATFORM =~ /linux/
        system("firefox", file)
      else
        puts "You can view tarantula results at #{file}"
      end
    end
  end

  desc 'Generate a default tarantula test'
  task :setup do
    mkdir_p "test/tarantula"
    # Template ships three directories up from this rake file, in template/.
    template_path = File.expand_path(File.join(File.dirname(__FILE__), "../../..", "template", "tarantula_test.rb"))
    cp template_path, "test/tarantula/"
  end
end
@@ -0,0 +1,9 @@
module Relevance
  module Tarantula
    # Rails 3 integration point. Registering a Railtie makes the tarantula
    # rake tasks (tarantula:test, tarantula:report, tarantula:setup)
    # available to the host application's `rake` without any manual setup.
    class Railtie < ::Rails::Railtie
      # Rails runs this block when the app loads its rake tasks; the rake
      # file is resolved via the load path (lib/ of this gem).
      rake_tasks do
        load "relevance/tasks/tarantula_tasks.rake"
      end
    end
  end
end
@@ -0,0 +1,19 @@
require "spec_helper"

# Specs for the Object#ellipsize core extension: a nil-safe, human-friendly
# string preview used in reports. Non-strings are #inspect-ed first; long
# values are truncated to a limit and suffixed with "...".
describe "Relevance::CoreExtensions::Object#ellipsize" do
  it "converts nil to empty string" do
    nil.ellipsize.should == ""
  end

  it "doesn't touch short strings" do
    "hello".ellipsize.should == "hello"
  end

  it "calls inspect on non-strings" do
    [1,2,3].ellipsize.should == "[1, 2, 3]"
  end

  # The numeric argument is the maximum number of characters kept before
  # the "..." suffix is appended.
  it "shortens long strings and adds ..." do
    "long-string".ellipsize(5).should == "long-..."
  end
end
@@ -0,0 +1,7 @@
require "spec_helper"

# Spec for the File.extension core extension (see
# lib/relevance/core_extensions/file.rb).
describe "Relevance::CoreExtensions::File#extension" do
  it "should return the extension without the leading dot" do
    File.extension("foo.bar").should == "bar"
  end
end
@@ -0,0 +1,48 @@
require "spec_helper"

# Specs for the Response#html? core extension. The module is mixed into an
# OpenStruct so content_type can be assigned freely without a real
# framework response object.
describe "Relevance::CoreExtensions::Response#html?" do
  before do
    @response = OpenStruct.new
    @response.extend(Relevance::CoreExtensions::Response)
  end

  # Rails 2 responses expose content_type as a raw String (possibly with a
  # charset parameter appended).
  context 'when content_type is a String (Rails 2)' do
    it "should be html if the content-type is 'text/html'" do
      @response.content_type = "text/html"
      @response.should be_html
      @response.content_type = "text/html;charset=iso-8859-2"
      @response.should be_html
    end

    it "should not be html if the content-type isn't an html type" do
      @response.content_type = "text/plain"
      @response.should_not be_html
      @response.content_type = "application/pdf"
      @response.should_not be_html
    end
  end

  # Rails 3 responses expose content_type as a Mime::Type object; html?
  # must handle both representations.
  context 'when content_type is a Mime::Type (Rails 3)' do
    it "should be html if the content-type is 'text/html'" do
      @response.content_type = Mime::Type.new("text/html")
      @response.should be_html
      @response.content_type = Mime::Type.new("text/html;charset=iso-8859-2")
      @response.should be_html
    end

    it "should not be html if the content-type isn't an html type" do
      @response.content_type = Mime::Type.new("text/plain")
      @response.should_not be_html
      @response.content_type = Mime::Type.new("application/pdf")
      @response.should_not be_html
    end
  end

  # better ideas welcome, but be careful not to
  # castrate tarantula for proxies that don't set the content-type
  it "should pretend we have html if the content-type is nil" do
    @response.content_type = nil
    @response.should be_html
  end

end
@@ -0,0 +1,19 @@
require "spec_helper"

# Specs for the test-case core extensions (tarantula_crawler /
# tarantula_crawl helpers mixed into integration tests). The first two
# examples are parked as `pending`, presumably awaiting stubbing fixes —
# confirm before reviving them.
describe "TestCase extensions" do
  pending "can create the crawler" do
    Relevance::Tarantula::RailsIntegrationProxy.stubs(:rails_root).returns("STUB_RAILS_ROOT")
    Relevance::Tarantula::Crawler.any_instance.stubs(:rails_root).returns("STUB_RAILS_ROOT")
    tarantula_crawler(stub_everything)
  end

  pending "can crawl" do
    (crawler = mock).expects(:crawl).with("/foo")
    expects(:tarantula_crawler).returns(crawler)
    tarantula_crawl(:integration_test_stub, :url => "/foo")
  end

  # Guards the wiring: the extension module must already be an ancestor of
  # Rails' integration test base class when the suite loads.
  it "should get mixed into ActionController::IntegrationTest" do
    ActionController::IntegrationTest.ancestors.should include(Relevance::CoreExtensions::TestCaseExtensions)
  end
end
@@ -0,0 +1,29 @@
require "spec_helper"

# Specs for AttackHandler: after a form is submitted with attack payloads,
# the handler scans response bodies for each attack's :output marker and
# flags a failed Result when the marker leaks through unescaped.
describe "Relevance::Tarantula::AttackHandler" do
  before do
    @handler = Relevance::Tarantula::AttackHandler.new
    attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code', :output => '<bad>'})
    @handler.stubs(:attacks).returns([attack])
  end

  # nil means "nothing to report" — the crawler treats the page as clean.
  it "lets safe documents through" do
    result = @handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo">good</a>')))
    result.should == nil
  end

  it "detects the supplied code" do
    result = @handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo"><bad></a>')))
    result.success.should == false
  end
end

# An Attack with no :output has no marker to scan for, so the handler can
# never match it against a response body.
describe "Attacks without an output specified" do
  it "never matches anything" do
    handler = Relevance::Tarantula::AttackHandler.new
    attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code'})
    Relevance::Tarantula::FormSubmission.stubs(:attacks).returns([attack])
    result = handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo">good</a>')))
    result.should == nil
  end
end
@@ -0,0 +1,12 @@
require "spec_helper"

# Specs for BasicAttack's random input generators.
describe Relevance::Tarantula::BasicAttack do
  before do
    @attack = Relevance::Tarantula::BasicAttack.new
  end

  it "can generate a random whole number" do
    @attack.random_whole_number.should >= 0
    # Fixnum was deprecated in Ruby 2.4 (unified into Integer) and removed
    # in 3.2; Integer matches the same values on every supported Ruby.
    Integer.should === @attack.random_whole_number
  end
end
@@ -0,0 +1,409 @@
require "spec_helper"

# Specs for Relevance::Tarantula::Crawler — queueing of links/forms,
# the crawl loop, reporting, progress output ("blip"), skip rules and
# crawl timeouts. Uses mocha stubs/expectations throughout; helpers such
# as make_link, make_form and stub_puts_and_print are presumably defined
# in spec_helper — confirm there before changing call shapes.
describe Relevance::Tarantula::Crawler do

  describe "transform_url" do

    before { @crawler = Relevance::Tarantula::Crawler.new }

    # "&#109;..." is the HTML-entity obfuscation trick for hiding
    # mailto: addresses from scrapers; the crawler must decode it so the
    # mailto skip rule can fire.
    it "de-obfuscates unicode obfuscated urls" do
      obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;"
      @crawler.transform_url(obfuscated_mailto).should == "mailto:"
    end

    it "strips the trailing name portion of a link" do
      @crawler.transform_url('http://host/path#name').should == 'http://host/path'
    end
  end


  describe "log grabbing" do

    it "returns nil if no grabber is specified" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.grab_log!.should == nil
    end

    it "returns grabber.grab if grabber is specified" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.log_grabber = stub(:grab! => "fake log entry")
      crawler.grab_log!.should == "fake log entry"
    end

  end

  describe "interrupt" do

    # CTRL-C during a long crawl should still emit the partial report
    # rather than discarding everything gathered so far.
    it 'catches interruption and writes the partial report' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:queue_link)
      crawler.stubs(:do_crawl).raises(Interrupt)
      crawler.expects(:report_results)
      $stderr.expects(:puts).with("CTRL-C")
      crawler.crawl
    end

  end

  describe 'handle_form_results' do

    # Regression test: every field of the form submission outcome must be
    # forwarded into the Result constructor exactly as captured.
    it 'captures the result values (bugfix)' do
      response = stub_everything
      result_args = {:url => :action_stub,
                     :data => 'nil',
                     :response => response,
                     :referrer => :action_stub,
                     :log => nil,
                     :method => :stub_method,
                     :test_name => nil}
      result = Relevance::Tarantula::Result.new(result_args)
      Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.handle_form_results(stub_everything(:meth => :stub_method, :action => :action_stub),
                                  response)
    end

  end

  describe "crawl" do

    it 'queues the first url, does crawl, and then reports results' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:queue_link).with("/foobar")
      crawler.expects(:do_crawl)
      crawler.expects(:report_results)
      crawler.crawl("/foobar")
    end

    # Reporting lives in an ensure-style path: even a crashing crawl must
    # write its report before the error propagates.
    it 'reports results even if the crawl fails' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:do_crawl).raises(RuntimeError)
      crawler.expects(:report_results)
      lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
    end

  end

  describe "queueing" do

    it 'queues and remembers links' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:transform_url).with("/url").returns("/transformed").at_least_once
      crawler.queue_link("/url")
      # TODO not sure this is the best way to test this anymore; relying on result of transform in both actual and expected
      crawler.crawl_queue.should == [make_link("/url", crawler)]
      crawler.links_queued.should == Set.new([make_link("/url", crawler)])
    end

    # Forms are deduplicated by signature, not by raw markup.
    it 'queues and remembers forms' do
      crawler = Relevance::Tarantula::Crawler.new
      form = Hpricot('<form action="/action" method="post"/>').at('form')
      signature = Relevance::Tarantula::FormSubmission.new(make_form(form)).signature
      crawler.queue_form(form)
      crawler.crawl_queue.size.should == 1
      crawler.form_signatures_queued.should == Set.new([signature])
    end

    it "passes link, self, and referrer when creating Link objects" do
      crawler = Relevance::Tarantula::Crawler.new
      Relevance::Tarantula::Link.expects(:new).with('/url', crawler, '/some-referrer')
      crawler.expects(:append_to_queue)
      crawler.stubs(:should_skip_link?)
      crawler.queue_link('/url', '/some-referrer')
    end

    # Destructive requests are deferred so DELETEs don't remove records
    # that other queued links still need to visit.
    it "queues DELETE requests at the end, everything else before" do
      crawler = Relevance::Tarantula::Crawler.new
      create_link = Hpricot('<a href="/create" data-method="post">Create</a>').at('a')
      crawler.queue_link(create_link)
      create_link = Relevance::Tarantula::Link.new(create_link, crawler, nil)
      crawler.crawl_queue.should == [create_link]
      delete_link = Hpricot('<a href="/destroy" data-method="delete">Destroy</a>').at('a')
      crawler.queue_link(delete_link)
      delete_link = Relevance::Tarantula::Link.new(delete_link, crawler, nil)
      crawler.crawl_queue.should == [create_link, delete_link]
      get_link = Hpricot('<a href="/read">Show</a>').at('a')
      crawler.queue_link(get_link)
      get_link = Relevance::Tarantula::Link.new(get_link, crawler, nil)
      crawler.crawl_queue.should == [create_link, get_link, delete_link]
    end

    it "queues is crawled from tip not tail" do
      crawler = Relevance::Tarantula::Crawler.new

      create_link = Hpricot('<a href="/create" data-method="post">Create</a>').at('a')
      crawler.queue_link(create_link)
      delete_link = Hpricot('<a href="/destroy" data-method="delete">Destroy</a>').at('a')
      crawler.queue_link(delete_link)
      get_link = Hpricot('<a href="/read">Show</a>').at('a')
      crawler.queue_link(get_link)

      # mocha sequence: follows must happen in exactly this order.
      q = sequence('queue')
      response = stub(:code => "200")
      crawler.expects(:follow).with('post', '/create').returns(response).in_sequence(q)
      crawler.expects(:follow).with('get', '/read').returns(response).in_sequence(q)
      crawler.expects(:follow).with('delete', '/destroy').returns(response).in_sequence(q)
      crawler.do_crawl(0)
    end
  end

  describe "crawling" do
    before do
      @form = Hpricot('<form action="/action" method="post"/>').at('form')
    end

    it "does two things with each link: crawl and blip" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.proxy = stub
      crawler.crawl_queue = links = [make_link("/foo1", crawler), make_link("/foo2", crawler)]

      links.each{|link| link.expects(:crawl)}
      crawler.expects(:blip).times(2)

      crawler.crawl_the_queue
      crawler.crawl_queue.should == []
    end

    it "invokes queued forms, logs responses, and calls handlers" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue << Relevance::Tarantula::FormSubmission.new(make_form(@form, crawler))
      crawler.expects(:submit).returns(stub(:code => "200"))
      crawler.expects(:blip)
      crawler.crawl_the_queue
    end

    # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout.
    it "breaks out early if a timeout is set"

    it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      response = stub(:code => "200")
      crawler.queue_link('/foo')
      crawler.expects(:follow).returns(response).times(4) # (stub and "/") * 2
      crawler.queue_form(@form)
      crawler.expects(:submit).returns(response).times(2)
      crawler.expects(:blip).times(6)
      crawler.times_to_crawl = 2
      crawler.crawl
    end

  end

  describe "report_results" do
    it "prints a final summary line" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:generate_reports)
      crawler.expects(:total_links_count).returns(42)
      crawler.expects(:puts).with("Crawled 42 links and forms.")
      crawler.report_results
    end

    it "delegates to generate_reports" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:puts)
      crawler.expects(:generate_reports)
      crawler.report_results
    end

  end

  # "blip" is the single-line progress indicator rewritten in place with
  # a carriage return; it must stay silent in verbose mode and when stdout
  # is not a terminal (e.g. CI logs).
  describe "blip" do

    it "blips the current progress if !verbose" do
      $stdout.stubs(:tty?).returns(true)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns false
      crawler.stubs(:timeout_if_too_long)
      crawler.expects(:print).with("\r 0 of 0 links completed ")
      crawler.blip
    end

    it "suppresses the blip message if not writing to a tty" do
      $stdout.stubs(:tty?).returns(false)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns false
      crawler.stubs(:timeout_if_too_long)
      crawler.expects(:print).never
      crawler.blip
    end

    it "blips nothing if verbose" do
      $stdout.stubs(:tty?).returns(true)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns true
      crawler.expects(:print).never
      crawler.blip
    end

  end

  describe "finished?" do

    it "is finished when the links and forms are crawled" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.finished?.should == true
    end

    it "isn't finished when links remain" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue = [:stub_link]
      crawler.finished?.should == false
    end

    it "isn't finished when forms remain" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue = [:stub_form]
      crawler.finished?.should == false
    end

  end

  it "crawls links and forms again and again until finished?==true" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.expects(:finished?).times(3).returns(false, false, true)
    crawler.expects(:crawl_the_queue).times(2)
    crawler.do_crawl(1)
  end

  it "asks each reporter to write its report in report_dir" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.stubs(:report_dir).returns(test_output_dir)
    reporter = stub_everything
    reporter.expects(:report)
    reporter.expects(:finish_report)
    crawler.reporters = [reporter]
    crawler.save_result stub(:code => "404", :url => "/uh-oh")
    crawler.generate_reports
  end

  it "builds a report dir relative to rails root" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.expects(:rails_root).returns("faux_rails_root")
    crawler.report_dir.should == "faux_rails_root/tmp/tarantula"
  end

  it "skips links that are already queued" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.should_skip_link?(make_link("/foo")).should == false
    crawler.queue_link("/foo").should == make_link("/foo")
    crawler.should_skip_link?(make_link("/foo")).should == true
  end

  describe "link skipping" do

    before { @crawler = Relevance::Tarantula::Crawler.new }

    it "skips links that are too long" do
      @crawler.should_skip_link?(make_link("/foo")).should == false
      @crawler.max_url_length = 2
      @crawler.expects(:log).with("Skipping long url /foo")
      @crawler.should_skip_link?(make_link("/foo")).should == true
    end

    it "skips outbound links (those that begin with http)" do
      @crawler.expects(:log).with("Skipping http-anything")
      @crawler.should_skip_link?(make_link("http-anything")).should == true
    end

    it "skips javascript links (those that begin with javascript)" do
      @crawler.expects(:log).with("Skipping javascript-anything")
      @crawler.should_skip_link?(make_link("javascript-anything")).should == true
    end

    # NOTE(review): description says "begin with http" but the example
    # exercises mailto — looks like a copy/paste slip in the description.
    it "skips mailto links (those that begin with http)" do
      @crawler.expects(:log).with("Skipping mailto-anything")
      @crawler.should_skip_link?(make_link("mailto-anything")).should == true
    end

    it 'skips blank links' do
      @crawler.queue_link(nil)
      @crawler.crawl_queue.should == []
      @crawler.queue_link("")
      @crawler.crawl_queue.should == []
    end

    it "logs and skips links that match a pattern" do
      @crawler.expects(:log).with("Skipping /the-red-button")
      @crawler.skip_uri_patterns << /red-button/
      @crawler.queue_link("/blue-button").should == make_link("/blue-button")
      @crawler.queue_link("/the-red-button").should == nil
    end

    it "logs and skips form submissions that match a pattern" do
      @crawler.expects(:log).with("Skipping /reset-password-form")
      @crawler.skip_uri_patterns << /reset-password/
      fs = stub_everything(:action => "/reset-password-form")
      @crawler.should_skip_form_submission?(fs).should == true
    end
  end

  # allow_404_for etc. are generated per status code and forwarded to the
  # pluggable response_code_handler (Result by default) via method_missing.
  describe "allow_nnn_for" do

    it "installs result as a response_code_handler" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.response_code_handler.should == Relevance::Tarantula::Result
    end

    it "delegates to the response_code_handler" do
      crawler = Relevance::Tarantula::Crawler.new
      (response_code_handler = mock).expects(:allow_404_for).with(:stub)
      crawler.response_code_handler = response_code_handler
      crawler.allow_404_for(:stub)
    end

    it "chains up to super for method_missing" do
      crawler = Relevance::Tarantula::Crawler.new
      lambda{crawler.foo}.should raise_error(NoMethodError)
    end

  end

  # NOTE: "elasped" below is a typo baked into the public API
  # (elasped_time_for_pass); renaming it is a breaking change, not a
  # spec-side fix.
  describe "timeouts" do

    it "sets start and end times for a single crawl" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      end_time = Time.parse("March 1st, 2008 10:10am")
      Time.stubs(:now).returns(start_time, end_time)

      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
      crawler.crawl
      crawler.crawl_start_times.first.should == start_time
      crawler.crawl_end_times.first.should == end_time
    end

    it "has elasped time for a crawl" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am")
      Time.stubs(:now).returns(start_time, elasped_time_check)

      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
      crawler.crawl
      crawler.elasped_time_for_pass(0).should == 600.seconds
    end

    it "raises out of the crawl if elasped time is greater then the crawl timeout" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am")
      Time.stubs(:now).returns(start_time, elasped_time_check)

      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_timeout = 5.minutes

      crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
      crawler.proxy = stub
      crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))

      stub_puts_and_print(crawler)
      lambda {
        crawler.do_crawl(0)
      }.should raise_error
    end

  end

end