codez-tarantula 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. data/.autotest +14 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CHANGELOG +64 -0
  5. data/DSL_EXAMPLES.md +120 -0
  6. data/Gemfile +2 -0
  7. data/LICENSE +20 -0
  8. data/README.rdoc +136 -0
  9. data/Rakefile +36 -0
  10. data/ci/rails2.gemfile +4 -0
  11. data/ci/rails3.gemfile +4 -0
  12. data/laf/images/header_bg.jpg +0 -0
  13. data/laf/images/logo.png +0 -0
  14. data/laf/images/tagline.png +0 -0
  15. data/laf/javascripts/jquery-1.2.3.js +3408 -0
  16. data/laf/javascripts/jquery-ui-tabs.js +890 -0
  17. data/laf/javascripts/jquery.tablesorter.js +861 -0
  18. data/laf/javascripts/tarantula.js +10 -0
  19. data/laf/stylesheets/tarantula.css +346 -0
  20. data/lib/relevance/core_extensions/ellipsize.rb +38 -0
  21. data/lib/relevance/core_extensions/file.rb +15 -0
  22. data/lib/relevance/core_extensions/metaclass.rb +78 -0
  23. data/lib/relevance/core_extensions/response.rb +14 -0
  24. data/lib/relevance/core_extensions/test_case.rb +21 -0
  25. data/lib/relevance/tarantula.rb +55 -0
  26. data/lib/relevance/tarantula/attack.rb +22 -0
  27. data/lib/relevance/tarantula/attack_handler.rb +43 -0
  28. data/lib/relevance/tarantula/basic_attack.rb +44 -0
  29. data/lib/relevance/tarantula/crawler.rb +271 -0
  30. data/lib/relevance/tarantula/detail.html.erb +81 -0
  31. data/lib/relevance/tarantula/form.rb +29 -0
  32. data/lib/relevance/tarantula/form_submission.rb +98 -0
  33. data/lib/relevance/tarantula/html_document_handler.rb +42 -0
  34. data/lib/relevance/tarantula/html_report_helper.rb +46 -0
  35. data/lib/relevance/tarantula/html_reporter.rb +111 -0
  36. data/lib/relevance/tarantula/index.html.erb +37 -0
  37. data/lib/relevance/tarantula/invalid_html_handler.rb +27 -0
  38. data/lib/relevance/tarantula/io_reporter.rb +40 -0
  39. data/lib/relevance/tarantula/link.rb +105 -0
  40. data/lib/relevance/tarantula/log_grabber.rb +22 -0
  41. data/lib/relevance/tarantula/rails_integration_proxy.rb +90 -0
  42. data/lib/relevance/tarantula/recording.rb +12 -0
  43. data/lib/relevance/tarantula/response.rb +19 -0
  44. data/lib/relevance/tarantula/result.rb +83 -0
  45. data/lib/relevance/tarantula/test_report.html.erb +32 -0
  46. data/lib/relevance/tarantula/tidy_handler.rb +35 -0
  47. data/lib/relevance/tarantula/transform.rb +21 -0
  48. data/lib/relevance/tarantula/version.rb +5 -0
  49. data/lib/relevance/tasks/tarantula_tasks.rake +42 -0
  50. data/lib/tarantula-rails3.rb +9 -0
  51. data/spec/relevance/core_extensions/ellipsize_spec.rb +19 -0
  52. data/spec/relevance/core_extensions/file_spec.rb +7 -0
  53. data/spec/relevance/core_extensions/response_spec.rb +48 -0
  54. data/spec/relevance/core_extensions/test_case_spec.rb +19 -0
  55. data/spec/relevance/tarantula/attack_handler_spec.rb +29 -0
  56. data/spec/relevance/tarantula/basic_attack_spec.rb +12 -0
  57. data/spec/relevance/tarantula/crawler_spec.rb +409 -0
  58. data/spec/relevance/tarantula/form_spec.rb +50 -0
  59. data/spec/relevance/tarantula/form_submission_spec.rb +171 -0
  60. data/spec/relevance/tarantula/html_document_handler_spec.rb +43 -0
  61. data/spec/relevance/tarantula/html_report_helper_spec.rb +46 -0
  62. data/spec/relevance/tarantula/html_reporter_spec.rb +82 -0
  63. data/spec/relevance/tarantula/invalid_html_handler_spec.rb +33 -0
  64. data/spec/relevance/tarantula/io_reporter_spec.rb +11 -0
  65. data/spec/relevance/tarantula/link_spec.rb +132 -0
  66. data/spec/relevance/tarantula/log_grabber_spec.rb +26 -0
  67. data/spec/relevance/tarantula/rails_integration_proxy_spec.rb +100 -0
  68. data/spec/relevance/tarantula/result_spec.rb +85 -0
  69. data/spec/relevance/tarantula/tidy_handler_spec.rb +58 -0
  70. data/spec/relevance/tarantula/transform_spec.rb +20 -0
  71. data/spec/relevance/tarantula_spec.rb +23 -0
  72. data/spec/spec_helper.rb +43 -0
  73. data/tarantula.gemspec +25 -0
  74. data/template/tarantula_test.rb +22 -0
  75. data/vendor/xss-shield/MIT-LICENSE +20 -0
  76. data/vendor/xss-shield/README +76 -0
  77. data/vendor/xss-shield/init.rb +16 -0
  78. data/vendor/xss-shield/lib/xss_shield.rb +6 -0
  79. data/vendor/xss-shield/lib/xss_shield/erb_hacks.rb +111 -0
  80. data/vendor/xss-shield/lib/xss_shield/haml_hacks.rb +42 -0
  81. data/vendor/xss-shield/lib/xss_shield/safe_string.rb +47 -0
  82. data/vendor/xss-shield/lib/xss_shield/secure_helpers.rb +40 -0
  83. data/vendor/xss-shield/test/test_actionview_integration.rb +40 -0
  84. data/vendor/xss-shield/test/test_erb.rb +44 -0
  85. data/vendor/xss-shield/test/test_haml.rb +43 -0
  86. data/vendor/xss-shield/test/test_helpers.rb +25 -0
  87. data/vendor/xss-shield/test/test_safe_string.rb +55 -0
  88. metadata +247 -0
module Relevance
  module Tarantula
    # A Transform pairs a pattern (+from+) with a replacement (+to+) and
    # applies it to strings via the +[]+ operator, so a collection of
    # transforms can be used interchangeably with a Hash lookup.
    class Transform
      attr_accessor :from, :to

      # from - pattern (String or Regexp) handed straight to String#gsub
      # to   - replacement String, or a Proc invoked once per match
      def initialize(from, to)
        @from = from
        @to = to
      end

      # Returns a new string with every occurrence of +from+ replaced.
      # Proc replacements are passed as the gsub block so each match text
      # is yielded to them; anything else is used as a literal replacement.
      def [](string)
        if to.is_a?(Proc)
          string.gsub(from, &to)
        else
          string.gsub(from, to)
        end
      end
    end
  end
end
module Relevance
  module Tarantula
    # Gem version string. Frozen so the shared constant cannot be
    # mutated in place by callers.
    VERSION = "0.5.0".freeze
  end
end
require 'rake'
require 'rake/testtask' # Rake::TestTask lives in its own file; 'rake' alone does not load it

namespace :tarantula do

  desc 'Run tarantula tests.'
  task :test do
    # Clear reports from any previous run before crawling again.
    rm_rf "tmp/tarantula"
    Rake::TestTask.new(:tarantula_test) do |t|
      t.libs << 'test'
      t.pattern = 'test/tarantula/**/*_test.rb'
      t.verbose = true
    end

    Rake::Task[:tarantula_test].invoke
  end

  desc 'Run tarantula tests and open results in your browser.'
  task :report do
    begin
      Rake::Task['tarantula:test'].invoke
    rescue RuntimeError => e
      # A failing crawl still produces a report; print the error and continue.
      puts e.message
    end

    Dir.glob("tmp/tarantula/**/index.html") do |file|
      # ::PLATFORM was removed in Ruby 1.9; RUBY_PLATFORM also exists on 1.8,
      # so this stays backward-compatible. String#[] returns nil when the
      # substring/regex is absent, which is why it works as a condition.
      if RUBY_PLATFORM['darwin']
        system("open #{file}")
      elsif RUBY_PLATFORM[/linux/]
        system("firefox #{file}")
      else
        puts "You can view tarantula results at #{file}"
      end
    end
  end

  desc 'Generate a default tarantula test'
  task :setup do
    mkdir_p "test/tarantula"
    # The template ships three directories up from this rake file, under template/.
    template_path = File.expand_path(File.join(File.dirname(__FILE__), "../../..", "template", "tarantula_test.rb"))
    cp template_path, "test/tarantula/"
  end
end
module Relevance
  module Tarantula
    # Rails 3 integration point: when the host application loads its rake
    # environment, this railtie makes the tarantula:* tasks available.
    class Railtie < ::Rails::Railtie
      rake_tasks do
        load "relevance/tasks/tarantula_tasks.rake"
      end
    end
  end
end
require "spec_helper"

# Specs for the Object#ellipsize core extension, which renders any value
# as a short human-readable string for the HTML/IO reports.
describe "Relevance::CoreExtensions::Object#ellipsize" do
  it "converts nil to empty string" do
    nil.ellipsize.should == ""
  end

  it "doesn't touch short strings" do
    "hello".ellipsize.should == "hello"
  end

  it "calls inspect on non-strings" do
    [1,2,3].ellipsize.should == "[1, 2, 3]"
  end

  it "shortens long strings and adds ..." do
    # 5 is the number of leading characters kept before the "..." suffix.
    "long-string".ellipsize(5).should == "long-..."
  end
end
require "spec_helper"

# Spec for the File.extension core extension used when classifying
# crawled resources by file type.
describe "Relevance::CoreExtensions::File#extension" do
  it "should return the extension without the leading dot" do
    File.extension("foo.bar").should == "bar"
  end
end
require "spec_helper"

# Specs for the Response#html? extension. An OpenStruct extended with the
# module stands in for a real Rails response, so only content_type matters.
describe "Relevance::CoreExtensions::Response#html?" do
  before do
    @response = OpenStruct.new
    @response.extend(Relevance::CoreExtensions::Response)
  end

  context 'when content_type is a String (Rails 2)' do
    it "should be html if the content-type is 'text/html'" do
      @response.content_type = "text/html"
      @response.should be_html
      # A trailing charset parameter must not defeat the check.
      @response.content_type = "text/html;charset=iso-8859-2"
      @response.should be_html
    end

    it "should not be html if the content-type isn't an html type" do
      @response.content_type = "text/plain"
      @response.should_not be_html
      @response.content_type = "application/pdf"
      @response.should_not be_html
    end
  end

  context 'when content_type is a Mime::Type (Rails 3)' do
    it "should be html if the content-type is 'text/html'" do
      @response.content_type = Mime::Type.new("text/html")
      @response.should be_html
      @response.content_type = Mime::Type.new("text/html;charset=iso-8859-2")
      @response.should be_html
    end

    it "should not be html if the content-type isn't an html type" do
      @response.content_type = Mime::Type.new("text/plain")
      @response.should_not be_html
      @response.content_type = Mime::Type.new("application/pdf")
      @response.should_not be_html
    end
  end

  # better ideas welcome, but be careful not to
  # castrate tarantula for proxies that don't set the content-type
  it "should pretend we have html if the content-type is nil" do
    @response.content_type = nil
    @response.should be_html
  end

end
require "spec_helper"

# Specs for the test-case extensions mixed into Rails integration tests
# (tarantula_crawler / tarantula_crawl entry points).
describe "TestCase extensions" do
  # Marked pending: driving the real crawler from here needs heavy stubbing.
  pending "can create the crawler" do
    Relevance::Tarantula::RailsIntegrationProxy.stubs(:rails_root).returns("STUB_RAILS_ROOT")
    Relevance::Tarantula::Crawler.any_instance.stubs(:rails_root).returns("STUB_RAILS_ROOT")
    tarantula_crawler(stub_everything)
  end

  pending "can crawl" do
    (crawler = mock).expects(:crawl).with("/foo")
    expects(:tarantula_crawler).returns(crawler)
    tarantula_crawl(:integration_test_stub, :url => "/foo")
  end

  it "should get mixed into ActionController::IntegrationTest" do
    ActionController::IntegrationTest.ancestors.should include(Relevance::CoreExtensions::TestCaseExtensions)
  end
end
require "spec_helper"

# Specs for AttackHandler, which scans response bodies for the :output
# marker of attacks injected during form submission.
describe "Relevance::Tarantula::AttackHandler" do
  before do
    @handler = Relevance::Tarantula::AttackHandler.new
    # One attack whose :output marker ('<bad>') is what the handler hunts for.
    attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code', :output => '<bad>'})
    @handler.stubs(:attacks).returns([attack])
  end

  it "lets safe documents through" do
    # nil result means "nothing to report" for this response.
    result = @handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo">good</a>')))
    result.should == nil
  end

  it "detects the supplied code" do
    result = @handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo"><bad></a>')))
    result.success.should == false
  end
end

describe "Attacks without an output specified" do
  it "never matches anything" do
    handler = Relevance::Tarantula::AttackHandler.new
    # No :output given, so there is no marker to find in any body.
    attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code'})
    Relevance::Tarantula::FormSubmission.stubs(:attacks).returns([attack])
    result = handler.handle(Relevance::Tarantula::Result.new(:response => stub(:html? => true, :body => '<a href="/foo">good</a>')))
    result.should == nil
  end
end
require "spec_helper"

# Specs for BasicAttack, the default fuzz-input generator.
describe Relevance::Tarantula::BasicAttack do
  before do
    @attack = Relevance::Tarantula::BasicAttack.new
  end

  it "can generate a random whole number" do
    @attack.random_whole_number.should >= 0
    # Integer instead of Fixnum: Fixnum was deprecated in Ruby 2.4 and
    # removed in 3.2; Integer === matches the same values on older rubies
    # (Fixnum < Integer), so this is backward-compatible.
    Integer.should === @attack.random_whole_number
  end
end
require "spec_helper"

# Specs for Relevance::Tarantula::Crawler: URL transformation, crawl-queue
# management and ordering, reporting, progress output ("blip"), skip rules,
# response-code handling, and crawl timeouts.
# Helpers such as make_link / make_form / stub_puts_and_print / test_output_dir
# come from spec_helper; mocking uses mocha (stubs / expects / sequence).
describe Relevance::Tarantula::Crawler do

  describe "transform_url" do

    before { @crawler = Relevance::Tarantula::Crawler.new }

    it "de-obfuscates unicode obfuscated urls" do
      # HTML-entity-encoded "mailto:" must decode before skip checks run.
      obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;"
      @crawler.transform_url(obfuscated_mailto).should == "mailto:"
    end

    it "strips the trailing name portion of a link" do
      @crawler.transform_url('http://host/path#name').should == 'http://host/path'
    end
  end


  describe "log grabbing" do

    it "returns nil if no grabber is specified" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.grab_log!.should == nil
    end

    it "returns grabber.grab if grabber is specified" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.log_grabber = stub(:grab! => "fake log entry")
      crawler.grab_log!.should == "fake log entry"
    end

  end

  describe "interrupt" do

    it 'catches interruption and writes the partial report' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:queue_link)
      crawler.stubs(:do_crawl).raises(Interrupt)
      # CTRL-C mid-crawl must still produce a (partial) report.
      crawler.expects(:report_results)
      $stderr.expects(:puts).with("CTRL-C")
      crawler.crawl
    end

  end

  describe 'handle_form_results' do

    it 'captures the result values (bugfix)' do
      response = stub_everything
      # Pin the exact argument hash Result.new receives for a form submission.
      result_args = {:url => :action_stub,
        :data => 'nil',
        :response => response,
        :referrer => :action_stub,
        :log => nil,
        :method => :stub_method,
        :test_name => nil}
      result = Relevance::Tarantula::Result.new(result_args)
      Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.handle_form_results(stub_everything(:meth => :stub_method, :action => :action_stub),
        response)
    end

  end

  describe "crawl" do

    it 'queues the first url, does crawl, and then reports results' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:queue_link).with("/foobar")
      crawler.expects(:do_crawl)
      crawler.expects(:report_results)
      crawler.crawl("/foobar")
    end

    it 'reports results even if the crawl fails' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:do_crawl).raises(RuntimeError)
      crawler.expects(:report_results)
      lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
    end

  end

  describe "queueing" do

    it 'queues and remembers links' do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.expects(:transform_url).with("/url").returns("/transformed").at_least_once
      crawler.queue_link("/url")
      # TODO not sure this is the best way to test this anymore; relying on result of transform in both actual and expected
      crawler.crawl_queue.should == [make_link("/url", crawler)]
      crawler.links_queued.should == Set.new([make_link("/url", crawler)])
    end

    it 'queues and remembers forms' do
      crawler = Relevance::Tarantula::Crawler.new
      form = Hpricot('<form action="/action" method="post"/>').at('form')
      # Forms are de-duplicated by signature rather than by raw markup.
      signature = Relevance::Tarantula::FormSubmission.new(make_form(form)).signature
      crawler.queue_form(form)
      crawler.crawl_queue.size.should == 1
      crawler.form_signatures_queued.should == Set.new([signature])
    end

    it "passes link, self, and referrer when creating Link objects" do
      crawler = Relevance::Tarantula::Crawler.new
      Relevance::Tarantula::Link.expects(:new).with('/url', crawler, '/some-referrer')
      crawler.expects(:append_to_queue)
      crawler.stubs(:should_skip_link?)
      crawler.queue_link('/url', '/some-referrer')
    end

    it "queues DELETE requests at the end, everything else before" do
      # Destructive links must run last so they don't remove records the
      # crawler still needs in order to visit other pages.
      crawler = Relevance::Tarantula::Crawler.new
      create_link = Hpricot('<a href="/create" data-method="post">Create</a>').at('a')
      crawler.queue_link(create_link)
      create_link = Relevance::Tarantula::Link.new(create_link, crawler, nil)
      crawler.crawl_queue.should == [create_link]
      delete_link = Hpricot('<a href="/destroy" data-method="delete">Destroy</a>').at('a')
      crawler.queue_link(delete_link)
      delete_link = Relevance::Tarantula::Link.new(delete_link, crawler, nil)
      crawler.crawl_queue.should == [create_link, delete_link]
      get_link = Hpricot('<a href="/read">Show</a>').at('a')
      crawler.queue_link(get_link)
      get_link = Relevance::Tarantula::Link.new(get_link, crawler, nil)
      crawler.crawl_queue.should == [create_link, get_link, delete_link]
    end

    it "queues is crawled from tip not tail" do
      crawler = Relevance::Tarantula::Crawler.new

      create_link = Hpricot('<a href="/create" data-method="post">Create</a>').at('a')
      crawler.queue_link(create_link)
      delete_link = Hpricot('<a href="/destroy" data-method="delete">Destroy</a>').at('a')
      crawler.queue_link(delete_link)
      get_link = Hpricot('<a href="/read">Show</a>').at('a')
      crawler.queue_link(get_link)

      # Expect FIFO order, with the delete still last: create, read, destroy.
      q = sequence('queue')
      response = stub(:code => "200")
      crawler.expects(:follow).with('post', '/create').returns(response).in_sequence(q)
      crawler.expects(:follow).with('get', '/read').returns(response).in_sequence(q)
      crawler.expects(:follow).with('delete', '/destroy').returns(response).in_sequence(q)
      crawler.do_crawl(0)
    end
  end

  describe "crawling" do
    before do
      @form = Hpricot('<form action="/action" method="post"/>').at('form')
    end

    it "does two things with each link: crawl and blip" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.proxy = stub
      crawler.crawl_queue = links = [make_link("/foo1", crawler), make_link("/foo2", crawler)]

      links.each{|link| link.expects(:crawl)}
      crawler.expects(:blip).times(2)

      crawler.crawl_the_queue
      crawler.crawl_queue.should == []
    end

    it "invokes queued forms, logs responses, and calls handlers" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue << Relevance::Tarantula::FormSubmission.new(make_form(@form, crawler))
      crawler.expects(:submit).returns(stub(:code => "200"))
      crawler.expects(:blip)
      crawler.crawl_the_queue
    end

    # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout.
    it "breaks out early if a timeout is set"

    it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      response = stub(:code => "200")
      crawler.queue_link('/foo')
      crawler.expects(:follow).returns(response).times(4) # (stub and "/") * 2
      crawler.queue_form(@form)
      crawler.expects(:submit).returns(response).times(2)
      crawler.expects(:blip).times(6)
      crawler.times_to_crawl = 2
      crawler.crawl
    end

  end

  describe "report_results" do
    it "prints a final summary line" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:generate_reports)
      crawler.expects(:total_links_count).returns(42)
      crawler.expects(:puts).with("Crawled 42 links and forms.")
      crawler.report_results
    end

    it "delegates to generate_reports" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:puts)
      crawler.expects(:generate_reports)
      crawler.report_results
    end

  end

  describe "blip" do

    it "blips the current progress if !verbose" do
      $stdout.stubs(:tty?).returns(true)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns false
      crawler.stubs(:timeout_if_too_long)
      # \r keeps the progress line rewriting in place on a terminal.
      crawler.expects(:print).with("\r 0 of 0 links completed ")
      crawler.blip
    end

    it "suppresses the blip message if not writing to a tty" do
      $stdout.stubs(:tty?).returns(false)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns false
      crawler.stubs(:timeout_if_too_long)
      crawler.expects(:print).never
      crawler.blip
    end

    it "blips nothing if verbose" do
      $stdout.stubs(:tty?).returns(true)
      crawler = Relevance::Tarantula::Crawler.new
      crawler.stubs(:verbose).returns true
      crawler.expects(:print).never
      crawler.blip
    end

  end

  describe "finished?" do

    it "is finished when the links and forms are crawled" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.finished?.should == true
    end

    it "isn't finished when links remain" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue = [:stub_link]
      crawler.finished?.should == false
    end

    it "isn't finished when forms remain" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_queue = [:stub_form]
      crawler.finished?.should == false
    end

  end

  it "crawls links and forms again and again until finished?==true" do
    crawler = Relevance::Tarantula::Crawler.new
    # Two passes over the queue before finished? finally reports true.
    crawler.expects(:finished?).times(3).returns(false, false, true)
    crawler.expects(:crawl_the_queue).times(2)
    crawler.do_crawl(1)
  end

  it "asks each reporter to write its report in report_dir" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.stubs(:report_dir).returns(test_output_dir)
    reporter = stub_everything
    reporter.expects(:report)
    reporter.expects(:finish_report)
    crawler.reporters = [reporter]
    crawler.save_result stub(:code => "404", :url => "/uh-oh")
    crawler.generate_reports
  end

  it "builds a report dir relative to rails root" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.expects(:rails_root).returns("faux_rails_root")
    crawler.report_dir.should == "faux_rails_root/tmp/tarantula"
  end

  it "skips links that are already queued" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.should_skip_link?(make_link("/foo")).should == false
    crawler.queue_link("/foo").should == make_link("/foo")
    crawler.should_skip_link?(make_link("/foo")).should == true
  end

  describe "link skipping" do

    before { @crawler = Relevance::Tarantula::Crawler.new }

    it "skips links that are too long" do
      @crawler.should_skip_link?(make_link("/foo")).should == false
      @crawler.max_url_length = 2
      @crawler.expects(:log).with("Skipping long url /foo")
      @crawler.should_skip_link?(make_link("/foo")).should == true
    end

    it "skips outbound links (those that begin with http)" do
      @crawler.expects(:log).with("Skipping http-anything")
      @crawler.should_skip_link?(make_link("http-anything")).should == true
    end

    it "skips javascript links (those that begin with javascript)" do
      @crawler.expects(:log).with("Skipping javascript-anything")
      @crawler.should_skip_link?(make_link("javascript-anything")).should == true
    end

    # NOTE(review): description says "http" but the example is mailto —
    # copy/paste slip in the spec name only; the assertion is correct.
    it "skips mailto links (those that begin with http)" do
      @crawler.expects(:log).with("Skipping mailto-anything")
      @crawler.should_skip_link?(make_link("mailto-anything")).should == true
    end

    it 'skips blank links' do
      @crawler.queue_link(nil)
      @crawler.crawl_queue.should == []
      @crawler.queue_link("")
      @crawler.crawl_queue.should == []
    end

    it "logs and skips links that match a pattern" do
      @crawler.expects(:log).with("Skipping /the-red-button")
      @crawler.skip_uri_patterns << /red-button/
      @crawler.queue_link("/blue-button").should == make_link("/blue-button")
      @crawler.queue_link("/the-red-button").should == nil
    end

    it "logs and skips form submissions that match a pattern" do
      @crawler.expects(:log).with("Skipping /reset-password-form")
      @crawler.skip_uri_patterns << /reset-password/
      fs = stub_everything(:action => "/reset-password-form")
      @crawler.should_skip_form_submission?(fs).should == true
    end
  end

  describe "allow_nnn_for" do

    it "installs result as a response_code_handler" do
      crawler = Relevance::Tarantula::Crawler.new
      crawler.response_code_handler.should == Relevance::Tarantula::Result
    end

    it "delegates to the response_code_handler" do
      # allow_404_for is not defined on the crawler; method_missing forwards it.
      crawler = Relevance::Tarantula::Crawler.new
      (response_code_handler = mock).expects(:allow_404_for).with(:stub)
      crawler.response_code_handler = response_code_handler
      crawler.allow_404_for(:stub)
    end

    it "chains up to super for method_missing" do
      crawler = Relevance::Tarantula::Crawler.new
      lambda{crawler.foo}.should raise_error(NoMethodError)
    end

  end

  describe "timeouts" do
    # NOTE(review): "elasped" below mirrors the (misspelled) method name in
    # the crawler implementation itself; any rename belongs in lib, not here.

    it "sets start and end times for a single crawl" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      end_time = Time.parse("March 1st, 2008 10:10am")
      # First Time.now call yields start_time, second yields end_time.
      Time.stubs(:now).returns(start_time, end_time)

      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
      crawler.crawl
      crawler.crawl_start_times.first.should == start_time
      crawler.crawl_end_times.first.should == end_time
    end

    it "has elasped time for a crawl" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am")
      Time.stubs(:now).returns(start_time, elasped_time_check)

      crawler = Relevance::Tarantula::Crawler.new
      stub_puts_and_print(crawler)
      crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
      crawler.crawl
      crawler.elasped_time_for_pass(0).should == 600.seconds
    end

    it "raises out of the crawl if elasped time is greater then the crawl timeout" do
      start_time = Time.parse("March 1st, 2008 10:00am")
      # 35 minutes elapsed against a 5-minute timeout below.
      elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am")
      Time.stubs(:now).returns(start_time, elasped_time_check)

      crawler = Relevance::Tarantula::Crawler.new
      crawler.crawl_timeout = 5.minutes

      crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
      crawler.proxy = stub
      crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))

      stub_puts_and_print(crawler)
      lambda {
        crawler.do_crawl(0)
      }.should raise_error
    end

  end

end