codez-tarantula 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +14 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CHANGELOG +64 -0
- data/DSL_EXAMPLES.md +120 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.rdoc +136 -0
- data/Rakefile +36 -0
- data/ci/rails2.gemfile +4 -0
- data/ci/rails3.gemfile +4 -0
- data/laf/images/header_bg.jpg +0 -0
- data/laf/images/logo.png +0 -0
- data/laf/images/tagline.png +0 -0
- data/laf/javascripts/jquery-1.2.3.js +3408 -0
- data/laf/javascripts/jquery-ui-tabs.js +890 -0
- data/laf/javascripts/jquery.tablesorter.js +861 -0
- data/laf/javascripts/tarantula.js +10 -0
- data/laf/stylesheets/tarantula.css +346 -0
- data/lib/relevance/core_extensions/ellipsize.rb +38 -0
- data/lib/relevance/core_extensions/file.rb +15 -0
- data/lib/relevance/core_extensions/metaclass.rb +78 -0
- data/lib/relevance/core_extensions/response.rb +14 -0
- data/lib/relevance/core_extensions/test_case.rb +21 -0
- data/lib/relevance/tarantula.rb +55 -0
- data/lib/relevance/tarantula/attack.rb +22 -0
- data/lib/relevance/tarantula/attack_handler.rb +43 -0
- data/lib/relevance/tarantula/basic_attack.rb +44 -0
- data/lib/relevance/tarantula/crawler.rb +271 -0
- data/lib/relevance/tarantula/detail.html.erb +81 -0
- data/lib/relevance/tarantula/form.rb +29 -0
- data/lib/relevance/tarantula/form_submission.rb +98 -0
- data/lib/relevance/tarantula/html_document_handler.rb +42 -0
- data/lib/relevance/tarantula/html_report_helper.rb +46 -0
- data/lib/relevance/tarantula/html_reporter.rb +111 -0
- data/lib/relevance/tarantula/index.html.erb +37 -0
- data/lib/relevance/tarantula/invalid_html_handler.rb +27 -0
- data/lib/relevance/tarantula/io_reporter.rb +40 -0
- data/lib/relevance/tarantula/link.rb +105 -0
- data/lib/relevance/tarantula/log_grabber.rb +22 -0
- data/lib/relevance/tarantula/rails_integration_proxy.rb +90 -0
- data/lib/relevance/tarantula/recording.rb +12 -0
- data/lib/relevance/tarantula/response.rb +19 -0
- data/lib/relevance/tarantula/result.rb +83 -0
- data/lib/relevance/tarantula/test_report.html.erb +32 -0
- data/lib/relevance/tarantula/tidy_handler.rb +35 -0
- data/lib/relevance/tarantula/transform.rb +21 -0
- data/lib/relevance/tarantula/version.rb +5 -0
- data/lib/relevance/tasks/tarantula_tasks.rake +42 -0
- data/lib/tarantula-rails3.rb +9 -0
- data/spec/relevance/core_extensions/ellipsize_spec.rb +19 -0
- data/spec/relevance/core_extensions/file_spec.rb +7 -0
- data/spec/relevance/core_extensions/response_spec.rb +48 -0
- data/spec/relevance/core_extensions/test_case_spec.rb +19 -0
- data/spec/relevance/tarantula/attack_handler_spec.rb +29 -0
- data/spec/relevance/tarantula/basic_attack_spec.rb +12 -0
- data/spec/relevance/tarantula/crawler_spec.rb +409 -0
- data/spec/relevance/tarantula/form_spec.rb +50 -0
- data/spec/relevance/tarantula/form_submission_spec.rb +171 -0
- data/spec/relevance/tarantula/html_document_handler_spec.rb +43 -0
- data/spec/relevance/tarantula/html_report_helper_spec.rb +46 -0
- data/spec/relevance/tarantula/html_reporter_spec.rb +82 -0
- data/spec/relevance/tarantula/invalid_html_handler_spec.rb +33 -0
- data/spec/relevance/tarantula/io_reporter_spec.rb +11 -0
- data/spec/relevance/tarantula/link_spec.rb +132 -0
- data/spec/relevance/tarantula/log_grabber_spec.rb +26 -0
- data/spec/relevance/tarantula/rails_integration_proxy_spec.rb +100 -0
- data/spec/relevance/tarantula/result_spec.rb +85 -0
- data/spec/relevance/tarantula/tidy_handler_spec.rb +58 -0
- data/spec/relevance/tarantula/transform_spec.rb +20 -0
- data/spec/relevance/tarantula_spec.rb +23 -0
- data/spec/spec_helper.rb +43 -0
- data/tarantula.gemspec +25 -0
- data/template/tarantula_test.rb +22 -0
- data/vendor/xss-shield/MIT-LICENSE +20 -0
- data/vendor/xss-shield/README +76 -0
- data/vendor/xss-shield/init.rb +16 -0
- data/vendor/xss-shield/lib/xss_shield.rb +6 -0
- data/vendor/xss-shield/lib/xss_shield/erb_hacks.rb +111 -0
- data/vendor/xss-shield/lib/xss_shield/haml_hacks.rb +42 -0
- data/vendor/xss-shield/lib/xss_shield/safe_string.rb +47 -0
- data/vendor/xss-shield/lib/xss_shield/secure_helpers.rb +40 -0
- data/vendor/xss-shield/test/test_actionview_integration.rb +40 -0
- data/vendor/xss-shield/test/test_erb.rb +44 -0
- data/vendor/xss-shield/test/test_haml.rb +43 -0
- data/vendor/xss-shield/test/test_helpers.rb +25 -0
- data/vendor/xss-shield/test/test_safe_string.rb +55 -0
- metadata +247 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module Relevance
  module Tarantula

    # Pairs a pattern with a replacement so the substitution can be applied
    # to any string through Transform#[].
    #
    # from - the pattern handed to String#gsub as its first argument.
    # to   - the replacement. A Proc, or any other object responding to #call
    #        (for example a Method), is invoked once per match with the
    #        matched text and its return value is substituted. Anything else
    #        (String, Hash, ...) is handed to gsub as its replacement argument.
    class Transform
      attr_accessor :from, :to

      def initialize(from, to)
        @from = from
        @to = to
      end

      # Returns the result of substituting every match of +from+ in +string+.
      def [](string)
        case to
        when Proc
          # A Proc is passed straight through as gsub's block.
          string.gsub(from, &to)
        else
          if to.respond_to?(:call)
            # Other callables cannot be handed to gsub as a replacement
            # argument (gsub would raise TypeError), so wrap them in a block.
            string.gsub(from) { |match| to.call(match) }
          else
            string.gsub(from, to)
          end
        end
      end
    end

  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
require 'rake'
require 'rake/testtask' # explicitly load Rake::TestTask, which the :test task below uses

# Rake tasks for running tarantula tests, viewing the generated report and
# generating a starter test file.
namespace :tarantula do

  desc 'Run tarantula tests.'
  task :test do
    # Start from a clean report directory so stale results are never shown.
    rm_rf "tmp/tarantula"
    Rake::TestTask.new(:tarantula_test) do |t|
      t.libs << 'test'
      t.pattern = 'test/tarantula/**/*_test.rb'
      t.verbose = true
    end

    Rake::Task[:tarantula_test].invoke
  end

  desc 'Run tarantula tests and open results in your browser.'
  task :report do
    begin
      Rake::Task['tarantula:test'].invoke
    rescue RuntimeError => e
      # A RuntimeError from the test run is only printed, so the report
      # below is still opened.
      puts e.message
    end

    Dir.glob("tmp/tarantula/**/index.html") do |file|
      # RUBY_PLATFORM replaces the legacy PLATFORM constant, which is no
      # longer defined on current Rubies. The multi-argument form of system
      # bypasses the shell, so paths with spaces or metacharacters are safe.
      case RUBY_PLATFORM
      when /darwin/
        system("open", file)
      when /linux/
        system("firefox", file)
      else
        puts "You can view tarantula results at #{file}"
      end
    end
  end

  desc 'Generate a default tarantula test'
  task :setup do
    mkdir_p "test/tarantula"
    # The template is resolved three directories above this file.
    template_path = File.expand_path(File.join(File.dirname(__FILE__), "../../..", "template", "tarantula_test.rb"))
    cp template_path, "test/tarantula/"
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Covers #ellipsize as called on nil, String and Array receivers.
describe "Relevance::CoreExtensions::Object#ellipsize" do
  it "converts nil to empty string" do
    shortened = nil.ellipsize
    shortened.should == ""
  end

  it "doesn't touch short strings" do
    short = "hello"
    short.ellipsize.should == "hello"
  end

  it "calls inspect on non-strings" do
    numbers = [1,2,3]
    numbers.ellipsize.should == "[1, 2, 3]"
  end

  it "shortens long strings and adds ..." do
    # With a limit of 5 the first five characters are kept, then "..." follows.
    shortened = "long-string".ellipsize(5)
    shortened.should == "long-..."
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Exercises the html? predicate that Relevance::CoreExtensions::Response
# provides, using an OpenStruct as a stand-in response.
describe "Relevance::CoreExtensions::Response#html?" do
  before do
    @response = OpenStruct.new.extend(Relevance::CoreExtensions::Response)
  end

  context 'when content_type is a String (Rails 2)' do
    it "should be html if the content-type is 'text/html'" do
      ["text/html", "text/html;charset=iso-8859-2"].each do |html_type|
        @response.content_type = html_type
        @response.should be_html
      end
    end

    it "should not be html if the content-type isn't an html type" do
      ["text/plain", "application/pdf"].each do |other_type|
        @response.content_type = other_type
        @response.should_not be_html
      end
    end
  end

  context 'when content_type is a Mime::Type (Rails 3)' do
    it "should be html if the content-type is 'text/html'" do
      ["text/html", "text/html;charset=iso-8859-2"].each do |html_type|
        @response.content_type = Mime::Type.new(html_type)
        @response.should be_html
      end
    end

    it "should not be html if the content-type isn't an html type" do
      ["text/plain", "application/pdf"].each do |other_type|
        @response.content_type = Mime::Type.new(other_type)
        @response.should_not be_html
      end
    end
  end

  # Better ideas are welcome, but take care not to cripple tarantula for
  # proxies that don't set the content-type.
  it "should pretend we have html if the content-type is nil" do
    @response.content_type = nil
    @response.should be_html
  end

end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Specs for the helpers mixed into integration tests. The first two examples
# are declared pending, so only the mixin check actually asserts anything.
describe "TestCase extensions" do
  pending "can create the crawler" do
    fake_root = "STUB_RAILS_ROOT"
    Relevance::Tarantula::RailsIntegrationProxy.stubs(:rails_root).returns(fake_root)
    Relevance::Tarantula::Crawler.any_instance.stubs(:rails_root).returns(fake_root)
    tarantula_crawler(stub_everything)
  end

  pending "can crawl" do
    crawler = mock
    crawler.expects(:crawl).with("/foo")
    expects(:tarantula_crawler).returns(crawler)
    tarantula_crawl(:integration_test_stub, :url => "/foo")
  end

  it "should get mixed into ActionController::IntegrationTest" do
    mixins = ActionController::IntegrationTest.ancestors
    mixins.should include(Relevance::CoreExtensions::TestCaseExtensions)
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Specs for AttackHandler#handle with a single stubbed attack whose output
# marker is '<bad>'.
describe "Relevance::Tarantula::AttackHandler" do
  before do
    @handler = Relevance::Tarantula::AttackHandler.new
    known_attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code', :output => '<bad>'})
    @handler.stubs(:attacks).returns([known_attack])
  end

  # Runs the handler against a stubbed html response carrying +body+.
  def handle_html(body)
    response = stub(:html? => true, :body => body)
    @handler.handle(Relevance::Tarantula::Result.new(:response => response))
  end

  it "lets safe documents through" do
    handle_html('<a href="/foo">good</a>').should be_nil
  end

  it "detects the supplied code" do
    outcome = handle_html('<a href="/foo"><bad></a>')
    outcome.success.should == false
  end
end
|
|
20
|
+
|
|
21
|
+
# An attack that declares no :output must not cause a document to be flagged.
describe "Attacks without an output specified" do
  it "never matches anything" do
    handler = Relevance::Tarantula::AttackHandler.new
    silent_attack = Relevance::Tarantula::Attack.new({:name => 'foo_name', :input => 'foo_code'})
    Relevance::Tarantula::FormSubmission.stubs(:attacks).returns([silent_attack])

    response = stub(:html? => true, :body => '<a href="/foo">good</a>')
    outcome = handler.handle(Relevance::Tarantula::Result.new(:response => response))
    outcome.should be_nil
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Specs for BasicAttack's random number helper.
describe Relevance::Tarantula::BasicAttack do
  before do
    @attack = Relevance::Tarantula::BasicAttack.new
  end

  it "can generate a random whole number" do
    # Sample once so both assertions describe the same value.
    number = @attack.random_whole_number
    number.should >= 0
    # Integer rather than Fixnum: Fixnum is deprecated since Ruby 2.4 and
    # removed in 3.2, while Integer matches on every Ruby version.
    number.should be_kind_of(Integer)
  end
end
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
describe Relevance::Tarantula::Crawler do
|
|
4
|
+
|
|
5
|
+
describe "transform_url" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "de-obfuscates unicode obfuscated urls" do
    # "mailto:" written as numeric character references. A literal
    # "mailto:" input would make this example pass trivially.
    # NOTE(review): assumes transform_url decodes numeric character
    # references, as the example title implies -- confirm against the crawler.
    obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;"
    @crawler.transform_url(obfuscated_mailto).should == "mailto:"
  end

  it "strips the trailing name portion of a link" do
    @crawler.transform_url('http://host/path#name').should == 'http://host/path'
  end
end
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# grab_log! is expected to yield nil without a grabber and otherwise to
# return whatever the grabber's grab! returns.
describe "log grabbing" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "returns nil if no grabber is specified" do
    @crawler.grab_log!.should be_nil
  end

  it "returns grabber.grab if grabber is specified" do
    fake_grabber = stub(:grab! => "fake log entry")
    @crawler.log_grabber = fake_grabber
    @crawler.grab_log!.should == "fake log entry"
  end

end
|
|
34
|
+
|
|
35
|
+
# An Interrupt raised during the crawl must still produce a report and a
# "CTRL-C" notice on stderr.
describe "interrupt" do

  it 'catches interruption and writes the partial report' do
    interrupted = Relevance::Tarantula::Crawler.new
    interrupted.stubs(:queue_link)
    interrupted.stubs(:do_crawl).raises(Interrupt)

    interrupted.expects(:report_results)
    $stderr.expects(:puts).with("CTRL-C")

    interrupted.crawl
  end

end
|
|
47
|
+
|
|
48
|
+
# Pins the exact attribute hash handle_form_results passes to Result.new.
describe 'handle_form_results' do

  it 'captures the result values (bugfix)' do
    response = stub_everything
    expected_attributes = {
      :url       => :action_stub,
      :data      => 'nil',
      :response  => response,
      :referrer  => :action_stub,
      :log       => nil,
      :method    => :stub_method,
      :test_name => nil
    }
    # Build the real Result first: once the expectation below is installed,
    # Result.new only accepts exactly these attributes.
    prebuilt = Relevance::Tarantula::Result.new(expected_attributes)
    Relevance::Tarantula::Result.expects(:new).with(expected_attributes).returns(prebuilt)

    crawler = Relevance::Tarantula::Crawler.new
    form_submission = stub_everything(:meth => :stub_method, :action => :action_stub)
    crawler.handle_form_results(form_submission, response)
  end

end
|
|
67
|
+
|
|
68
|
+
# Crawler#crawl: queue the start url, crawl, and always report.
describe "crawl" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it 'queues the first url, does crawl, and then reports results' do
    @crawler.expects(:queue_link).with("/foobar")
    @crawler.expects(:do_crawl)
    @crawler.expects(:report_results)
    @crawler.crawl("/foobar")
  end

  it 'reports results even if the crawl fails' do
    @crawler.expects(:do_crawl).raises(RuntimeError)
    @crawler.expects(:report_results)
    failing_crawl = lambda { @crawler.crawl('/') }
    failing_crawl.should raise_error(RuntimeError)
  end

end
|
|
86
|
+
|
|
87
|
+
# How links and forms enter the crawl queue, and in which order they leave it.
describe "queueing" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it 'queues and remembers links' do
    @crawler.expects(:transform_url).with("/url").returns("/transformed").at_least_once
    @crawler.queue_link("/url")
    # TODO not sure this is the best way to test this anymore; relying on result of transform in both actual and expected
    @crawler.crawl_queue.should == [make_link("/url", @crawler)]
    @crawler.links_queued.should == Set.new([make_link("/url", @crawler)])
  end

  it 'queues and remembers forms' do
    form_node = Hpricot('<form action="/action" method="post"/>').at('form')
    expected_signature = Relevance::Tarantula::FormSubmission.new(make_form(form_node)).signature
    @crawler.queue_form(form_node)
    @crawler.crawl_queue.size.should == 1
    @crawler.form_signatures_queued.should == Set.new([expected_signature])
  end

  it "passes link, self, and referrer when creating Link objects" do
    Relevance::Tarantula::Link.expects(:new).with('/url', @crawler, '/some-referrer')
    @crawler.expects(:append_to_queue)
    @crawler.stubs(:should_skip_link?)
    @crawler.queue_link('/url', '/some-referrer')
  end

  it "queues DELETE requests at the end, everything else before" do
    create_node = Hpricot('<a href="/create" data-method="post">Create</a>').at('a')
    @crawler.queue_link(create_node)
    create_link = Relevance::Tarantula::Link.new(create_node, @crawler, nil)
    @crawler.crawl_queue.should == [create_link]

    delete_node = Hpricot('<a href="/destroy" data-method="delete">Destroy</a>').at('a')
    @crawler.queue_link(delete_node)
    delete_link = Relevance::Tarantula::Link.new(delete_node, @crawler, nil)
    @crawler.crawl_queue.should == [create_link, delete_link]

    # A later GET is expected ahead of the already-queued DELETE.
    get_node = Hpricot('<a href="/read">Show</a>').at('a')
    @crawler.queue_link(get_node)
    get_link = Relevance::Tarantula::Link.new(get_node, @crawler, nil)
    @crawler.crawl_queue.should == [create_link, get_link, delete_link]
  end

  it "queues is crawled from tip not tail" do
    [
      '<a href="/create" data-method="post">Create</a>',
      '<a href="/destroy" data-method="delete">Destroy</a>',
      '<a href="/read">Show</a>'
    ].each do |markup|
      @crawler.queue_link(Hpricot(markup).at('a'))
    end

    visit_order = sequence('queue')
    ok_response = stub(:code => "200")
    @crawler.expects(:follow).with('post', '/create').returns(ok_response).in_sequence(visit_order)
    @crawler.expects(:follow).with('get', '/read').returns(ok_response).in_sequence(visit_order)
    @crawler.expects(:follow).with('delete', '/destroy').returns(ok_response).in_sequence(visit_order)
    @crawler.do_crawl(0)
  end
end
|
|
149
|
+
|
|
150
|
+
# Draining the queue: each queued item is crawled and followed by a blip.
describe "crawling" do
  before do
    @form = Hpricot('<form action="/action" method="post"/>').at('form')
  end

  it "does two things with each link: crawl and blip" do
    crawler = Relevance::Tarantula::Crawler.new
    crawler.proxy = stub
    links = [make_link("/foo1", crawler), make_link("/foo2", crawler)]
    crawler.crawl_queue = links

    links.each { |queued| queued.expects(:crawl) }
    crawler.expects(:blip).times(2)

    crawler.crawl_the_queue
    crawler.crawl_queue.should == []
  end

  it "invokes queued forms, logs responses, and calls handlers" do
    crawler = Relevance::Tarantula::Crawler.new
    submission = Relevance::Tarantula::FormSubmission.new(make_form(@form, crawler))
    crawler.crawl_queue << submission
    crawler.expects(:submit).returns(stub(:code => "200"))
    crawler.expects(:blip)
    crawler.crawl_the_queue
  end

  # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout.
  it "breaks out early if a timeout is set"

  it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
    crawler = Relevance::Tarantula::Crawler.new
    stub_puts_and_print(crawler)
    ok_response = stub(:code => "200")

    crawler.queue_link('/foo')
    crawler.expects(:follow).returns(ok_response).times(4) # (stub and "/") * 2
    crawler.queue_form(@form)
    crawler.expects(:submit).returns(ok_response).times(2)
    crawler.expects(:blip).times(6)

    crawler.times_to_crawl = 2
    crawler.crawl
  end

end
|
|
192
|
+
|
|
193
|
+
# report_results prints a summary line and hands off to generate_reports.
describe "report_results" do
  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "prints a final summary line" do
    @crawler.stubs(:generate_reports)
    @crawler.expects(:total_links_count).returns(42)
    @crawler.expects(:puts).with("Crawled 42 links and forms.")
    @crawler.report_results
  end

  it "delegates to generate_reports" do
    @crawler.stubs(:puts)
    @crawler.expects(:generate_reports)
    @crawler.report_results
  end

end
|
|
210
|
+
|
|
211
|
+
# blip prints a progress line only when not verbose and stdout is a tty.
describe "blip" do

  # Stubs $stdout.tty? first, then builds a crawler whose verbose flag is
  # stubbed to +verbose+.
  def crawler_on(tty, verbose)
    $stdout.stubs(:tty?).returns(tty)
    crawler = Relevance::Tarantula::Crawler.new
    crawler.stubs(:verbose).returns verbose
    crawler
  end

  it "blips the current progress if !verbose" do
    crawler = crawler_on(true, false)
    crawler.stubs(:timeout_if_too_long)
    crawler.expects(:print).with("\r 0 of 0 links completed ")
    crawler.blip
  end

  it "suppresses the blip message if not writing to a tty" do
    crawler = crawler_on(false, false)
    crawler.stubs(:timeout_if_too_long)
    crawler.expects(:print).never
    crawler.blip
  end

  it "blips nothing if verbose" do
    crawler = crawler_on(true, true)
    crawler.expects(:print).never
    crawler.blip
  end

end
|
|
240
|
+
|
|
241
|
+
# finished? reflects whether anything is left in the crawl queue.
describe "finished?" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "is finished when the links and forms are crawled" do
    @crawler.finished?.should == true
  end

  it "isn't finished when links remain" do
    @crawler.crawl_queue = [:stub_link]
    @crawler.finished?.should == false
  end

  it "isn't finished when forms remain" do
    @crawler.crawl_queue = [:stub_form]
    @crawler.finished?.should == false
  end

end
|
|
261
|
+
|
|
262
|
+
# finished? answers false twice before true, so the queue is crawled twice.
it "crawls links and forms again and again until finished?==true" do
  looping = Relevance::Tarantula::Crawler.new
  looping.expects(:finished?).times(3).returns(false, false, true)
  looping.expects(:crawl_the_queue).times(2)
  looping.do_crawl(1)
end
|
|
268
|
+
|
|
269
|
+
# Each reporter should see the saved result and then be asked to finish.
it "asks each reporter to write its report in report_dir" do
  crawler = Relevance::Tarantula::Crawler.new
  crawler.stubs(:report_dir).returns(test_output_dir)

  listener = stub_everything
  listener.expects(:report)
  listener.expects(:finish_report)
  crawler.reporters = [listener]

  failed_result = stub(:code => "404", :url => "/uh-oh")
  crawler.save_result failed_result
  crawler.generate_reports
end
|
|
279
|
+
|
|
280
|
+
# report_dir is expected to be "<rails_root>/tmp/tarantula".
it "builds a report dir relative to rails root" do
  rooted = Relevance::Tarantula::Crawler.new
  rooted.expects(:rails_root).returns("faux_rails_root")
  rooted.report_dir.should == "faux_rails_root/tmp/tarantula"
end
|
|
285
|
+
|
|
286
|
+
# A link becomes skippable only after it has been queued once.
it "skips links that are already queued" do
  crawler = Relevance::Tarantula::Crawler.new
  crawler.should_skip_link?(make_link("/foo")).should == false

  queued = crawler.queue_link("/foo")
  queued.should == make_link("/foo")

  crawler.should_skip_link?(make_link("/foo")).should == true
end
|
|
292
|
+
|
|
293
|
+
# Rules deciding which links and form submissions are never queued.
describe "link skipping" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "skips links that are too long" do
    @crawler.should_skip_link?(make_link("/foo")).should == false
    @crawler.max_url_length = 2
    @crawler.expects(:log).with("Skipping long url /foo")
    @crawler.should_skip_link?(make_link("/foo")).should == true
  end

  it "skips outbound links (those that begin with http)" do
    @crawler.expects(:log).with("Skipping http-anything")
    @crawler.should_skip_link?(make_link("http-anything")).should == true
  end

  it "skips javascript links (those that begin with javascript)" do
    @crawler.expects(:log).with("Skipping javascript-anything")
    @crawler.should_skip_link?(make_link("javascript-anything")).should == true
  end

  # The description used to say "begin with http", a copy-paste slip from the
  # outbound-link example; this example exercises the mailto prefix.
  it "skips mailto links (those that begin with mailto)" do
    @crawler.expects(:log).with("Skipping mailto-anything")
    @crawler.should_skip_link?(make_link("mailto-anything")).should == true
  end

  it 'skips blank links' do
    @crawler.queue_link(nil)
    @crawler.crawl_queue.should == []
    @crawler.queue_link("")
    @crawler.crawl_queue.should == []
  end

  it "logs and skips links that match a pattern" do
    @crawler.expects(:log).with("Skipping /the-red-button")
    @crawler.skip_uri_patterns << /red-button/
    @crawler.queue_link("/blue-button").should == make_link("/blue-button")
    @crawler.queue_link("/the-red-button").should == nil
  end

  it "logs and skips form submissions that match a pattern" do
    @crawler.expects(:log).with("Skipping /reset-password-form")
    @crawler.skip_uri_patterns << /reset-password/
    fs = stub_everything(:action => "/reset-password-form")
    @crawler.should_skip_form_submission?(fs).should == true
  end
end
|
|
340
|
+
|
|
341
|
+
# allow_NNN_for calls are forwarded to the crawler's response_code_handler.
describe "allow_nnn_for" do

  before { @crawler = Relevance::Tarantula::Crawler.new }

  it "installs result as a response_code_handler" do
    @crawler.response_code_handler.should == Relevance::Tarantula::Result
  end

  it "delegates to the response_code_handler" do
    fake_handler = mock
    fake_handler.expects(:allow_404_for).with(:stub)
    @crawler.response_code_handler = fake_handler
    @crawler.allow_404_for(:stub)
  end

  it "chains up to super for method_missing" do
    unknown_call = lambda { @crawler.foo }
    unknown_call.should raise_error(NoMethodError)
  end

end
|
|
361
|
+
|
|
362
|
+
# Crawl timing: recorded start/end times, elapsed time per pass, and the
# crawl timeout. Time.now is stubbed to return two fixed instants in order.
describe "timeouts" do

  it "sets start and end times for a single crawl" do
    start_time = Time.parse("March 1st, 2008 10:00am")
    end_time = Time.parse("March 1st, 2008 10:10am")
    Time.stubs(:now).returns(start_time, end_time)

    crawler = Relevance::Tarantula::Crawler.new
    stub_puts_and_print(crawler)
    crawler.proxy = stub_everything(:get => stub(:code => "200"))
    crawler.crawl
    crawler.crawl_start_times.first.should == start_time
    crawler.crawl_end_times.first.should == end_time
  end

  it "has elapsed time for a crawl" do
    start_time = Time.parse("March 1st, 2008 10:00am")
    elapsed_time_check = Time.parse("March 1st, 2008, 10:10:00am")
    Time.stubs(:now).returns(start_time, elapsed_time_check)

    crawler = Relevance::Tarantula::Crawler.new
    stub_puts_and_print(crawler)
    crawler.proxy = stub_everything(:get => stub(:code => "200"))
    crawler.crawl
    # "elasped" is how the crawler's method name is spelled; it is part of
    # the crawler's interface, so it is called as-is.
    crawler.elasped_time_for_pass(0).should == 600.seconds
  end

  it "raises out of the crawl if elapsed time is greater than the crawl timeout" do
    start_time = Time.parse("March 1st, 2008 10:00am")
    elapsed_time_check = Time.parse("March 1st, 2008, 10:35:00am")
    Time.stubs(:now).returns(start_time, elapsed_time_check)

    crawler = Relevance::Tarantula::Crawler.new
    crawler.crawl_timeout = 5.minutes

    crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
    crawler.proxy = stub
    crawler.proxy.stubs(:get).returns(stub(:code => "200"))

    stub_puts_and_print(crawler)
    # NOTE(review): a bare raise_error passes on any exception, including an
    # accidental NoMethodError from the stubs above. Narrow it to the
    # crawler's timeout error class once that class is confirmed.
    lambda {
      crawler.do_crawl(0)
    }.should raise_error
  end

end
|
|
408
|
+
|
|
409
|
+
end
|