relevance-tarantula 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README.rdoc +13 -0
- data/Rakefile +3 -1
- data/VERSION.yml +1 -1
- data/examples/relevance/tarantula/crawler_example.rb +312 -222
- data/lib/relevance/tarantula/crawler.rb +37 -15
- data/lib/relevance/tarantula/index.html.erb +2 -2
- data/lib/relevance/tarantula/tidy_handler.rb +1 -1
- metadata +23 -4
data/CHANGELOG
CHANGED
data/README.rdoc
CHANGED
@@ -134,6 +134,19 @@ This example adds custom attacks for both SQL injection and XSS. It also tells T
|
|
134
134
|
app 2 times. This is important for XSS attacks because the results won't appear until the second time
|
135
135
|
Tarantula performs the crawl.
|
136
136
|
|
137
|
+
== Timeout
|
138
|
+
|
139
|
+
You can specify a timeout for each specific crawl that Tarantula runs. For example:
|
140
|
+
|
141
|
+
def test_tarantula
|
142
|
+
t = tarantula_crawler(self)
|
143
|
+
t.times_to_crawl = 2
|
144
|
+
t.crawl_timeout = 5.minutes
|
145
|
+
t.crawl "/"
|
146
|
+
end
|
147
|
+
|
148
|
+
The above will crawl your app twice, and each specific crawl will time out if it takes longer than 5 minutes. You may need a timeout to keep the tarantula test time reasonable if your app is large or just happens to have a large number of 'never-ending' links, such as with any sort of "auto-admin" interface.
|
149
|
+
|
137
150
|
== Bugs/Requests
|
138
151
|
|
139
152
|
Please submit your bug reports, patches, or feature requests at Lighthouse:
|
data/Rakefile
CHANGED
@@ -19,6 +19,8 @@ begin
|
|
19
19
|
s.authors = ["Relevance, Inc."]
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
s.files = files.flatten
|
22
|
+
s.add_dependency 'htmlentities'
|
23
|
+
s.add_dependency 'hpricot'
|
22
24
|
end
|
23
25
|
rescue LoadError
|
24
26
|
puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
|
@@ -46,7 +48,7 @@ namespace :examples do
|
|
46
48
|
t.rcov_opts = %[--exclude "gems/*,/Library/Ruby/*,config/*" --text-summary --sort coverage --no-validator-links]
|
47
49
|
end
|
48
50
|
|
49
|
-
RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1]
|
51
|
+
RAILS_VERSIONS = %w[2.0.2 2.1.0 2.1.1 2.2.2 2.3.1 2.3.2]
|
50
52
|
|
51
53
|
desc "Run exmaples with multiple versions of rails"
|
52
54
|
task :multi_rails do
|
data/VERSION.yml
CHANGED
@@ -1,204 +1,246 @@
|
|
1
1
|
require File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb"))
|
2
2
|
|
3
|
-
describe
|
4
|
-
before {@crawler = Relevance::Tarantula::Crawler.new}
|
5
|
-
it "de-obfuscates unicode obfuscated urls" do
|
6
|
-
obfuscated_mailto = "mailto:"
|
7
|
-
@crawler.transform_url(obfuscated_mailto).should == "mailto:"
|
8
|
-
end
|
3
|
+
describe Relevance::Tarantula::Crawler do
|
9
4
|
|
10
|
-
|
11
|
-
@crawler.transform_url('http://host/path#name').should == 'http://host/path'
|
12
|
-
end
|
13
|
-
end
|
5
|
+
describe "transform_url" do
|
14
6
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
7
|
+
before { @crawler = Relevance::Tarantula::Crawler.new }
|
8
|
+
|
9
|
+
it "de-obfuscates unicode obfuscated urls" do
|
10
|
+
obfuscated_mailto = "mailto:"
|
11
|
+
@crawler.transform_url(obfuscated_mailto).should == "mailto:"
|
12
|
+
end
|
20
13
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
crawler.grab_log!.should == "fake log entry"
|
14
|
+
it "strips the trailing name portion of a link" do
|
15
|
+
@crawler.transform_url('http://host/path#name').should == 'http://host/path'
|
16
|
+
end
|
25
17
|
end
|
26
|
-
|
18
|
+
|
19
|
+
|
20
|
+
describe "log grabbing" do
|
27
21
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
crawler.stubs(:do_crawl).raises(Interrupt)
|
33
|
-
crawler.expects(:report_results)
|
34
|
-
$stderr.expects(:puts).with("CTRL-C")
|
35
|
-
crawler.crawl
|
36
|
-
end
|
37
|
-
end
|
22
|
+
it "returns nil if no grabber is specified" do
|
23
|
+
crawler = Relevance::Tarantula::Crawler.new
|
24
|
+
crawler.grab_log!.should == nil
|
25
|
+
end
|
38
26
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
:referrer => :action_stub,
|
46
|
-
:log => nil,
|
47
|
-
:method => :stub_method,
|
48
|
-
:test_name => nil}
|
49
|
-
result = Relevance::Tarantula::Result.new(result_args)
|
50
|
-
Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
|
51
|
-
crawler = Relevance::Tarantula::Crawler.new
|
52
|
-
crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
|
53
|
-
response)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
describe 'Relevance::Tarantula::Crawler#crawl' do
|
58
|
-
it 'queues the first url, does crawl, and then reports results' do
|
59
|
-
crawler = Relevance::Tarantula::Crawler.new
|
60
|
-
crawler.expects(:queue_link).with("/foobar")
|
61
|
-
crawler.expects(:do_crawl)
|
62
|
-
crawler.expects(:report_results)
|
63
|
-
crawler.crawl("/foobar")
|
27
|
+
it "returns grabber.grab if grabber is specified" do
|
28
|
+
crawler = Relevance::Tarantula::Crawler.new
|
29
|
+
crawler.log_grabber = stub(:grab! => "fake log entry")
|
30
|
+
crawler.grab_log!.should == "fake log entry"
|
31
|
+
end
|
32
|
+
|
64
33
|
end
|
65
34
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
crawler.expects(:transform_url).with("/url").returns("/transformed")
|
78
|
-
crawler.queue_link("/url")
|
79
|
-
crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
|
80
|
-
crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
|
35
|
+
describe "interrupt" do
|
36
|
+
|
37
|
+
it 'catches interruption and writes the partial report' do
|
38
|
+
crawler = Relevance::Tarantula::Crawler.new
|
39
|
+
crawler.stubs(:queue_link)
|
40
|
+
crawler.stubs(:do_crawl).raises(Interrupt)
|
41
|
+
crawler.expects(:report_results)
|
42
|
+
$stderr.expects(:puts).with("CTRL-C")
|
43
|
+
crawler.crawl
|
44
|
+
end
|
45
|
+
|
81
46
|
end
|
82
47
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
48
|
+
describe 'handle_form_results' do
|
49
|
+
|
50
|
+
it 'captures the result values (bugfix)' do
|
51
|
+
response = stub_everything
|
52
|
+
result_args = {:url => :action_stub,
|
53
|
+
:data => 'nil',
|
54
|
+
:response => response,
|
55
|
+
:referrer => :action_stub,
|
56
|
+
:log => nil,
|
57
|
+
:method => :stub_method,
|
58
|
+
:test_name => nil}
|
59
|
+
result = Relevance::Tarantula::Result.new(result_args)
|
60
|
+
Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
|
61
|
+
crawler = Relevance::Tarantula::Crawler.new
|
62
|
+
crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
|
63
|
+
response)
|
64
|
+
end
|
65
|
+
|
90
66
|
end
|
91
67
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
68
|
+
describe "crawl" do
|
69
|
+
|
70
|
+
it 'queues the first url, does crawl, and then reports results' do
|
71
|
+
crawler = Relevance::Tarantula::Crawler.new
|
72
|
+
crawler.expects(:queue_link).with("/foobar")
|
73
|
+
crawler.expects(:do_crawl)
|
74
|
+
crawler.expects(:report_results)
|
75
|
+
crawler.crawl("/foobar")
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'reports results even if the crawl fails' do
|
79
|
+
crawler = Relevance::Tarantula::Crawler.new
|
80
|
+
crawler.expects(:do_crawl).raises(RuntimeError)
|
81
|
+
crawler.expects(:report_results)
|
82
|
+
lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
|
83
|
+
end
|
84
|
+
|
96
85
|
end
|
97
86
|
|
98
|
-
|
87
|
+
describe "queueing" do
|
99
88
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
end
|
89
|
+
it 'queues and remembers links' do
|
90
|
+
crawler = Relevance::Tarantula::Crawler.new
|
91
|
+
crawler.expects(:transform_url).with("/url").returns("/transformed")
|
92
|
+
crawler.queue_link("/url")
|
93
|
+
crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
|
94
|
+
crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
|
95
|
+
end
|
107
96
|
|
108
|
-
|
97
|
+
it 'queues and remembers forms' do
|
98
|
+
crawler = Relevance::Tarantula::Crawler.new
|
99
|
+
form = Hpricot('<form action="/action" method="post"/>').at('form')
|
100
|
+
signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
|
101
|
+
crawler.queue_form(form)
|
102
|
+
crawler.forms_to_crawl.size.should == 1
|
103
|
+
crawler.form_signatures_queued.should == Set.new([signature])
|
104
|
+
end
|
109
105
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
response.content_type.should == "text/plain"
|
117
|
-
response.body.should == "ActiveRecord::RecordNotFound"
|
106
|
+
it 'remembers link referrer if there is one' do
|
107
|
+
crawler = Relevance::Tarantula::Crawler.new
|
108
|
+
crawler.queue_link("/url", "/some-referrer")
|
109
|
+
crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
|
110
|
+
end
|
111
|
+
|
118
112
|
end
|
113
|
+
|
114
|
+
describe "crawling" do
|
115
|
+
|
116
|
+
it "converts ActiveRecord::RecordNotFound into a 404" do
|
117
|
+
(proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
|
118
|
+
crawler = Relevance::Tarantula::Crawler.new
|
119
|
+
crawler.proxy = proxy
|
120
|
+
response = crawler.crawl_form stub_everything(:method => nil)
|
121
|
+
response.code.should == "404"
|
122
|
+
response.content_type.should == "text/plain"
|
123
|
+
response.body.should == "ActiveRecord::RecordNotFound"
|
124
|
+
end
|
119
125
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
126
|
+
it "does four things with each link: get, log, handle, and blip" do
|
127
|
+
crawler = Relevance::Tarantula::Crawler.new
|
128
|
+
crawler.proxy = stub
|
129
|
+
response = stub(:code => "200")
|
130
|
+
crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
|
131
|
+
crawler.proxy.expects(:get).returns(response).times(2)
|
132
|
+
crawler.expects(:log).times(2)
|
133
|
+
crawler.expects(:handle_link_results).times(2)
|
134
|
+
crawler.expects(:blip).times(2)
|
135
|
+
crawler.crawl_queued_links
|
136
|
+
crawler.links_to_crawl.should == []
|
137
|
+
end
|
138
|
+
|
139
|
+
it "invokes queued forms, logs responses, and calls handlers" do
|
140
|
+
crawler = Relevance::Tarantula::Crawler.new
|
141
|
+
crawler.forms_to_crawl << stub_everything(:method => "get",
|
142
|
+
:action => "/foo",
|
143
|
+
:data => "some data",
|
144
|
+
:to_s => "stub")
|
145
|
+
crawler.proxy = stub_everything(:send => stub(:code => "200" ))
|
146
|
+
crawler.expects(:log).with("Response 200 for stub")
|
147
|
+
crawler.expects(:blip)
|
148
|
+
crawler.crawl_queued_forms
|
149
|
+
end
|
150
|
+
|
151
|
+
it "breaks out early if a timeout is set" do
|
152
|
+
crawler = Relevance::Tarantula::Crawler.new
|
153
|
+
stub_puts_and_print(crawler)
|
154
|
+
crawler.proxy = stub
|
155
|
+
response = stub(:code => "200")
|
156
|
+
crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
|
157
|
+
crawler.proxy.expects(:get).returns(response).times(4)
|
158
|
+
crawler.forms_to_crawl << stub_everything(:method => "post",
|
159
|
+
:action => "/foo",
|
160
|
+
:data => "some data",
|
161
|
+
:to_s => "stub")
|
162
|
+
crawler.proxy.expects(:post).returns(response).times(2)
|
163
|
+
crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
|
164
|
+
crawler.times_to_crawl = 2
|
165
|
+
crawler.crawl
|
166
|
+
|
167
|
+
end
|
168
|
+
|
169
|
+
it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
|
170
|
+
crawler = Relevance::Tarantula::Crawler.new
|
171
|
+
stub_puts_and_print(crawler)
|
172
|
+
crawler.proxy = stub
|
173
|
+
response = stub(:code => "200")
|
174
|
+
crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
|
175
|
+
crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
|
176
|
+
crawler.forms_to_crawl << stub_everything(:method => "post",
|
177
|
+
:action => "/foo",
|
178
|
+
:data => "some data",
|
179
|
+
:to_s => "stub")
|
180
|
+
crawler.proxy.expects(:post).returns(response).times(2)
|
181
|
+
crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
|
182
|
+
crawler.times_to_crawl = 2
|
183
|
+
crawler.crawl
|
184
|
+
end
|
132
185
|
|
133
|
-
it "invokes queued forms, logs responses, and calls handlers" do
|
134
|
-
crawler = Relevance::Tarantula::Crawler.new
|
135
|
-
crawler.forms_to_crawl << stub_everything(:method => "get",
|
136
|
-
:action => "/foo",
|
137
|
-
:data => "some data",
|
138
|
-
:to_s => "stub")
|
139
|
-
crawler.proxy = stub_everything(:send => stub(:code => "200" ))
|
140
|
-
crawler.expects(:log).with("Response 200 for stub")
|
141
|
-
crawler.expects(:blip)
|
142
|
-
crawler.crawl_queued_forms
|
143
186
|
end
|
144
187
|
|
145
|
-
|
146
|
-
crawler = Relevance::Tarantula::Crawler.new
|
147
|
-
stub_puts_and_print(crawler)
|
148
|
-
crawler.proxy = stub
|
149
|
-
response = stub(:code => "200")
|
150
|
-
crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
|
151
|
-
crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
|
152
|
-
crawler.forms_to_crawl << stub_everything(:method => "post",
|
153
|
-
:action => "/foo",
|
154
|
-
:data => "some data",
|
155
|
-
:to_s => "stub")
|
156
|
-
crawler.proxy.expects(:post).returns(response).times(2)
|
157
|
-
crawler.expects(:links_completed_count).returns(*(0..6).to_a).times(6)
|
158
|
-
crawler.times_to_crawl = 2
|
159
|
-
crawler.crawl
|
160
|
-
end
|
161
|
-
end
|
188
|
+
describe "report_results" do
|
162
189
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
end
|
170
|
-
it "blips nothing if verbose" do
|
171
|
-
crawler = Relevance::Tarantula::Crawler.new
|
172
|
-
crawler.stubs(:verbose).returns true
|
173
|
-
crawler.expects(:print).never
|
174
|
-
crawler.blip
|
190
|
+
it "delegates to generate_reports" do
|
191
|
+
crawler = Relevance::Tarantula::Crawler.new
|
192
|
+
crawler.expects(:generate_reports)
|
193
|
+
crawler.report_results
|
194
|
+
end
|
195
|
+
|
175
196
|
end
|
176
|
-
|
197
|
+
|
198
|
+
describe "blip" do
|
177
199
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
200
|
+
it "blips the current progress if !verbose" do
|
201
|
+
crawler = Relevance::Tarantula::Crawler.new
|
202
|
+
crawler.stubs(:verbose).returns false
|
203
|
+
crawler.stubs(:timeout_if_too_long)
|
204
|
+
crawler.expects(:print).with("\r 0 of 0 links completed ")
|
205
|
+
crawler.blip
|
206
|
+
end
|
207
|
+
|
208
|
+
it "blips nothing if verbose" do
|
209
|
+
crawler = Relevance::Tarantula::Crawler.new
|
210
|
+
crawler.stubs(:verbose).returns true
|
211
|
+
crawler.expects(:print).never
|
212
|
+
crawler.blip
|
213
|
+
end
|
214
|
+
|
182
215
|
end
|
216
|
+
|
217
|
+
describe "finished?" do
|
183
218
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
end
|
219
|
+
it "is finished when the links and forms are crawled" do
|
220
|
+
crawler = Relevance::Tarantula::Crawler.new
|
221
|
+
crawler.finished?.should == true
|
222
|
+
end
|
189
223
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
224
|
+
it "isn't finished when links remain" do
|
225
|
+
crawler = Relevance::Tarantula::Crawler.new
|
226
|
+
crawler.links_to_crawl = [:stub_link]
|
227
|
+
crawler.finished?.should == false
|
228
|
+
end
|
229
|
+
|
230
|
+
it "isn't finished when links remain" do
|
231
|
+
crawler = Relevance::Tarantula::Crawler.new
|
232
|
+
crawler.forms_to_crawl = [:stub_form]
|
233
|
+
crawler.finished?.should == false
|
234
|
+
end
|
235
|
+
|
194
236
|
end
|
195
|
-
|
237
|
+
|
196
238
|
it "crawls links and forms again and again until finished?==true" do
|
197
239
|
crawler = Relevance::Tarantula::Crawler.new
|
198
240
|
crawler.expects(:finished?).times(3).returns(false, false, true)
|
199
241
|
crawler.expects(:crawl_queued_links).times(2)
|
200
242
|
crawler.expects(:crawl_queued_forms).times(2)
|
201
|
-
crawler.do_crawl
|
243
|
+
crawler.do_crawl(1)
|
202
244
|
end
|
203
245
|
|
204
246
|
it "asks each reporter to write its report in report_dir" do
|
@@ -225,72 +267,120 @@ describe 'Relevance::Tarantula::Crawler' do
|
|
225
267
|
crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
|
226
268
|
end
|
227
269
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
@crawler.expects(:log).with("Skipping long url /foo")
|
239
|
-
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
|
240
|
-
end
|
270
|
+
describe "link skipping" do
|
271
|
+
|
272
|
+
before { @crawler = Relevance::Tarantula::Crawler.new }
|
273
|
+
|
274
|
+
it "skips links that are too long" do
|
275
|
+
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
|
276
|
+
@crawler.max_url_length = 2
|
277
|
+
@crawler.expects(:log).with("Skipping long url /foo")
|
278
|
+
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
|
279
|
+
end
|
241
280
|
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
281
|
+
it "skips outbound links (those that begin with http)" do
|
282
|
+
@crawler.expects(:log).with("Skipping http-anything")
|
283
|
+
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
|
284
|
+
end
|
246
285
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
286
|
+
it "skips javascript links (those that begin with javascript)" do
|
287
|
+
@crawler.expects(:log).with("Skipping javascript-anything")
|
288
|
+
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
|
289
|
+
end
|
251
290
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
291
|
+
it "skips mailto links (those that begin with http)" do
|
292
|
+
@crawler.expects(:log).with("Skipping mailto-anything")
|
293
|
+
@crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
|
294
|
+
end
|
256
295
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
296
|
+
it 'skips blank links' do
|
297
|
+
@crawler.queue_link(nil)
|
298
|
+
@crawler.links_to_crawl.should == []
|
299
|
+
@crawler.queue_link("")
|
300
|
+
@crawler.links_to_crawl.should == []
|
301
|
+
end
|
263
302
|
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
303
|
+
it "logs and skips links that match a pattern" do
|
304
|
+
@crawler.expects(:log).with("Skipping /the-red-button")
|
305
|
+
@crawler.skip_uri_patterns << /red-button/
|
306
|
+
@crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
|
307
|
+
@crawler.queue_link("/the-red-button").should == nil
|
308
|
+
end
|
270
309
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
310
|
+
it "logs and skips form submissions that match a pattern" do
|
311
|
+
@crawler.expects(:log).with("Skipping /reset-password-form")
|
312
|
+
@crawler.skip_uri_patterns << /reset-password/
|
313
|
+
fs = stub_everything(:action => "/reset-password-form")
|
314
|
+
@crawler.should_skip_form_submission?(fs).should == true
|
315
|
+
end
|
276
316
|
end
|
277
|
-
|
317
|
+
|
318
|
+
describe "allow_nnn_for" do
|
278
319
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
320
|
+
it "installs result as a response_code_handler" do
|
321
|
+
crawler = Relevance::Tarantula::Crawler.new
|
322
|
+
crawler.response_code_handler.should == Relevance::Tarantula::Result
|
323
|
+
end
|
324
|
+
|
325
|
+
it "delegates to the response_code_handler" do
|
326
|
+
crawler = Relevance::Tarantula::Crawler.new
|
327
|
+
(response_code_handler = mock).expects(:allow_404_for).with(:stub)
|
328
|
+
crawler.response_code_handler = response_code_handler
|
329
|
+
crawler.allow_404_for(:stub)
|
330
|
+
end
|
331
|
+
|
332
|
+
it "chains up to super for method_missing" do
|
333
|
+
crawler = Relevance::Tarantula::Crawler.new
|
334
|
+
lambda{crawler.foo}.should raise_error(NoMethodError)
|
335
|
+
end
|
336
|
+
|
283
337
|
end
|
284
338
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
339
|
+
describe "timeouts" do
|
340
|
+
|
341
|
+
it "sets start and end times for a single crawl" do
|
342
|
+
start_time = Time.parse("March 1st, 2008 10:00am")
|
343
|
+
end_time = Time.parse("March 1st, 2008 10:10am")
|
344
|
+
Time.stubs(:now).returns(start_time, end_time)
|
345
|
+
|
346
|
+
crawler = Relevance::Tarantula::Crawler.new
|
347
|
+
stub_puts_and_print(crawler)
|
348
|
+
crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
|
349
|
+
crawler.crawl
|
350
|
+
crawler.crawl_start_times.first.should == start_time
|
351
|
+
crawler.crawl_end_times.first.should == end_time
|
352
|
+
end
|
353
|
+
|
354
|
+
it "has elasped time for a crawl" do
|
355
|
+
start_time = Time.parse("March 1st, 2008 10:00am")
|
356
|
+
elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am")
|
357
|
+
Time.stubs(:now).returns(start_time, elasped_time_check)
|
358
|
+
|
359
|
+
crawler = Relevance::Tarantula::Crawler.new
|
360
|
+
stub_puts_and_print(crawler)
|
361
|
+
crawler.proxy = stub_everything(:get => response = stub(:code => "200"))
|
362
|
+
crawler.crawl
|
363
|
+
crawler.elasped_time_for_pass(0).should == 600.seconds
|
364
|
+
end
|
365
|
+
|
366
|
+
it "raises out of the crawl if elasped time is greater then the crawl timeout" do
|
367
|
+
start_time = Time.parse("March 1st, 2008 10:00am")
|
368
|
+
elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am")
|
369
|
+
Time.stubs(:now).returns(start_time, elasped_time_check)
|
370
|
+
|
371
|
+
crawler = Relevance::Tarantula::Crawler.new
|
372
|
+
crawler.crawl_timeout = 5.minutes
|
373
|
+
|
374
|
+
crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
|
375
|
+
crawler.proxy = stub
|
376
|
+
crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))
|
377
|
+
|
378
|
+
stub_puts_and_print(crawler)
|
379
|
+
lambda {
|
380
|
+
crawler.do_crawl(0)
|
381
|
+
}.should raise_error
|
382
|
+
end
|
383
|
+
|
290
384
|
end
|
291
385
|
|
292
|
-
|
293
|
-
crawler = Relevance::Tarantula::Crawler.new
|
294
|
-
lambda{crawler.foo}.should raise_error(NoMethodError)
|
295
|
-
end
|
296
|
-
end
|
386
|
+
end
|
@@ -7,11 +7,13 @@ class Relevance::Tarantula::Crawler
|
|
7
7
|
extend Forwardable
|
8
8
|
include Relevance::Tarantula
|
9
9
|
|
10
|
+
class CrawlTimeout < RuntimeError; end
|
11
|
+
|
10
12
|
attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
|
11
13
|
:reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
|
12
14
|
:form_signatures_queued, :max_url_length, :response_code_handler,
|
13
|
-
:times_to_crawl, :fuzzers, :test_name
|
14
|
-
attr_reader :transform_url_patterns, :referrers, :failures, :successes
|
15
|
+
:times_to_crawl, :fuzzers, :test_name, :crawl_timeout
|
16
|
+
attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times
|
15
17
|
|
16
18
|
def initialize
|
17
19
|
@max_url_length = 1024
|
@@ -22,6 +24,8 @@ class Relevance::Tarantula::Crawler
|
|
22
24
|
@form_signatures_queued = Set.new
|
23
25
|
@links_to_crawl = []
|
24
26
|
@forms_to_crawl = []
|
27
|
+
@crawl_start_times, @crawl_end_times = [], []
|
28
|
+
@crawl_timeout = 20.minutes
|
25
29
|
@referrers = {}
|
26
30
|
@skip_uri_patterns = [
|
27
31
|
/^javascript/,
|
@@ -53,13 +57,18 @@ class Relevance::Tarantula::Crawler
|
|
53
57
|
orig_form_signatures_queued = @form_signatures_queued.dup
|
54
58
|
orig_links_to_crawl = @links_to_crawl.dup
|
55
59
|
orig_forms_to_crawl = @forms_to_crawl.dup
|
56
|
-
@times_to_crawl.times do |
|
60
|
+
@times_to_crawl.times do |num|
|
57
61
|
queue_link url
|
58
|
-
|
59
|
-
|
60
|
-
|
62
|
+
|
63
|
+
begin
|
64
|
+
do_crawl num
|
65
|
+
rescue CrawlTimeout => e
|
66
|
+
puts e.message
|
67
|
+
end
|
68
|
+
|
69
|
+
puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
|
61
70
|
|
62
|
-
if
|
71
|
+
if num + 1 < @times_to_crawl
|
63
72
|
@links_queued = orig_links_queued
|
64
73
|
@form_signatures_queued = orig_form_signatures_queued
|
65
74
|
@links_to_crawl = orig_links_to_crawl
|
@@ -77,19 +86,21 @@ class Relevance::Tarantula::Crawler
|
|
77
86
|
@links_to_crawl.empty? && @forms_to_crawl.empty?
|
78
87
|
end
|
79
88
|
|
80
|
-
def do_crawl
|
89
|
+
def do_crawl(number)
|
81
90
|
while (!finished?)
|
82
|
-
|
83
|
-
|
91
|
+
@crawl_start_times << Time.now
|
92
|
+
crawl_queued_links(number)
|
93
|
+
crawl_queued_forms(number)
|
94
|
+
@crawl_end_times << Time.now
|
84
95
|
end
|
85
96
|
end
|
86
97
|
|
87
|
-
def crawl_queued_links
|
98
|
+
def crawl_queued_links(number = 0)
|
88
99
|
while (link = @links_to_crawl.pop)
|
89
100
|
response = proxy.send(link.method, link.href)
|
90
101
|
log "Response #{response.code} for #{link}"
|
91
102
|
handle_link_results(link, response)
|
92
|
-
blip
|
103
|
+
blip(number)
|
93
104
|
end
|
94
105
|
end
|
95
106
|
|
@@ -124,13 +135,17 @@ class Relevance::Tarantula::Crawler
|
|
124
135
|
Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
|
125
136
|
end
|
126
137
|
|
127
|
-
def crawl_queued_forms
|
138
|
+
def crawl_queued_forms(number = 0)
|
128
139
|
while (form = @forms_to_crawl.pop)
|
129
140
|
response = crawl_form(form)
|
130
141
|
handle_form_results(form, response)
|
131
|
-
blip
|
142
|
+
blip(number)
|
132
143
|
end
|
133
144
|
end
|
145
|
+
|
146
|
+
def elasped_time_for_pass(num)
|
147
|
+
Time.now - crawl_start_times[num]
|
148
|
+
end
|
134
149
|
|
135
150
|
def grab_log!
|
136
151
|
@log_grabber && @log_grabber.grab!
|
@@ -234,9 +249,16 @@ class Relevance::Tarantula::Crawler
|
|
234
249
|
total_links_count - links_remaining_count
|
235
250
|
end
|
236
251
|
|
237
|
-
def blip
|
252
|
+
def blip(number = 0)
|
238
253
|
unless verbose
|
239
254
|
print "\r #{links_completed_count} of #{total_links_count} links completed "
|
255
|
+
timeout_if_too_long(number)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
def timeout_if_too_long(number = 0)
|
260
|
+
if elasped_time_for_pass(number) > crawl_timeout
|
261
|
+
raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
|
240
262
|
end
|
241
263
|
end
|
242
264
|
end
|
@@ -23,9 +23,9 @@
|
|
23
23
|
and lives at <a href="http://github.com/relevance/tarantula">http://github.com/relevance/tarantula</a>.</p>
|
24
24
|
<hr/>
|
25
25
|
</div>
|
26
|
-
<div id="page">
|
26
|
+
<div id="page">
|
27
27
|
<div id="tabs-container">
|
28
|
-
<ul class="tabs"
|
28
|
+
<ul class="tabs"> </ul>
|
29
29
|
</div>
|
30
30
|
|
31
31
|
<div id="results-container">
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relevance-tarantula
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Relevance, Inc.
|
@@ -9,10 +9,29 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-04-06 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: htmlentities
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: hpricot
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
16
35
|
description: A big hairy fuzzy spider that crawls your site, wreaking havoc
|
17
36
|
email: opensource@thinkrelevance.com
|
18
37
|
executables: []
|