cobweb 0.0.55 → 0.0.57
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +2 -1
- data/lib/cobweb.rb +4 -1
- data/lib/cobweb_crawler.rb +1 -4
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +1 -0
- data/lib/crawl_job.rb +4 -3
- data/lib/server.rb +2 -2
- data/lib/stats.rb +6 -1
- data/spec/cobweb/cobweb_job_spec.rb +162 -24
- data/spec/samples/sample_server.rb +21 -0
- data/spec/samples/sample_site/boxgrid.html +258 -0
- data/spec/samples/sample_site/css/accordion.css +45 -0
- data/spec/samples/sample_site/css/datatable.css +189 -0
- data/spec/samples/sample_site/css/datepicker.css +171 -0
- data/spec/samples/sample_site/css/form-buttons.css +180 -0
- data/spec/samples/sample_site/css/forms.css +489 -0
- data/spec/samples/sample_site/css/jquery.fancybox-1.3.4.css +282 -0
- data/spec/samples/sample_site/css/jquery.treeview.css +103 -0
- data/spec/samples/sample_site/css/link-buttons.css +187 -0
- data/spec/samples/sample_site/css/login.css +110 -0
- data/spec/samples/sample_site/css/menu.css +156 -0
- data/spec/samples/sample_site/css/messages.css +58 -0
- data/spec/samples/sample_site/css/modalbox.css +63 -0
- data/spec/samples/sample_site/css/statics.css +57 -0
- data/spec/samples/sample_site/css/style.css +497 -0
- data/spec/samples/sample_site/css/style_text.css +128 -0
- data/spec/samples/sample_site/css/tabs.css +58 -0
- data/spec/samples/sample_site/css/wysiwyg-editor.css +18 -0
- data/spec/samples/sample_site/css/wysiwyg.css +149 -0
- data/spec/samples/sample_site/css/wysiwyg.modal.css +69 -0
- data/spec/samples/sample_site/dashboard.html +852 -0
- data/spec/samples/sample_site/forms.html +329 -0
- data/spec/samples/sample_site/gallery.html +263 -0
- data/spec/samples/sample_site/gfx/back-menu.gif +0 -0
- data/spec/samples/sample_site/gfx/back-submenu.gif +0 -0
- data/spec/samples/sample_site/gfx/background.gif +0 -0
- data/spec/samples/sample_site/gfx/box-hide.png +0 -0
- data/spec/samples/sample_site/gfx/box-search.png +0 -0
- data/spec/samples/sample_site/gfx/box-title.gif +0 -0
- data/spec/samples/sample_site/gfx/code.gif +0 -0
- data/spec/samples/sample_site/gfx/datepicker-arrows.gif +0 -0
- data/spec/samples/sample_site/gfx/fancybox/blank.gif +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_close.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_loading.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_nav_left.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_nav_right.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_left.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_main.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_over.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_right.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancybox-x.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancybox.png +0 -0
- data/spec/samples/sample_site/gfx/forms/date-next.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/date-prev.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-checkbox.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-date.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-file.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-big.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-xl.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-radio.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-selectbox-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-selectbox.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-big.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-xl.gif +0 -0
- data/spec/samples/sample_site/gfx/icon-delete.png +0 -0
- data/spec/samples/sample_site/gfx/icon-edit.png +0 -0
- data/spec/samples/sample_site/gfx/icon-home.gif +0 -0
- data/spec/samples/sample_site/gfx/img-delete.png +0 -0
- data/spec/samples/sample_site/gfx/img-hover.png +0 -0
- data/spec/samples/sample_site/gfx/img-zoom.png +0 -0
- data/spec/samples/sample_site/gfx/jquery.wysiwyg.gif +0 -0
- data/spec/samples/sample_site/gfx/label-icons.gif +0 -0
- data/spec/samples/sample_site/gfx/label.gif +0 -0
- data/spec/samples/sample_site/gfx/li-down.gif +0 -0
- data/spec/samples/sample_site/gfx/li.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button-big.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button.gif +0 -0
- data/spec/samples/sample_site/gfx/loading-2.gif +0 -0
- data/spec/samples/sample_site/gfx/loading.gif +0 -0
- data/spec/samples/sample_site/gfx/logo.png +0 -0
- data/spec/samples/sample_site/gfx/modal-title.gif +0 -0
- data/spec/samples/sample_site/gfx/photos/00.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/01.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/01xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/02.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/02xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/03.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/03xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/04.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/04xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/05.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/05xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/06.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/06xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/07.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/07xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/08.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/08xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/09.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/09xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/10.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/10xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/11.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/11xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/12.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/12xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/13.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/13xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/14.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/14xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/15.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/15xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/search-button.gif +0 -0
- data/spec/samples/sample_site/gfx/search-input.gif +0 -0
- data/spec/samples/sample_site/gfx/slider-button.gif +0 -0
- data/spec/samples/sample_site/gfx/system-messages.gif +0 -0
- data/spec/samples/sample_site/gfx/table-asc-arrow.gif +0 -0
- data/spec/samples/sample_site/gfx/table-desc-arrow.gif +0 -0
- data/spec/samples/sample_site/gfx/table-first.gif +0 -0
- data/spec/samples/sample_site/gfx/table-last.gif +0 -0
- data/spec/samples/sample_site/gfx/table-next.gif +0 -0
- data/spec/samples/sample_site/gfx/table-number.gif +0 -0
- data/spec/samples/sample_site/gfx/table-prev.gif +0 -0
- data/spec/samples/sample_site/gfx/table-rows.gif +0 -0
- data/spec/samples/sample_site/gfx/table-search.gif +0 -0
- data/spec/samples/sample_site/gfx/table-thead.gif +0 -0
- data/spec/samples/sample_site/gfx/tooltip.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/ajax-loader.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/file.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/folder-closed.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/folder.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/minus.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/plus.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/treeview-default-line.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/treeview-default.gif +0 -0
- data/spec/samples/sample_site/index.html +852 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.image.js +284 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.link.js +210 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.table.js +151 -0
- data/spec/samples/sample_site/js/customInput.jquery.js +68 -0
- data/spec/samples/sample_site/js/excanvas.min.js +1 -0
- data/spec/samples/sample_site/js/hoverIntent.js +84 -0
- data/spec/samples/sample_site/js/inline.js +392 -0
- data/spec/samples/sample_site/js/jquery-1.7.1.min.js +4 -0
- data/spec/samples/sample_site/js/jquery-ui-select.js +522 -0
- data/spec/samples/sample_site/js/jquery-ui-timepicker-addon.js +1299 -0
- data/spec/samples/sample_site/js/jquery-ui.js +791 -0
- data/spec/samples/sample_site/js/jquery.dataTables.js +7440 -0
- data/spec/samples/sample_site/js/jquery.fancybox-1.3.4.js +1156 -0
- data/spec/samples/sample_site/js/jquery.filestyle.mini.js +2 -0
- data/spec/samples/sample_site/js/jquery.flot.js +2600 -0
- data/spec/samples/sample_site/js/jquery.flot.resize.min.js +60 -0
- data/spec/samples/sample_site/js/jquery.graphtable-0.2.js +179 -0
- data/spec/samples/sample_site/js/jquery.tipsy.js +104 -0
- data/spec/samples/sample_site/js/jquery.treeview.js +256 -0
- data/spec/samples/sample_site/js/jquery.wysiwyg.js +2454 -0
- data/spec/samples/sample_site/js/plugins/wysiwyg.rmFormat.js +348 -0
- data/spec/samples/sample_site/js/superfish.js +121 -0
- data/spec/samples/sample_site/js/supersubs.js +90 -0
- data/spec/samples/sample_site/more.html +503 -0
- data/spec/samples/sample_site/tables.html +845 -0
- data/spec/samples/sample_site/text/museosans-webfont.eot +0 -0
- data/spec/samples/sample_site/text/museosans-webfont.svg +231 -0
- data/spec/samples/sample_site/text/museosans-webfont.ttf +0 -0
- data/spec/samples/sample_site/text/museosans-webfont.woff +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.eot +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.svg +231 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.ttf +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.woff +0 -0
- data/spec/samples/sample_site/typography.html +192 -0
- data/spec/spec_helper.rb +10 -2
- metadata +196 -26
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -56,7 +56,9 @@ class Cobweb
|
|
56
56
|
|
57
57
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
58
58
|
uri = Addressable::URI.parse(base_url)
|
59
|
-
@options[:internal_urls] = [
|
59
|
+
@options[:internal_urls] = []
|
60
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
61
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
60
62
|
end
|
61
63
|
|
62
64
|
request.merge!(@options)
|
@@ -72,6 +74,7 @@ class Cobweb
|
|
72
74
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
73
75
|
|
74
76
|
Resque.enqueue(CrawlJob, request)
|
77
|
+
request
|
75
78
|
end
|
76
79
|
|
77
80
|
# Returns array of cookies from content
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -20,7 +20,7 @@ class CobwebCrawler
|
|
20
20
|
@options[:crawl_id] = @crawl_id
|
21
21
|
end
|
22
22
|
|
23
|
-
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{@crawl_id}")
|
23
|
+
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{@crawl_id}")
|
24
24
|
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
25
25
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
26
26
|
@debug = @options[:debug]
|
@@ -42,9 +42,6 @@ class CobwebCrawler
|
|
42
42
|
|
43
43
|
@crawl_options = crawl_options
|
44
44
|
|
45
|
-
puts "http://localhost:4567/statistics/#{@crawl_id}"
|
46
|
-
puts ""
|
47
|
-
|
48
45
|
@redis.sadd("queued", base_url) unless @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
|
49
46
|
crawl_counter = @redis.scard("crawled").to_i
|
50
47
|
queue_counter = @redis.scard("queued").to_i
|
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -46,6 +46,7 @@ class ContentLinkParser
|
|
46
46
|
data = link_data
|
47
47
|
links = data.keys.map{|key| data[key]}.flatten.uniq
|
48
48
|
links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
|
49
|
+
links = links.reject{|link| link =~ /\/([^\/]+?)\/\1\// }
|
49
50
|
links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
|
50
51
|
links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
|
51
52
|
links
|
data/lib/crawl_job.rb
CHANGED
@@ -36,6 +36,7 @@ class CrawlJob
|
|
36
36
|
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
37
37
|
|
38
38
|
## update statistics
|
39
|
+
@stats.update_status("Crawling #{content_request[:url]}...")
|
39
40
|
@stats.update_statistics(content)
|
40
41
|
|
41
42
|
# set the base url if this is the first page
|
@@ -72,7 +73,7 @@ class CrawlJob
|
|
72
73
|
if @redis.scard("queued") == 0
|
73
74
|
finished(content_request)
|
74
75
|
end
|
75
|
-
elsif @queue_counter == 0 || @crawl_counter
|
76
|
+
elsif @queue_counter == 0 || @crawl_counter > content_request[:crawl_limit].to_i
|
76
77
|
finished(content_request)
|
77
78
|
end
|
78
79
|
end
|
@@ -109,12 +110,12 @@ class CrawlJob
|
|
109
110
|
|
110
111
|
# Returns true if the crawl count is within limits
|
111
112
|
def self.within_crawl_limits?(crawl_limit)
|
112
|
-
crawl_limit.nil? or @crawl_counter
|
113
|
+
crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
|
113
114
|
end
|
114
115
|
|
115
116
|
# Returns true if the queue count is calculated to be still within limits when complete
|
116
117
|
def self.within_queue_limits?(crawl_limit)
|
117
|
-
within_crawl_limits?(crawl_limit)
|
118
|
+
within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
|
118
119
|
end
|
119
120
|
|
120
121
|
# Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
|
data/lib/server.rb
CHANGED
@@ -17,7 +17,7 @@ class Server < Sinatra::Base
|
|
17
17
|
|
18
18
|
@crawls = []
|
19
19
|
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
20
|
-
redis = NamespacedRedis.new({}, "cobweb-#{crawl_id}")
|
20
|
+
redis = NamespacedRedis.new({}, "cobweb-#{Cobweb.version}-#{crawl_id}")
|
21
21
|
stats = HashUtil.deep_symbolize_keys({
|
22
22
|
:crawl_details => redis.hgetall("crawl_details"),
|
23
23
|
:statistics => redis.hgetall("statistics"),
|
@@ -31,7 +31,7 @@ class Server < Sinatra::Base
|
|
31
31
|
|
32
32
|
# Sinatra Crawl Detail
|
33
33
|
get '/statistics/:crawl_id' do
|
34
|
-
redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
|
34
|
+
redis = NamespacedRedis.new({}, "cobweb-#{Cobweb.version}-#{params[:crawl_id]}")
|
35
35
|
|
36
36
|
@statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
|
37
37
|
if @statistics[:status_counts].nil?
|
data/lib/stats.rb
CHANGED
@@ -4,8 +4,9 @@ class Stats
|
|
4
4
|
|
5
5
|
# Sets up redis usage for statistics
|
6
6
|
def initialize(options)
|
7
|
+
options[:redis_options] = {} unless options.has_key? :redis_options
|
7
8
|
@full_redis = Redis.new(options[:redis_options])
|
8
|
-
@redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
|
9
|
+
@redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
|
9
10
|
end
|
10
11
|
|
11
12
|
# Sets up the crawl in statistics
|
@@ -26,6 +27,10 @@ class Stats
|
|
26
27
|
@redis.del "crawl_details"
|
27
28
|
end
|
28
29
|
|
30
|
+
def get_crawled
|
31
|
+
@redis.smembers "crawled"
|
32
|
+
end
|
33
|
+
|
29
34
|
# Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
|
30
35
|
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
|
31
36
|
|
@@ -1,41 +1,179 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
|
-
describe Cobweb do
|
3
|
+
describe Cobweb, :local_only => true do
|
4
|
+
|
5
|
+
before(:all) do
|
6
|
+
#store all existing resque process ids
|
7
|
+
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
8
|
+
|
9
|
+
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
10
|
+
puts "Starting Workers... Please Wait..."
|
11
|
+
`mkdir log`
|
12
|
+
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
|
+
puts "Workers Started."
|
14
|
+
|
15
|
+
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
16
|
+
@thin = nil
|
17
|
+
Thread.new do
|
18
|
+
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
19
|
+
end
|
20
|
+
|
21
|
+
# WAIT FOR START TO COMPLETE
|
22
|
+
sleep 1
|
23
|
+
|
24
|
+
end
|
4
25
|
|
5
26
|
before(:each) do
|
6
|
-
@base_url = "http://
|
7
|
-
@
|
27
|
+
@base_url = "http://localhost:3532/"
|
28
|
+
@base_page_count = 77
|
29
|
+
|
30
|
+
clear_queues
|
8
31
|
end
|
9
32
|
|
10
|
-
describe "
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
it "should not limit crawl"
|
22
|
-
it "should detect the end or crawl"
|
33
|
+
describe "with no crawl limit" do
|
34
|
+
before(:each) do
|
35
|
+
@request = {
|
36
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
37
|
+
:crawl_limit => nil,
|
38
|
+
:quiet => false,
|
39
|
+
:debug => false,
|
40
|
+
:cache => nil
|
41
|
+
}
|
42
|
+
@cobweb = Cobweb.new @request
|
23
43
|
end
|
24
|
-
|
25
|
-
|
44
|
+
|
45
|
+
it "should crawl entire site" do
|
46
|
+
crawl = @cobweb.start(@base_url)
|
47
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
48
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
49
|
+
Resque.size("cobweb_process_job").should == @base_page_count
|
50
|
+
end
|
51
|
+
it "detect crawl finished" do
|
52
|
+
crawl = @cobweb.start(@base_url)
|
53
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
54
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
55
|
+
Resque.size("cobweb_finished_job").should == 1
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "with a crawl limit" do
|
60
|
+
before(:each) do
|
26
61
|
@request = {
|
27
62
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
28
|
-
:
|
29
|
-
:
|
63
|
+
:quiet => true,
|
64
|
+
:cache => nil
|
30
65
|
}
|
66
|
+
@cobweb = Cobweb.new @request
|
67
|
+
end
|
68
|
+
|
69
|
+
describe "limit to 1" do
|
70
|
+
before(:each) do
|
71
|
+
@request[:crawl_limit] = 1
|
72
|
+
end
|
31
73
|
|
32
|
-
it "should
|
33
|
-
|
34
|
-
|
74
|
+
it "should not crawl the entire site" do
|
75
|
+
crawl = @cobweb.start(@base_url)
|
76
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
77
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
78
|
+
Resque.size("cobweb_process_job").should_not == @base_page_count
|
79
|
+
end
|
80
|
+
it "should only crawl 1 page" do
|
81
|
+
crawl = @cobweb.start(@base_url)
|
82
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
83
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
84
|
+
Resque.size("cobweb_process_job").should == 1
|
85
|
+
end
|
86
|
+
it "should notify of crawl finished" do
|
87
|
+
crawl = @cobweb.start(@base_url)
|
88
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
89
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
90
|
+
Resque.size("cobweb_finished_job").should == 1
|
91
|
+
end
|
92
|
+
|
35
93
|
end
|
94
|
+
|
95
|
+
describe "limit to 3" do
|
96
|
+
before(:each) do
|
97
|
+
@request[:crawl_limit] = 3
|
98
|
+
end
|
99
|
+
it "should not crawl the entire site" do
|
100
|
+
crawl = @cobweb.start(@base_url)
|
101
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
102
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
103
|
+
Resque.size("cobweb_process_job").should_not == @base_page_count
|
104
|
+
end
|
105
|
+
it "should notify of crawl finished" do
|
106
|
+
crawl = @cobweb.start(@base_url)
|
107
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
108
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
109
|
+
Resque.size("cobweb_finished_job").should == 1
|
110
|
+
end
|
111
|
+
it "should only crawl 3 pages" do
|
112
|
+
crawl = @cobweb.start(@base_url)
|
113
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
114
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
115
|
+
Resque.size("cobweb_process_job").should == 3
|
116
|
+
end
|
36
117
|
|
118
|
+
end
|
119
|
+
|
120
|
+
describe "limit to 100" do
|
121
|
+
before(:each) do
|
122
|
+
@request[:crawl_limit] = 100
|
123
|
+
end
|
37
124
|
|
125
|
+
it "should crawl the entire site" do
|
126
|
+
crawl = @cobweb.start(@base_url)
|
127
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
128
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
129
|
+
Resque.size("cobweb_process_job").should == @base_page_count
|
130
|
+
end
|
131
|
+
it "should notify of crawl finished" do
|
132
|
+
crawl = @cobweb.start(@base_url)
|
133
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
134
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
135
|
+
Resque.size("cobweb_finished_job").should == 1
|
136
|
+
end
|
137
|
+
it "should not crawl 100 pages" do
|
138
|
+
crawl = @cobweb.start(@base_url)
|
139
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
140
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
141
|
+
Resque.size("cobweb_process_job").should_not == 100
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
after(:all) do
|
147
|
+
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
148
|
+
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
|
149
|
+
IO.popen(command)
|
150
|
+
#@thin.stop!
|
151
|
+
end
|
152
|
+
|
153
|
+
end
|
154
|
+
|
155
|
+
def wait_for_crawl_finished(crawl_id, timeout=20)
|
156
|
+
counter = 0
|
157
|
+
while(running?(crawl_id) && counter < timeout) do
|
158
|
+
sleep 1
|
159
|
+
counter+=1
|
160
|
+
end
|
161
|
+
if counter > timeout
|
162
|
+
raise "End of crawl not detected"
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
def running?(crawl_id)
|
167
|
+
@stat.get_status != "Crawl Stopped"
|
168
|
+
end
|
169
|
+
|
170
|
+
def clear_queues
|
171
|
+
Resque.queues.each do |queue|
|
172
|
+
Resque.remove_queue(queue)
|
38
173
|
end
|
174
|
+
|
175
|
+
Resque.size("cobweb_process_job").should == 0
|
176
|
+
Resque.size("cobweb_finished_job").should == 0
|
177
|
+
end
|
39
178
|
|
40
179
|
|
41
|
-
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
class SampleServer
|
2
|
+
|
3
|
+
# This is the root of our app
|
4
|
+
@root = File.expand_path(File.dirname(__FILE__)) + "/sample_site"
|
5
|
+
|
6
|
+
def self.app
|
7
|
+
Proc.new { |env|
|
8
|
+
# Extract the requested path from the request
|
9
|
+
path = Rack::Utils.unescape(env['PATH_INFO'])
|
10
|
+
index_file = @root + "#{path}/index.html"
|
11
|
+
|
12
|
+
if File.exists?(index_file)
|
13
|
+
# Return the index
|
14
|
+
[200, {'Content-Type' => 'text/html'}, File.read(index_file)]
|
15
|
+
else
|
16
|
+
# Pass the request to the directory app
|
17
|
+
Rack::Directory.new(@root).call(env)
|
18
|
+
end
|
19
|
+
}
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,258 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
3
|
+
|
4
|
+
<head>
|
5
|
+
|
6
|
+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
7
|
+
<title>CleanDream</title>
|
8
|
+
|
9
|
+
<style type="text/css">
|
10
|
+
@import url("css/style.css");
|
11
|
+
@import url('css/style_text.css');
|
12
|
+
@import url('css/form-buttons.css');
|
13
|
+
@import url('css/link-buttons.css');
|
14
|
+
@import url('css/menu.css');
|
15
|
+
@import url('css/statics.css');
|
16
|
+
@import url('css/messages.css');
|
17
|
+
@import url('css/datatable.css');
|
18
|
+
@import url('css/accordion.css');
|
19
|
+
@import url('css/tabs.css');
|
20
|
+
@import url('css/forms.css');
|
21
|
+
@import url('css/datepicker.css');
|
22
|
+
@import url('css/modalbox.css');
|
23
|
+
@import url('css/jquery.fancybox-1.3.4.css');
|
24
|
+
@import url('css/jquery.treeview.css');
|
25
|
+
@import url('css/wysiwyg.css');
|
26
|
+
@import url('css/wysiwyg.modal.css');
|
27
|
+
@import url('css/wysiwyg-editor.css');
|
28
|
+
</style>
|
29
|
+
|
30
|
+
<script type="text/javascript" src="js/jquery-1.7.1.min.js"></script>
|
31
|
+
|
32
|
+
<!--[if lte IE 8]>
|
33
|
+
<script type="text/javascript" src="js/excanvas.min.js"></script>
|
34
|
+
<![endif]-->
|
35
|
+
|
36
|
+
</head>
|
37
|
+
|
38
|
+
<body>
|
39
|
+
|
40
|
+
<div class="container">
|
41
|
+
|
42
|
+
<div class="logo-labels">
|
43
|
+
<h1><a href="#">CleanDream</a></h1>
|
44
|
+
<ul>
|
45
|
+
<li><a href="#"><span>Settings</span></a></li>
|
46
|
+
<li class="usermessage"><a href="#"><span>1 new message</span></a></li>
|
47
|
+
<li class="logout"><a href="#"><span>Logout</span></a></li>
|
48
|
+
</ul>
|
49
|
+
</div>
|
50
|
+
|
51
|
+
<div class="menu-search">
|
52
|
+
<ul>
|
53
|
+
<li><a href="dashboard.html">Dashboard</a></li>
|
54
|
+
<li>
|
55
|
+
<a href="#">Dropdown</a>
|
56
|
+
<ul>
|
57
|
+
<li><a href="#">Submenu item 001</a></li>
|
58
|
+
<li><a href="#">Submenu item 002</a></li>
|
59
|
+
<li><a href="#">Submenu item 003</a></li>
|
60
|
+
<li class="current">
|
61
|
+
<a href="#">Submenu item 004</a>
|
62
|
+
<ul>
|
63
|
+
<li><a href="#">Subsubmenu item 001</a></li>
|
64
|
+
<li><a href="#">Subsubmenu item 002</a></li>
|
65
|
+
<li class="current"><a href="#">Subsubmenu item 003</a></li>
|
66
|
+
<li><a href="#">Subsubmenu item 003</a></li>
|
67
|
+
<li><a href="#">Subsubmenu item 005</a></li>
|
68
|
+
</ul>
|
69
|
+
</li>
|
70
|
+
<li> <a href="#">Submenu item 005</a> </li>
|
71
|
+
</ul>
|
72
|
+
</li>
|
73
|
+
<li><a href="typography.html">Typography</a></li>
|
74
|
+
<li class="current"><a href="boxgrid.html">Boxes Grid</a></li>
|
75
|
+
<li><a href="forms.html">Forms</a></li>
|
76
|
+
<li><a href="gallery.html">Gallery</a></li>
|
77
|
+
<li><a href="tables.html">Tables</a></li>
|
78
|
+
<li><a href="more.html">More</a></li>
|
79
|
+
</ul>
|
80
|
+
<div class="search">
|
81
|
+
<form action="" method="post">
|
82
|
+
<input type="text" value="Seachterm" />
|
83
|
+
<button type="submit"></button>
|
84
|
+
</form>
|
85
|
+
</div>
|
86
|
+
</div>
|
87
|
+
|
88
|
+
<div class="breadcrumbs">
|
89
|
+
<ul>
|
90
|
+
<li class="home"><a href="#"></a></li>
|
91
|
+
<li class="break">»</li>
|
92
|
+
<li><a href="#">Menu item</a></li>
|
93
|
+
<li class="break">»</li>
|
94
|
+
<li><a href="#">Menu item</a></li>
|
95
|
+
</ul>
|
96
|
+
</div>
|
97
|
+
|
98
|
+
<div class="section">
|
99
|
+
<div class="full">
|
100
|
+
<div class="box">
|
101
|
+
<div class="title">
|
102
|
+
<h2>Full box</h2>
|
103
|
+
<span class="hide"></span>
|
104
|
+
</div>
|
105
|
+
<div class="content">Place your content here.</div>
|
106
|
+
</div>
|
107
|
+
</div>
|
108
|
+
</div>
|
109
|
+
|
110
|
+
<div class="section">
|
111
|
+
<div class="small">
|
112
|
+
<div class="box">
|
113
|
+
<div class="title">
|
114
|
+
<h2>Small box</h2>
|
115
|
+
<span class="hide"></span>
|
116
|
+
</div>
|
117
|
+
<div class="content">Place your content here.</div>
|
118
|
+
</div>
|
119
|
+
</div>
|
120
|
+
|
121
|
+
<div class="big">
|
122
|
+
<div class="box">
|
123
|
+
<div class="title">
|
124
|
+
<h2>Big box</h2>
|
125
|
+
<span class="hide"></span>
|
126
|
+
</div>
|
127
|
+
<div class="content">Place your content here.</div>
|
128
|
+
</div>
|
129
|
+
</div>
|
130
|
+
</div>
|
131
|
+
|
132
|
+
<div class="section">
|
133
|
+
<div class="medium">
|
134
|
+
<div class="box">
|
135
|
+
<div class="title">
|
136
|
+
<h2>Medium box</h2>
|
137
|
+
<span class="hide"></span>
|
138
|
+
</div>
|
139
|
+
<div class="content">Place your content here.</div>
|
140
|
+
</div>
|
141
|
+
</div>
|
142
|
+
|
143
|
+
<div class="medium">
|
144
|
+
<div class="box">
|
145
|
+
<div class="title">
|
146
|
+
<h2>Medium box</h2>
|
147
|
+
<span class="hide"></span>
|
148
|
+
</div>
|
149
|
+
<div class="content">Place your content here.</div>
|
150
|
+
</div>
|
151
|
+
</div>
|
152
|
+
</div>
|
153
|
+
|
154
|
+
<div class="section">
|
155
|
+
<div class="medium">
|
156
|
+
<div class="box">
|
157
|
+
<div class="title">
|
158
|
+
<h2>Medium box</h2>
|
159
|
+
<span class="hide"></span>
|
160
|
+
</div>
|
161
|
+
<div class="content">Place your content here.</div>
|
162
|
+
</div>
|
163
|
+
</div>
|
164
|
+
|
165
|
+
<div class="small">
|
166
|
+
<div class="box">
|
167
|
+
<div class="title">
|
168
|
+
<h2>Small box</h2>
|
169
|
+
<span class="hide"></span>
|
170
|
+
</div>
|
171
|
+
<div class="content">Place your content here.</div>
|
172
|
+
</div>
|
173
|
+
</div>
|
174
|
+
|
175
|
+
<div class="small">
|
176
|
+
<div class="box">
|
177
|
+
<div class="title">
|
178
|
+
<h2>Small box</h2>
|
179
|
+
<span class="hide"></span>
|
180
|
+
</div>
|
181
|
+
<div class="content">Place your content here.</div>
|
182
|
+
</div>
|
183
|
+
</div>
|
184
|
+
</div>
|
185
|
+
|
186
|
+
<div class="section">
|
187
|
+
<div class="small">
|
188
|
+
<div class="box">
|
189
|
+
<div class="title">
|
190
|
+
<h2>Small box</h2>
|
191
|
+
<span class="hide"></span>
|
192
|
+
</div>
|
193
|
+
<div class="content">Place your content here.</div>
|
194
|
+
</div>
|
195
|
+
</div>
|
196
|
+
|
197
|
+
<div class="small">
|
198
|
+
<div class="box">
|
199
|
+
<div class="title">
|
200
|
+
<h2>Small box</h2>
|
201
|
+
<span class="hide"></span>
|
202
|
+
</div>
|
203
|
+
<div class="content">Place your content here.</div>
|
204
|
+
</div>
|
205
|
+
</div>
|
206
|
+
|
207
|
+
<div class="small">
|
208
|
+
<div class="box">
|
209
|
+
<div class="title">
|
210
|
+
<h2>Small box</h2>
|
211
|
+
<span class="hide"></span>
|
212
|
+
</div>
|
213
|
+
<div class="content">Place your content here.</div>
|
214
|
+
</div>
|
215
|
+
</div>
|
216
|
+
|
217
|
+
<div class="small">
|
218
|
+
<div class="box">
|
219
|
+
<div class="title">
|
220
|
+
<h2>Small box</h2>
|
221
|
+
<span class="hide"></span>
|
222
|
+
</div>
|
223
|
+
<div class="content">Place your content here.</div>
|
224
|
+
</div>
|
225
|
+
</div>
|
226
|
+
</div>
|
227
|
+
|
228
|
+
<div class="footer">
|
229
|
+
<div class="split">© Copyright <a href="#">website.com</a></div>
|
230
|
+
<div class="split right">Powered by <a href="http://themeforest.net/item/cleandream/490140" target="_blank">CleanDream</a></div>
|
231
|
+
</div>
|
232
|
+
</div>
|
233
|
+
|
234
|
+
<script type="text/javascript" src="js/superfish.js"></script>
|
235
|
+
<script type="text/javascript" src="js/supersubs.js"></script>
|
236
|
+
<script type="text/javascript" src="js/hoverIntent.js"></script>
|
237
|
+
<script type="text/javascript" src="js/jquery.flot.js"></script>
|
238
|
+
<script type="text/javascript" src="js/jquery.graphtable-0.2.js"></script>
|
239
|
+
<script type="text/javascript" src="js/jquery.flot.resize.min.js"></script>
|
240
|
+
<script type="text/javascript" src="js/jquery-ui.js"></script>
|
241
|
+
<script type="text/javascript" src="js/jquery-ui-select.js"></script>
|
242
|
+
<script type="text/javascript" src="js/customInput.jquery.js"></script>
|
243
|
+
<script type="text/javascript" src="js/jquery.dataTables.js"></script>
|
244
|
+
<script type="text/javascript" src="js/jquery.fancybox-1.3.4.js"></script>
|
245
|
+
<script type="text/javascript" src="js/jquery.filestyle.mini.js"></script>
|
246
|
+
<script type="text/javascript" src="js/jquery-ui-timepicker-addon.js"></script>
|
247
|
+
<script type="text/javascript" src="js/jquery.treeview.js"></script>
|
248
|
+
<script type="text/javascript" src="js/jquery.tipsy.js"></script>
|
249
|
+
<script type="text/javascript" src="js/jquery.wysiwyg.js"></script>
|
250
|
+
<script type="text/javascript" src="js/plugins/wysiwyg.rmFormat.js"></script>
|
251
|
+
<script type="text/javascript" src="js/controls/wysiwyg.image.js"></script>
|
252
|
+
<script type="text/javascript" src="js/controls/wysiwyg.link.js"></script>
|
253
|
+
<script type="text/javascript" src="js/controls/wysiwyg.table.js"></script>
|
254
|
+
<script type="text/javascript" src="js/inline.js"></script>
|
255
|
+
|
256
|
+
</body>
|
257
|
+
|
258
|
+
</html>
|