cobweb 0.0.55 → 0.0.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +2 -1
- data/lib/cobweb.rb +4 -1
- data/lib/cobweb_crawler.rb +1 -4
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +1 -0
- data/lib/crawl_job.rb +4 -3
- data/lib/server.rb +2 -2
- data/lib/stats.rb +6 -1
- data/spec/cobweb/cobweb_job_spec.rb +162 -24
- data/spec/samples/sample_server.rb +21 -0
- data/spec/samples/sample_site/boxgrid.html +258 -0
- data/spec/samples/sample_site/css/accordion.css +45 -0
- data/spec/samples/sample_site/css/datatable.css +189 -0
- data/spec/samples/sample_site/css/datepicker.css +171 -0
- data/spec/samples/sample_site/css/form-buttons.css +180 -0
- data/spec/samples/sample_site/css/forms.css +489 -0
- data/spec/samples/sample_site/css/jquery.fancybox-1.3.4.css +282 -0
- data/spec/samples/sample_site/css/jquery.treeview.css +103 -0
- data/spec/samples/sample_site/css/link-buttons.css +187 -0
- data/spec/samples/sample_site/css/login.css +110 -0
- data/spec/samples/sample_site/css/menu.css +156 -0
- data/spec/samples/sample_site/css/messages.css +58 -0
- data/spec/samples/sample_site/css/modalbox.css +63 -0
- data/spec/samples/sample_site/css/statics.css +57 -0
- data/spec/samples/sample_site/css/style.css +497 -0
- data/spec/samples/sample_site/css/style_text.css +128 -0
- data/spec/samples/sample_site/css/tabs.css +58 -0
- data/spec/samples/sample_site/css/wysiwyg-editor.css +18 -0
- data/spec/samples/sample_site/css/wysiwyg.css +149 -0
- data/spec/samples/sample_site/css/wysiwyg.modal.css +69 -0
- data/spec/samples/sample_site/dashboard.html +852 -0
- data/spec/samples/sample_site/forms.html +329 -0
- data/spec/samples/sample_site/gallery.html +263 -0
- data/spec/samples/sample_site/gfx/back-menu.gif +0 -0
- data/spec/samples/sample_site/gfx/back-submenu.gif +0 -0
- data/spec/samples/sample_site/gfx/background.gif +0 -0
- data/spec/samples/sample_site/gfx/box-hide.png +0 -0
- data/spec/samples/sample_site/gfx/box-search.png +0 -0
- data/spec/samples/sample_site/gfx/box-title.gif +0 -0
- data/spec/samples/sample_site/gfx/code.gif +0 -0
- data/spec/samples/sample_site/gfx/datepicker-arrows.gif +0 -0
- data/spec/samples/sample_site/gfx/fancybox/blank.gif +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_close.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_loading.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_nav_left.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_nav_right.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_left.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_main.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_over.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancy_title_right.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancybox-x.png +0 -0
- data/spec/samples/sample_site/gfx/fancybox/fancybox.png +0 -0
- data/spec/samples/sample_site/gfx/forms/date-next.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/date-prev.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-checkbox.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-date.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-file.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-big.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-input-xl.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-radio.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-selectbox-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-selectbox.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-big.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-small.gif +0 -0
- data/spec/samples/sample_site/gfx/forms/forms-textarea-xl.gif +0 -0
- data/spec/samples/sample_site/gfx/icon-delete.png +0 -0
- data/spec/samples/sample_site/gfx/icon-edit.png +0 -0
- data/spec/samples/sample_site/gfx/icon-home.gif +0 -0
- data/spec/samples/sample_site/gfx/img-delete.png +0 -0
- data/spec/samples/sample_site/gfx/img-hover.png +0 -0
- data/spec/samples/sample_site/gfx/img-zoom.png +0 -0
- data/spec/samples/sample_site/gfx/jquery.wysiwyg.gif +0 -0
- data/spec/samples/sample_site/gfx/label-icons.gif +0 -0
- data/spec/samples/sample_site/gfx/label.gif +0 -0
- data/spec/samples/sample_site/gfx/li-down.gif +0 -0
- data/spec/samples/sample_site/gfx/li.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button-big.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button-medium.gif +0 -0
- data/spec/samples/sample_site/gfx/link-button.gif +0 -0
- data/spec/samples/sample_site/gfx/loading-2.gif +0 -0
- data/spec/samples/sample_site/gfx/loading.gif +0 -0
- data/spec/samples/sample_site/gfx/logo.png +0 -0
- data/spec/samples/sample_site/gfx/modal-title.gif +0 -0
- data/spec/samples/sample_site/gfx/photos/00.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/01.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/01xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/02.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/02xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/03.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/03xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/04.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/04xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/05.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/05xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/06.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/06xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/07.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/07xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/08.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/08xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/09.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/09xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/10.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/10xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/11.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/11xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/12.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/12xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/13.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/13xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/14.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/14xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/15.jpg +0 -0
- data/spec/samples/sample_site/gfx/photos/15xl.jpg +0 -0
- data/spec/samples/sample_site/gfx/search-button.gif +0 -0
- data/spec/samples/sample_site/gfx/search-input.gif +0 -0
- data/spec/samples/sample_site/gfx/slider-button.gif +0 -0
- data/spec/samples/sample_site/gfx/system-messages.gif +0 -0
- data/spec/samples/sample_site/gfx/table-asc-arrow.gif +0 -0
- data/spec/samples/sample_site/gfx/table-desc-arrow.gif +0 -0
- data/spec/samples/sample_site/gfx/table-first.gif +0 -0
- data/spec/samples/sample_site/gfx/table-last.gif +0 -0
- data/spec/samples/sample_site/gfx/table-next.gif +0 -0
- data/spec/samples/sample_site/gfx/table-number.gif +0 -0
- data/spec/samples/sample_site/gfx/table-prev.gif +0 -0
- data/spec/samples/sample_site/gfx/table-rows.gif +0 -0
- data/spec/samples/sample_site/gfx/table-search.gif +0 -0
- data/spec/samples/sample_site/gfx/table-thead.gif +0 -0
- data/spec/samples/sample_site/gfx/tooltip.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/ajax-loader.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/file.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/folder-closed.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/folder.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/minus.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/plus.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/treeview-default-line.gif +0 -0
- data/spec/samples/sample_site/gfx/treeview/treeview-default.gif +0 -0
- data/spec/samples/sample_site/index.html +852 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.image.js +284 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.link.js +210 -0
- data/spec/samples/sample_site/js/controls/wysiwyg.table.js +151 -0
- data/spec/samples/sample_site/js/customInput.jquery.js +68 -0
- data/spec/samples/sample_site/js/excanvas.min.js +1 -0
- data/spec/samples/sample_site/js/hoverIntent.js +84 -0
- data/spec/samples/sample_site/js/inline.js +392 -0
- data/spec/samples/sample_site/js/jquery-1.7.1.min.js +4 -0
- data/spec/samples/sample_site/js/jquery-ui-select.js +522 -0
- data/spec/samples/sample_site/js/jquery-ui-timepicker-addon.js +1299 -0
- data/spec/samples/sample_site/js/jquery-ui.js +791 -0
- data/spec/samples/sample_site/js/jquery.dataTables.js +7440 -0
- data/spec/samples/sample_site/js/jquery.fancybox-1.3.4.js +1156 -0
- data/spec/samples/sample_site/js/jquery.filestyle.mini.js +2 -0
- data/spec/samples/sample_site/js/jquery.flot.js +2600 -0
- data/spec/samples/sample_site/js/jquery.flot.resize.min.js +60 -0
- data/spec/samples/sample_site/js/jquery.graphtable-0.2.js +179 -0
- data/spec/samples/sample_site/js/jquery.tipsy.js +104 -0
- data/spec/samples/sample_site/js/jquery.treeview.js +256 -0
- data/spec/samples/sample_site/js/jquery.wysiwyg.js +2454 -0
- data/spec/samples/sample_site/js/plugins/wysiwyg.rmFormat.js +348 -0
- data/spec/samples/sample_site/js/superfish.js +121 -0
- data/spec/samples/sample_site/js/supersubs.js +90 -0
- data/spec/samples/sample_site/more.html +503 -0
- data/spec/samples/sample_site/tables.html +845 -0
- data/spec/samples/sample_site/text/museosans-webfont.eot +0 -0
- data/spec/samples/sample_site/text/museosans-webfont.svg +231 -0
- data/spec/samples/sample_site/text/museosans-webfont.ttf +0 -0
- data/spec/samples/sample_site/text/museosans-webfont.woff +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.eot +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.svg +231 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.ttf +0 -0
- data/spec/samples/sample_site/text/museosans_bold-webfont.woff +0 -0
- data/spec/samples/sample_site/typography.html +192 -0
- data/spec/spec_helper.rb +10 -2
- metadata +196 -26
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
|
@@ -56,7 +56,9 @@ class Cobweb
|
|
|
56
56
|
|
|
57
57
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
|
58
58
|
uri = Addressable::URI.parse(base_url)
|
|
59
|
-
@options[:internal_urls] = [
|
|
59
|
+
@options[:internal_urls] = []
|
|
60
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
|
61
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
|
60
62
|
end
|
|
61
63
|
|
|
62
64
|
request.merge!(@options)
|
|
@@ -72,6 +74,7 @@ class Cobweb
|
|
|
72
74
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
|
73
75
|
|
|
74
76
|
Resque.enqueue(CrawlJob, request)
|
|
77
|
+
request
|
|
75
78
|
end
|
|
76
79
|
|
|
77
80
|
# Returns array of cookies from content
|
data/lib/cobweb_crawler.rb
CHANGED
|
@@ -20,7 +20,7 @@ class CobwebCrawler
|
|
|
20
20
|
@options[:crawl_id] = @crawl_id
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
-
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{@crawl_id}")
|
|
23
|
+
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{@crawl_id}")
|
|
24
24
|
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
|
25
25
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
|
26
26
|
@debug = @options[:debug]
|
|
@@ -42,9 +42,6 @@ class CobwebCrawler
|
|
|
42
42
|
|
|
43
43
|
@crawl_options = crawl_options
|
|
44
44
|
|
|
45
|
-
puts "http://localhost:4567/statistics/#{@crawl_id}"
|
|
46
|
-
puts ""
|
|
47
|
-
|
|
48
45
|
@redis.sadd("queued", base_url) unless @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
|
|
49
46
|
crawl_counter = @redis.scard("crawled").to_i
|
|
50
47
|
queue_counter = @redis.scard("queued").to_i
|
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
|
@@ -46,6 +46,7 @@ class ContentLinkParser
|
|
|
46
46
|
data = link_data
|
|
47
47
|
links = data.keys.map{|key| data[key]}.flatten.uniq
|
|
48
48
|
links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
|
|
49
|
+
links = links.reject{|link| link =~ /\/([^\/]+?)\/\1\// }
|
|
49
50
|
links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
|
|
50
51
|
links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
|
|
51
52
|
links
|
data/lib/crawl_job.rb
CHANGED
|
@@ -36,6 +36,7 @@ class CrawlJob
|
|
|
36
36
|
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
|
37
37
|
|
|
38
38
|
## update statistics
|
|
39
|
+
@stats.update_status("Crawling #{content_request[:url]}...")
|
|
39
40
|
@stats.update_statistics(content)
|
|
40
41
|
|
|
41
42
|
# set the base url if this is the first page
|
|
@@ -72,7 +73,7 @@ class CrawlJob
|
|
|
72
73
|
if @redis.scard("queued") == 0
|
|
73
74
|
finished(content_request)
|
|
74
75
|
end
|
|
75
|
-
elsif @queue_counter == 0 || @crawl_counter
|
|
76
|
+
elsif @queue_counter == 0 || @crawl_counter > content_request[:crawl_limit].to_i
|
|
76
77
|
finished(content_request)
|
|
77
78
|
end
|
|
78
79
|
end
|
|
@@ -109,12 +110,12 @@ class CrawlJob
|
|
|
109
110
|
|
|
110
111
|
# Returns true if the crawl count is within limits
|
|
111
112
|
def self.within_crawl_limits?(crawl_limit)
|
|
112
|
-
crawl_limit.nil? or @crawl_counter
|
|
113
|
+
crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
|
|
113
114
|
end
|
|
114
115
|
|
|
115
116
|
# Returns true if the queue count is calculated to be still within limits when complete
|
|
116
117
|
def self.within_queue_limits?(crawl_limit)
|
|
117
|
-
within_crawl_limits?(crawl_limit)
|
|
118
|
+
within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
|
|
118
119
|
end
|
|
119
120
|
|
|
120
121
|
# Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
|
data/lib/server.rb
CHANGED
|
@@ -17,7 +17,7 @@ class Server < Sinatra::Base
|
|
|
17
17
|
|
|
18
18
|
@crawls = []
|
|
19
19
|
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
|
20
|
-
redis = NamespacedRedis.new({}, "cobweb-#{crawl_id}")
|
|
20
|
+
redis = NamespacedRedis.new({}, "cobweb-#{Cobweb.version}-#{crawl_id}")
|
|
21
21
|
stats = HashUtil.deep_symbolize_keys({
|
|
22
22
|
:crawl_details => redis.hgetall("crawl_details"),
|
|
23
23
|
:statistics => redis.hgetall("statistics"),
|
|
@@ -31,7 +31,7 @@ class Server < Sinatra::Base
|
|
|
31
31
|
|
|
32
32
|
# Sinatra Crawl Detail
|
|
33
33
|
get '/statistics/:crawl_id' do
|
|
34
|
-
redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
|
|
34
|
+
redis = NamespacedRedis.new({}, "cobweb-#{Cobweb.version}-#{params[:crawl_id]}")
|
|
35
35
|
|
|
36
36
|
@statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
|
|
37
37
|
if @statistics[:status_counts].nil?
|
data/lib/stats.rb
CHANGED
|
@@ -4,8 +4,9 @@ class Stats
|
|
|
4
4
|
|
|
5
5
|
# Sets up redis usage for statistics
|
|
6
6
|
def initialize(options)
|
|
7
|
+
options[:redis_options] = {} unless options.has_key? :redis_options
|
|
7
8
|
@full_redis = Redis.new(options[:redis_options])
|
|
8
|
-
@redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
|
|
9
|
+
@redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
|
|
9
10
|
end
|
|
10
11
|
|
|
11
12
|
# Sets up the crawl in statistics
|
|
@@ -26,6 +27,10 @@ class Stats
|
|
|
26
27
|
@redis.del "crawl_details"
|
|
27
28
|
end
|
|
28
29
|
|
|
30
|
+
# Returns the members of the "crawled" set held in this instance's redis connection.
def get_crawled
  @redis.smembers("crawled")
end
|
|
33
|
+
|
|
29
34
|
# Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
|
|
30
35
|
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
|
|
31
36
|
|
|
@@ -1,41 +1,179 @@
|
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
2
2
|
|
|
3
|
-
describe Cobweb do
|
|
3
|
+
describe Cobweb, :local_only => true do
|
|
4
|
+
|
|
5
|
+
before(:all) do
|
|
6
|
+
#store all existing resque process ids
|
|
7
|
+
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
|
8
|
+
|
|
9
|
+
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
|
10
|
+
puts "Starting Workers... Please Wait..."
|
|
11
|
+
`mkdir log`
|
|
12
|
+
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
|
|
13
|
+
puts "Workers Started."
|
|
14
|
+
|
|
15
|
+
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
|
16
|
+
@thin = nil
|
|
17
|
+
Thread.new do
|
|
18
|
+
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# WAIT FOR START TO COMPLETE
|
|
22
|
+
sleep 1
|
|
23
|
+
|
|
24
|
+
end
|
|
4
25
|
|
|
5
26
|
before(:each) do
|
|
6
|
-
@base_url = "http://
|
|
7
|
-
@
|
|
27
|
+
@base_url = "http://localhost:3532/"
|
|
28
|
+
@base_page_count = 77
|
|
29
|
+
|
|
30
|
+
clear_queues
|
|
8
31
|
end
|
|
9
32
|
|
|
10
|
-
describe "
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
it "should not limit crawl"
|
|
22
|
-
it "should detect the end or crawl"
|
|
33
|
+
describe "with no crawl limit" do
|
|
34
|
+
before(:each) do
|
|
35
|
+
@request = {
|
|
36
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
|
37
|
+
:crawl_limit => nil,
|
|
38
|
+
:quiet => false,
|
|
39
|
+
:debug => false,
|
|
40
|
+
:cache => nil
|
|
41
|
+
}
|
|
42
|
+
@cobweb = Cobweb.new @request
|
|
23
43
|
end
|
|
24
|
-
|
|
25
|
-
|
|
44
|
+
|
|
45
|
+
it "should crawl entire site" do
|
|
46
|
+
crawl = @cobweb.start(@base_url)
|
|
47
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
48
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
49
|
+
Resque.size("cobweb_process_job").should == @base_page_count
|
|
50
|
+
end
|
|
51
|
+
it "detect crawl finished" do
|
|
52
|
+
crawl = @cobweb.start(@base_url)
|
|
53
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
54
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
55
|
+
Resque.size("cobweb_finished_job").should == 1
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
describe "with a crawl limit" do
|
|
60
|
+
before(:each) do
|
|
26
61
|
@request = {
|
|
27
62
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
|
28
|
-
:
|
|
29
|
-
:
|
|
63
|
+
:quiet => true,
|
|
64
|
+
:cache => nil
|
|
30
65
|
}
|
|
66
|
+
@cobweb = Cobweb.new @request
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
describe "limit to 1" do
|
|
70
|
+
before(:each) do
|
|
71
|
+
@request[:crawl_limit] = 1
|
|
72
|
+
end
|
|
31
73
|
|
|
32
|
-
it "should
|
|
33
|
-
|
|
34
|
-
|
|
74
|
+
it "should not crawl the entire site" do
|
|
75
|
+
crawl = @cobweb.start(@base_url)
|
|
76
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
77
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
78
|
+
Resque.size("cobweb_process_job").should_not == @base_page_count
|
|
79
|
+
end
|
|
80
|
+
it "should only crawl 1 page" do
|
|
81
|
+
crawl = @cobweb.start(@base_url)
|
|
82
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
83
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
84
|
+
Resque.size("cobweb_process_job").should == 1
|
|
85
|
+
end
|
|
86
|
+
it "should notify of crawl finished" do
|
|
87
|
+
crawl = @cobweb.start(@base_url)
|
|
88
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
89
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
90
|
+
Resque.size("cobweb_finished_job").should == 1
|
|
91
|
+
end
|
|
92
|
+
|
|
35
93
|
end
|
|
94
|
+
|
|
95
|
+
describe "limit to 3" do
|
|
96
|
+
before(:each) do
|
|
97
|
+
@request[:crawl_limit] = 3
|
|
98
|
+
end
|
|
99
|
+
it "should not crawl the entire site" do
|
|
100
|
+
crawl = @cobweb.start(@base_url)
|
|
101
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
102
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
103
|
+
Resque.size("cobweb_process_job").should_not == @base_page_count
|
|
104
|
+
end
|
|
105
|
+
it "should notify of crawl finished" do
|
|
106
|
+
crawl = @cobweb.start(@base_url)
|
|
107
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
108
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
109
|
+
Resque.size("cobweb_finished_job").should == 1
|
|
110
|
+
end
|
|
111
|
+
it "should only crawl 3 pages" do
|
|
112
|
+
crawl = @cobweb.start(@base_url)
|
|
113
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
114
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
115
|
+
Resque.size("cobweb_process_job").should == 3
|
|
116
|
+
end
|
|
36
117
|
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
describe "limit to 100" do
|
|
121
|
+
before(:each) do
|
|
122
|
+
@request[:crawl_limit] = 100
|
|
123
|
+
end
|
|
37
124
|
|
|
125
|
+
it "should crawl the entire site" do
|
|
126
|
+
crawl = @cobweb.start(@base_url)
|
|
127
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
128
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
129
|
+
Resque.size("cobweb_process_job").should == @base_page_count
|
|
130
|
+
end
|
|
131
|
+
it "should notify of crawl finished" do
|
|
132
|
+
crawl = @cobweb.start(@base_url)
|
|
133
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
134
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
135
|
+
Resque.size("cobweb_finished_job").should == 1
|
|
136
|
+
end
|
|
137
|
+
it "should not crawl 100 pages" do
|
|
138
|
+
crawl = @cobweb.start(@base_url)
|
|
139
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
|
140
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
|
141
|
+
Resque.size("cobweb_process_job").should_not == 100
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
after(:all) do
|
|
147
|
+
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
|
148
|
+
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
|
|
149
|
+
IO.popen(command)
|
|
150
|
+
#@thin.stop!
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
# Blocks until the crawl identified by crawl_id is no longer running, polling
# once per second for at most +timeout+ seconds.
#
# crawl_id - identifier passed through to running?
# timeout  - maximum number of one-second polls before giving up (default 20)
#
# Raises RuntimeError ("End of crawl not detected") when the crawl is still
# running after the timeout has elapsed.
def wait_for_crawl_finished(crawl_id, timeout=20)
  counter = 0
  while running?(crawl_id) && counter < timeout
    sleep 1
    counter += 1
  end
  # The loop stops as soon as counter reaches timeout, so a "counter > timeout"
  # check can never fire; re-check the crawl state itself instead.
  if running?(crawl_id)
    raise "End of crawl not detected"
  end
end
|
|
165
|
+
|
|
166
|
+
# Reports whether the crawl tracked by @stat is still in progress, i.e. its
# status is anything other than "Crawl Stopped". The crawl_id argument is
# accepted but not consulted; the status comes from @stat.
def running?(crawl_id)
  current_status = @stat.get_status
  !(current_status == "Crawl Stopped")
end
|
|
169
|
+
|
|
170
|
+
# Removes every queue Resque currently reports, then asserts that the
# process and finished queues are both empty.
def clear_queues
  Resque.queues.each { |queue_name| Resque.remove_queue(queue_name) }

  ["cobweb_process_job", "cobweb_finished_job"].each do |queue_name|
    Resque.size(queue_name).should == 0
  end
end
|
|
39
178
|
|
|
40
179
|
|
|
41
|
-
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Minimal Rack application that serves the static files under sample_site,
# located next to this file.
class SampleServer

  # This is the root of our app
  @root = File.expand_path(File.dirname(__FILE__)) + "/sample_site"

  # Returns a Proc usable as a Rack app.
  #
  # For a request whose path maps to a directory containing index.html, the
  # index is returned as text/html; every other request is delegated to
  # Rack::Directory rooted at the sample site.
  def self.app
    Proc.new { |env|
      # Extract the requested path from the request
      path = Rack::Utils.unescape(env['PATH_INFO'])
      # Normalise the path so that ".." segments cannot point outside the root
      index_file = File.expand_path(@root + "#{path}/index.html")

      # File.exist? replaces File.exists?, which was removed in Ruby 3.2
      if index_file.start_with?(@root + "/") && File.exist?(index_file)
        # Return the index; the body is wrapped in an Array because a Rack
        # body must respond to #each, which String does not on Ruby 1.9+
        [200, {'Content-Type' => 'text/html'}, [File.read(index_file)]]
      else
        # Pass the request to the directory app
        Rack::Directory.new(@root).call(env)
      end
    }
  end
end
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
3
|
+
|
|
4
|
+
<head>
|
|
5
|
+
|
|
6
|
+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
|
7
|
+
<title>CleanDream</title>
|
|
8
|
+
|
|
9
|
+
<style type="text/css">
|
|
10
|
+
@import url("css/style.css");
|
|
11
|
+
@import url('css/style_text.css');
|
|
12
|
+
@import url('css/form-buttons.css');
|
|
13
|
+
@import url('css/link-buttons.css');
|
|
14
|
+
@import url('css/menu.css');
|
|
15
|
+
@import url('css/statics.css');
|
|
16
|
+
@import url('css/messages.css');
|
|
17
|
+
@import url('css/datatable.css');
|
|
18
|
+
@import url('css/accordion.css');
|
|
19
|
+
@import url('css/tabs.css');
|
|
20
|
+
@import url('css/forms.css');
|
|
21
|
+
@import url('css/datepicker.css');
|
|
22
|
+
@import url('css/modalbox.css');
|
|
23
|
+
@import url('css/jquery.fancybox-1.3.4.css');
|
|
24
|
+
@import url('css/jquery.treeview.css');
|
|
25
|
+
@import url('css/wysiwyg.css');
|
|
26
|
+
@import url('css/wysiwyg.modal.css');
|
|
27
|
+
@import url('css/wysiwyg-editor.css');
|
|
28
|
+
</style>
|
|
29
|
+
|
|
30
|
+
<script type="text/javascript" src="js/jquery-1.7.1.min.js"></script>
|
|
31
|
+
|
|
32
|
+
<!--[if lte IE 8]>
|
|
33
|
+
<script type="text/javascript" src="js/excanvas.min.js"></script>
|
|
34
|
+
<![endif]-->
|
|
35
|
+
|
|
36
|
+
</head>
|
|
37
|
+
|
|
38
|
+
<body>
|
|
39
|
+
|
|
40
|
+
<div class="container">
|
|
41
|
+
|
|
42
|
+
<div class="logo-labels">
|
|
43
|
+
<h1><a href="#">CleanDream</a></h1>
|
|
44
|
+
<ul>
|
|
45
|
+
<li><a href="#"><span>Settings</span></a></li>
|
|
46
|
+
<li class="usermessage"><a href="#"><span>1 new message</span></a></li>
|
|
47
|
+
<li class="logout"><a href="#"><span>Logout</span></a></li>
|
|
48
|
+
</ul>
|
|
49
|
+
</div>
|
|
50
|
+
|
|
51
|
+
<div class="menu-search">
|
|
52
|
+
<ul>
|
|
53
|
+
<li><a href="dashboard.html">Dashboard</a></li>
|
|
54
|
+
<li>
|
|
55
|
+
<a href="#">Dropdown</a>
|
|
56
|
+
<ul>
|
|
57
|
+
<li><a href="#">Submenu item 001</a></li>
|
|
58
|
+
<li><a href="#">Submenu item 002</a></li>
|
|
59
|
+
<li><a href="#">Submenu item 003</a></li>
|
|
60
|
+
<li class="current">
|
|
61
|
+
<a href="#">Submenu item 004</a>
|
|
62
|
+
<ul>
|
|
63
|
+
<li><a href="#">Subsubmenu item 001</a></li>
|
|
64
|
+
<li><a href="#">Subsubmenu item 002</a></li>
|
|
65
|
+
<li class="current"><a href="#">Subsubmenu item 003</a></li>
|
|
66
|
+
<li><a href="#">Subsubmenu item 003</a></li>
|
|
67
|
+
<li><a href="#">Subsubmenu item 005</a></li>
|
|
68
|
+
</ul>
|
|
69
|
+
</li>
|
|
70
|
+
<li> <a href="#">Submenu item 005</a> </li>
|
|
71
|
+
</ul>
|
|
72
|
+
</li>
|
|
73
|
+
<li><a href="typography.html">Typography</a></li>
|
|
74
|
+
<li class="current"><a href="boxgrid.html">Boxes Grid</a></li>
|
|
75
|
+
<li><a href="forms.html">Forms</a></li>
|
|
76
|
+
<li><a href="gallery.html">Gallery</a></li>
|
|
77
|
+
<li><a href="tables.html">Tables</a></li>
|
|
78
|
+
<li><a href="more.html">More</a></li>
|
|
79
|
+
</ul>
|
|
80
|
+
<div class="search">
|
|
81
|
+
<form action="" method="post">
|
|
82
|
+
<input type="text" value="Seachterm" />
|
|
83
|
+
<button type="submit"></button>
|
|
84
|
+
</form>
|
|
85
|
+
</div>
|
|
86
|
+
</div>
|
|
87
|
+
|
|
88
|
+
<div class="breadcrumbs">
|
|
89
|
+
<ul>
|
|
90
|
+
<li class="home"><a href="#"></a></li>
|
|
91
|
+
<li class="break">»</li>
|
|
92
|
+
<li><a href="#">Menu item</a></li>
|
|
93
|
+
<li class="break">»</li>
|
|
94
|
+
<li><a href="#">Menu item</a></li>
|
|
95
|
+
</ul>
|
|
96
|
+
</div>
|
|
97
|
+
|
|
98
|
+
<div class="section">
|
|
99
|
+
<div class="full">
|
|
100
|
+
<div class="box">
|
|
101
|
+
<div class="title">
|
|
102
|
+
<h2>Full box</h2>
|
|
103
|
+
<span class="hide"></span>
|
|
104
|
+
</div>
|
|
105
|
+
<div class="content">Place your content here.</div>
|
|
106
|
+
</div>
|
|
107
|
+
</div>
|
|
108
|
+
</div>
|
|
109
|
+
|
|
110
|
+
<div class="section">
|
|
111
|
+
<div class="small">
|
|
112
|
+
<div class="box">
|
|
113
|
+
<div class="title">
|
|
114
|
+
<h2>Small box</h2>
|
|
115
|
+
<span class="hide"></span>
|
|
116
|
+
</div>
|
|
117
|
+
<div class="content">Place your content here.</div>
|
|
118
|
+
</div>
|
|
119
|
+
</div>
|
|
120
|
+
|
|
121
|
+
<div class="big">
|
|
122
|
+
<div class="box">
|
|
123
|
+
<div class="title">
|
|
124
|
+
<h2>Big box</h2>
|
|
125
|
+
<span class="hide"></span>
|
|
126
|
+
</div>
|
|
127
|
+
<div class="content">Place your content here.</div>
|
|
128
|
+
</div>
|
|
129
|
+
</div>
|
|
130
|
+
</div>
|
|
131
|
+
|
|
132
|
+
<div class="section">
|
|
133
|
+
<div class="medium">
|
|
134
|
+
<div class="box">
|
|
135
|
+
<div class="title">
|
|
136
|
+
<h2>Medium box</h2>
|
|
137
|
+
<span class="hide"></span>
|
|
138
|
+
</div>
|
|
139
|
+
<div class="content">Place your content here.</div>
|
|
140
|
+
</div>
|
|
141
|
+
</div>
|
|
142
|
+
|
|
143
|
+
<div class="medium">
|
|
144
|
+
<div class="box">
|
|
145
|
+
<div class="title">
|
|
146
|
+
<h2>Medium box</h2>
|
|
147
|
+
<span class="hide"></span>
|
|
148
|
+
</div>
|
|
149
|
+
<div class="content">Place your content here.</div>
|
|
150
|
+
</div>
|
|
151
|
+
</div>
|
|
152
|
+
</div>
|
|
153
|
+
|
|
154
|
+
<div class="section">
|
|
155
|
+
<div class="medium">
|
|
156
|
+
<div class="box">
|
|
157
|
+
<div class="title">
|
|
158
|
+
<h2>Medium box</h2>
|
|
159
|
+
<span class="hide"></span>
|
|
160
|
+
</div>
|
|
161
|
+
<div class="content">Place your content here.</div>
|
|
162
|
+
</div>
|
|
163
|
+
</div>
|
|
164
|
+
|
|
165
|
+
<div class="small">
|
|
166
|
+
<div class="box">
|
|
167
|
+
<div class="title">
|
|
168
|
+
<h2>Small box</h2>
|
|
169
|
+
<span class="hide"></span>
|
|
170
|
+
</div>
|
|
171
|
+
<div class="content">Place your content here.</div>
|
|
172
|
+
</div>
|
|
173
|
+
</div>
|
|
174
|
+
|
|
175
|
+
<div class="small">
|
|
176
|
+
<div class="box">
|
|
177
|
+
<div class="title">
|
|
178
|
+
<h2>Small box</h2>
|
|
179
|
+
<span class="hide"></span>
|
|
180
|
+
</div>
|
|
181
|
+
<div class="content">Place your content here.</div>
|
|
182
|
+
</div>
|
|
183
|
+
</div>
|
|
184
|
+
</div>
|
|
185
|
+
|
|
186
|
+
<div class="section">
|
|
187
|
+
<div class="small">
|
|
188
|
+
<div class="box">
|
|
189
|
+
<div class="title">
|
|
190
|
+
<h2>Small box</h2>
|
|
191
|
+
<span class="hide"></span>
|
|
192
|
+
</div>
|
|
193
|
+
<div class="content">Place your content here.</div>
|
|
194
|
+
</div>
|
|
195
|
+
</div>
|
|
196
|
+
|
|
197
|
+
<div class="small">
|
|
198
|
+
<div class="box">
|
|
199
|
+
<div class="title">
|
|
200
|
+
<h2>Small box</h2>
|
|
201
|
+
<span class="hide"></span>
|
|
202
|
+
</div>
|
|
203
|
+
<div class="content">Place your content here.</div>
|
|
204
|
+
</div>
|
|
205
|
+
</div>
|
|
206
|
+
|
|
207
|
+
<div class="small">
|
|
208
|
+
<div class="box">
|
|
209
|
+
<div class="title">
|
|
210
|
+
<h2>Small box</h2>
|
|
211
|
+
<span class="hide"></span>
|
|
212
|
+
</div>
|
|
213
|
+
<div class="content">Place your content here.</div>
|
|
214
|
+
</div>
|
|
215
|
+
</div>
|
|
216
|
+
|
|
217
|
+
<div class="small">
|
|
218
|
+
<div class="box">
|
|
219
|
+
<div class="title">
|
|
220
|
+
<h2>Small box</h2>
|
|
221
|
+
<span class="hide"></span>
|
|
222
|
+
</div>
|
|
223
|
+
<div class="content">Place your content here.</div>
|
|
224
|
+
</div>
|
|
225
|
+
</div>
|
|
226
|
+
</div>
|
|
227
|
+
|
|
228
|
+
<div class="footer">
|
|
229
|
+
<div class="split">© Copyright <a href="#">website.com</a></div>
|
|
230
|
+
<div class="split right">Powered by <a href="http://themeforest.net/item/cleandream/490140" target="_blank">CleanDream</a></div>
|
|
231
|
+
</div>
|
|
232
|
+
</div>
|
|
233
|
+
|
|
234
|
+
<script type="text/javascript" src="js/superfish.js"></script>
|
|
235
|
+
<script type="text/javascript" src="js/supersubs.js"></script>
|
|
236
|
+
<script type="text/javascript" src="js/hoverIntent.js"></script>
|
|
237
|
+
<script type="text/javascript" src="js/jquery.flot.js"></script>
|
|
238
|
+
<script type="text/javascript" src="js/jquery.graphtable-0.2.js"></script>
|
|
239
|
+
<script type="text/javascript" src="js/jquery.flot.resize.min.js"></script>
|
|
240
|
+
<script type="text/javascript" src="js/jquery-ui.js"></script>
|
|
241
|
+
<script type="text/javascript" src="js/jquery-ui-select.js"></script>
|
|
242
|
+
<script type="text/javascript" src="js/customInput.jquery.js"></script>
|
|
243
|
+
<script type="text/javascript" src="js/jquery.dataTables.js"></script>
|
|
244
|
+
<script type="text/javascript" src="js/jquery.fancybox-1.3.4.js"></script>
|
|
245
|
+
<script type="text/javascript" src="js/jquery.filestyle.mini.js"></script>
|
|
246
|
+
<script type="text/javascript" src="js/jquery-ui-timepicker-addon.js"></script>
|
|
247
|
+
<script type="text/javascript" src="js/jquery.treeview.js"></script>
|
|
248
|
+
<script type="text/javascript" src="js/jquery.tipsy.js"></script>
|
|
249
|
+
<script type="text/javascript" src="js/jquery.wysiwyg.js"></script>
|
|
250
|
+
<script type="text/javascript" src="js/plugins/wysiwyg.rmFormat.js"></script>
|
|
251
|
+
<script type="text/javascript" src="js/controls/wysiwyg.image.js"></script>
|
|
252
|
+
<script type="text/javascript" src="js/controls/wysiwyg.link.js"></script>
|
|
253
|
+
<script type="text/javascript" src="js/controls/wysiwyg.table.js"></script>
|
|
254
|
+
<script type="text/javascript" src="js/inline.js"></script>
|
|
255
|
+
|
|
256
|
+
</body>
|
|
257
|
+
|
|
258
|
+
</html>
|