cobweb 0.0.38 → 0.0.39
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +1 -1
- data/lib/cobweb.rb +3 -1
- data/lib/cobweb_crawler.rb +98 -100
- data/lib/crawl_job.rb +3 -4
- data/lib/server.rb +98 -0
- data/lib/stats.rb +141 -64
- data/public/css/accordion.css +45 -0
- data/public/css/custom.css +13 -0
- data/public/css/datatable.css +189 -0
- data/public/css/datepicker.css +171 -0
- data/public/css/form-buttons.css +180 -0
- data/public/css/forms.css +489 -0
- data/public/css/jquery.fancybox-1.3.4.css +282 -0
- data/public/css/jquery.treeview.css +103 -0
- data/public/css/link-buttons.css +187 -0
- data/public/css/login.css +110 -0
- data/public/css/menu.css +156 -0
- data/public/css/messages.css +58 -0
- data/public/css/modalbox.css +63 -0
- data/public/css/statics.css +57 -0
- data/public/css/style.css +497 -0
- data/public/css/style_text.css +128 -0
- data/public/css/tabs.css +58 -0
- data/public/css/wysiwyg-editor.css +18 -0
- data/public/css/wysiwyg.css +149 -0
- data/public/css/wysiwyg.modal.css +69 -0
- data/public/gfx/back-menu.gif +0 -0
- data/public/gfx/back-submenu.gif +0 -0
- data/public/gfx/background.gif +0 -0
- data/public/gfx/box-hide.png +0 -0
- data/public/gfx/box-search.png +0 -0
- data/public/gfx/box-title.gif +0 -0
- data/public/gfx/code.gif +0 -0
- data/public/gfx/datepicker-arrows.gif +0 -0
- data/public/gfx/fancybox/blank.gif +0 -0
- data/public/gfx/fancybox/fancy_close.png +0 -0
- data/public/gfx/fancybox/fancy_loading.png +0 -0
- data/public/gfx/fancybox/fancy_nav_left.png +0 -0
- data/public/gfx/fancybox/fancy_nav_right.png +0 -0
- data/public/gfx/fancybox/fancy_title_left.png +0 -0
- data/public/gfx/fancybox/fancy_title_main.png +0 -0
- data/public/gfx/fancybox/fancy_title_over.png +0 -0
- data/public/gfx/fancybox/fancy_title_right.png +0 -0
- data/public/gfx/fancybox/fancybox-x.png +0 -0
- data/public/gfx/fancybox/fancybox.png +0 -0
- data/public/gfx/forms/date-next.gif +0 -0
- data/public/gfx/forms/date-prev.gif +0 -0
- data/public/gfx/forms/forms-checkbox.gif +0 -0
- data/public/gfx/forms/forms-date.gif +0 -0
- data/public/gfx/forms/forms-file.gif +0 -0
- data/public/gfx/forms/forms-input-big.gif +0 -0
- data/public/gfx/forms/forms-input-medium.gif +0 -0
- data/public/gfx/forms/forms-input-small.gif +0 -0
- data/public/gfx/forms/forms-input-xl.gif +0 -0
- data/public/gfx/forms/forms-radio.gif +0 -0
- data/public/gfx/forms/forms-selectbox-small.gif +0 -0
- data/public/gfx/forms/forms-selectbox.gif +0 -0
- data/public/gfx/forms/forms-textarea-big.gif +0 -0
- data/public/gfx/forms/forms-textarea-medium.gif +0 -0
- data/public/gfx/forms/forms-textarea-small.gif +0 -0
- data/public/gfx/forms/forms-textarea-xl.gif +0 -0
- data/public/gfx/icon-delete.png +0 -0
- data/public/gfx/icon-edit.png +0 -0
- data/public/gfx/icon-home.gif +0 -0
- data/public/gfx/img-delete.png +0 -0
- data/public/gfx/img-hover.png +0 -0
- data/public/gfx/img-zoom.png +0 -0
- data/public/gfx/jquery.wysiwyg.gif +0 -0
- data/public/gfx/label-icons.gif +0 -0
- data/public/gfx/label.gif +0 -0
- data/public/gfx/li-down.gif +0 -0
- data/public/gfx/li.gif +0 -0
- data/public/gfx/link-button-big.gif +0 -0
- data/public/gfx/link-button-medium.gif +0 -0
- data/public/gfx/link-button.gif +0 -0
- data/public/gfx/loading-2.gif +0 -0
- data/public/gfx/loading.gif +0 -0
- data/public/gfx/logo.png +0 -0
- data/public/gfx/modal-title.gif +0 -0
- data/public/gfx/photos/00.jpg +0 -0
- data/public/gfx/photos/01.jpg +0 -0
- data/public/gfx/photos/01xl.jpg +0 -0
- data/public/gfx/photos/02.jpg +0 -0
- data/public/gfx/photos/02xl.jpg +0 -0
- data/public/gfx/photos/03.jpg +0 -0
- data/public/gfx/photos/03xl.jpg +0 -0
- data/public/gfx/photos/04.jpg +0 -0
- data/public/gfx/photos/04xl.jpg +0 -0
- data/public/gfx/photos/05.jpg +0 -0
- data/public/gfx/photos/05xl.jpg +0 -0
- data/public/gfx/photos/06.jpg +0 -0
- data/public/gfx/photos/06xl.jpg +0 -0
- data/public/gfx/photos/07.jpg +0 -0
- data/public/gfx/photos/07xl.jpg +0 -0
- data/public/gfx/photos/08.jpg +0 -0
- data/public/gfx/photos/08xl.jpg +0 -0
- data/public/gfx/photos/09.jpg +0 -0
- data/public/gfx/photos/09xl.jpg +0 -0
- data/public/gfx/photos/10.jpg +0 -0
- data/public/gfx/photos/10xl.jpg +0 -0
- data/public/gfx/photos/11.jpg +0 -0
- data/public/gfx/photos/11xl.jpg +0 -0
- data/public/gfx/photos/12.jpg +0 -0
- data/public/gfx/photos/12xl.jpg +0 -0
- data/public/gfx/photos/13.jpg +0 -0
- data/public/gfx/photos/13xl.jpg +0 -0
- data/public/gfx/photos/14.jpg +0 -0
- data/public/gfx/photos/14xl.jpg +0 -0
- data/public/gfx/photos/15.jpg +0 -0
- data/public/gfx/photos/15xl.jpg +0 -0
- data/public/gfx/search-button.gif +0 -0
- data/public/gfx/search-input.gif +0 -0
- data/public/gfx/slider-button.gif +0 -0
- data/public/gfx/system-messages.gif +0 -0
- data/public/gfx/table-asc-arrow.gif +0 -0
- data/public/gfx/table-desc-arrow.gif +0 -0
- data/public/gfx/table-first.gif +0 -0
- data/public/gfx/table-last.gif +0 -0
- data/public/gfx/table-next.gif +0 -0
- data/public/gfx/table-number.gif +0 -0
- data/public/gfx/table-prev.gif +0 -0
- data/public/gfx/table-rows.gif +0 -0
- data/public/gfx/table-search.gif +0 -0
- data/public/gfx/table-thead.gif +0 -0
- data/public/gfx/tooltip.gif +0 -0
- data/public/gfx/treeview/ajax-loader.gif +0 -0
- data/public/gfx/treeview/file.gif +0 -0
- data/public/gfx/treeview/folder-closed.gif +0 -0
- data/public/gfx/treeview/folder.gif +0 -0
- data/public/gfx/treeview/minus.gif +0 -0
- data/public/gfx/treeview/plus.gif +0 -0
- data/public/gfx/treeview/treeview-default-line.gif +0 -0
- data/public/gfx/treeview/treeview-default.gif +0 -0
- data/public/js/controls/wysiwyg.image.js +284 -0
- data/public/js/controls/wysiwyg.link.js +210 -0
- data/public/js/controls/wysiwyg.table.js +151 -0
- data/public/js/customInput.jquery.js +68 -0
- data/public/js/excanvas.min.js +1 -0
- data/public/js/hoverIntent.js +84 -0
- data/public/js/inline.js +392 -0
- data/public/js/jquery-1.7.1.min.js +4 -0
- data/public/js/jquery-ui-select.js +522 -0
- data/public/js/jquery-ui-timepicker-addon.js +1299 -0
- data/public/js/jquery-ui.js +791 -0
- data/public/js/jquery.dataTables.js +7440 -0
- data/public/js/jquery.fancybox-1.3.4.js +1156 -0
- data/public/js/jquery.filestyle.mini.js +2 -0
- data/public/js/jquery.flot.js +2600 -0
- data/public/js/jquery.flot.resize.min.js +60 -0
- data/public/js/jquery.graphtable-0.2.js +179 -0
- data/public/js/jquery.tipsy.js +104 -0
- data/public/js/jquery.treeview.js +256 -0
- data/public/js/jquery.wysiwyg.js +2454 -0
- data/public/js/plugins/wysiwyg.rmFormat.js +348 -0
- data/public/js/superfish.js +121 -0
- data/public/js/supersubs.js +90 -0
- data/views/home.haml +54 -0
- data/views/layout.haml +89 -0
- data/views/statistics.haml +251 -71
- metadata +175 -22
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -20,7 +20,7 @@ class Cobweb
|
|
20
20
|
# investigate using event machine for single threaded crawling
|
21
21
|
|
22
22
|
def self.version
|
23
|
-
"0.0.38"
|
23
|
+
"0.0.39"
|
24
24
|
end
|
25
25
|
|
26
26
|
def method_missing(method_sym, *arguments, &block)
|
@@ -67,6 +67,8 @@ class Cobweb
|
|
67
67
|
@redis.set("crawl-counter", 0)
|
68
68
|
@redis.set("queue-counter", 1)
|
69
69
|
|
70
|
+
@stats = Stats.new(request)
|
71
|
+
@stats.start_crawl(request)
|
70
72
|
|
71
73
|
# add internal_urls into redis
|
72
74
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -11,9 +11,18 @@ class CobwebCrawler
|
|
11
11
|
@statistic = {}
|
12
12
|
|
13
13
|
@options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
|
14
|
-
crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
|
14
|
+
@crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
|
15
|
+
@options[:crawl_id] = @crawl_id
|
15
16
|
|
16
|
-
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{crawl_id}")
|
17
|
+
@redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{@crawl_id}")
|
18
|
+
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
19
|
+
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
20
|
+
@debug = @options[:debug]
|
21
|
+
|
22
|
+
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
23
|
+
if @options[:web_statistics]
|
24
|
+
Server.start
|
25
|
+
end
|
17
26
|
|
18
27
|
@cobweb = Cobweb.new(@options)
|
19
28
|
end
|
@@ -21,124 +30,113 @@ class CobwebCrawler
|
|
21
30
|
def crawl(base_url, crawl_options = {}, &block)
|
22
31
|
@options[:base_url] = base_url unless @options.has_key? :base_url
|
23
32
|
|
33
|
+
@options[:internal_urls] << base_url if @options[:internal_urls].empty?
|
34
|
+
@redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
|
35
|
+
|
24
36
|
@crawl_options = crawl_options
|
25
37
|
|
38
|
+
puts "http://localhost:4567/statistics/#{@crawl_id}"
|
39
|
+
puts ""
|
40
|
+
|
26
41
|
@redis.sadd "queued", base_url
|
27
42
|
crawl_counter = @redis.scard("crawled").to_i
|
28
43
|
queue_counter = @redis.scard("queued").to_i
|
29
44
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
crawl_counter = @redis.scard("crawled").to_i
|
35
|
-
queue_counter = @redis.scard("queued").to_i
|
45
|
+
begin
|
46
|
+
@stats.start_crawl(@options)
|
47
|
+
while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
48
|
+
thread = Thread.new do
|
36
49
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
@statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
|
48
|
-
end
|
49
|
-
|
50
|
-
@statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
|
51
|
-
@statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
|
52
|
-
|
53
|
-
if @statistic[:average_length]
|
54
|
-
@statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
|
55
|
-
else
|
56
|
-
@statistic[:average_length] = content[:length].to_i
|
57
|
-
end
|
58
|
-
|
59
|
-
@statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
|
60
|
-
@statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
|
61
|
-
@statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
|
50
|
+
url = @redis.spop "queued"
|
51
|
+
crawl_counter = @redis.scard("crawled").to_i
|
52
|
+
queue_counter = @redis.scard("queued").to_i
|
53
|
+
|
54
|
+
@options[:url] = url
|
55
|
+
unless @redis.sismember("crawled", url.to_s)
|
56
|
+
begin
|
57
|
+
@stats.update_status("Requesting #{url}...")
|
58
|
+
content = @cobweb.get(url)
|
59
|
+
@stats.update_status("Processing #{url}...")
|
62
60
|
|
63
|
-
|
64
|
-
@
|
65
|
-
@statistic[:page_size] = @statistic[:page_size].to_i + content[:length].to_i
|
66
|
-
else
|
67
|
-
@statistic[:asset_count] = @statistic[:asset_count].to_i + 1
|
68
|
-
@statistic[:asset_size] = @statistic[:asset_size].to_i + content[:length].to_i
|
69
|
-
end
|
70
|
-
|
71
|
-
@statistic[:total_redirects] = 0 if @statistic[:total_redirects].nil?
|
72
|
-
@statistic[:total_redirects] += content[:redirect_through].count unless content[:redirect_through].nil?
|
61
|
+
@redis.sadd "crawled", url.to_s
|
62
|
+
@redis.incr "crawl-counter"
|
73
63
|
|
74
|
-
|
75
|
-
|
64
|
+
internal_links = all_links_from_content(content).map{|link| link.to_s}
|
65
|
+
|
66
|
+
# reject the link if we've crawled it or queued it
|
67
|
+
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
68
|
+
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
76
69
|
|
77
|
-
mime_counts = {}
|
78
|
-
if @statistic.has_key? :mime_counts
|
79
|
-
mime_counts = @statistic[:mime_counts]
|
80
|
-
if mime_counts.has_key? content[:mime_type]
|
81
|
-
mime_counts[content[:mime_type]] += 1
|
82
|
-
else
|
83
|
-
mime_counts[content[:mime_type]] = 1
|
84
|
-
end
|
85
|
-
else
|
86
|
-
mime_counts = {content[:mime_type] => 1}
|
87
|
-
end
|
88
|
-
@statistic[:mime_counts] = mime_counts
|
89
70
|
|
90
|
-
|
91
|
-
|
92
|
-
if @statistic.has_key? :status_counts
|
93
|
-
status_counts = @statistic[:status_counts]
|
94
|
-
if status_counts.has_key? content[:status_code].to_i
|
95
|
-
status_counts[content[:status_code].to_i] += 1
|
96
|
-
else
|
97
|
-
status_counts[content[:status_code].to_i] = 1
|
98
|
-
end
|
99
|
-
else
|
100
|
-
status_counts = {content[:status_code].to_i => 1}
|
101
|
-
end
|
102
|
-
@statistic[:status_counts] = status_counts
|
71
|
+
# select the link if its internal
|
72
|
+
internal_links.select!{|link| internal_link?(link)}
|
103
73
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |content_link|
|
108
|
-
link = content_link.to_s
|
109
|
-
unless @redis.sismember("crawled", link)
|
110
|
-
puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
|
111
|
-
if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
|
112
|
-
puts "Matched as #{link} as internal" if @options[:debug]
|
113
|
-
unless @redis.sismember("crawled", link) || @redis.sismember("queued", link)
|
114
|
-
puts "Added #{link.to_s} to queue" if @options[:debug]
|
115
|
-
@redis.sadd "queued", link
|
116
|
-
crawl_counter = @redis.scard("crawled").to_i
|
117
|
-
queue_counter = @redis.scard("queued").to_i
|
118
|
-
end
|
119
|
-
end
|
74
|
+
internal_links.each do |link|
|
75
|
+
puts "Added #{link.to_s} to queue" if @debug
|
76
|
+
@redis.sadd "queued", link
|
120
77
|
end
|
121
|
-
end
|
122
78
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
79
|
+
crawl_counter = @redis.scard("crawled").to_i
|
80
|
+
queue_counter = @redis.scard("queued").to_i
|
81
|
+
|
82
|
+
@stats.update_statistics(content)
|
83
|
+
@stats.update_status("Completed #{url}.")
|
84
|
+
puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
|
128
85
|
|
129
|
-
|
86
|
+
yield content, @statistic if block_given?
|
130
87
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
88
|
+
rescue => e
|
89
|
+
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
90
|
+
ap e
|
91
|
+
ap e.backtrace
|
92
|
+
end
|
93
|
+
else
|
94
|
+
puts "Already crawled #{@options[:url]}" if @debug
|
135
95
|
end
|
136
|
-
else
|
137
|
-
puts "Already crawled #{@options[:url]}" if @options[:debug]
|
138
96
|
end
|
97
|
+
thread.join
|
139
98
|
end
|
140
|
-
|
99
|
+
ensure
|
100
|
+
@stats.end_crawl(@options)
|
141
101
|
end
|
142
102
|
@statistic
|
143
103
|
end
|
104
|
+
|
105
|
+
|
106
|
+
def internal_link?(link)
|
107
|
+
puts "Checking internal link for: #{link}" if @debug
|
108
|
+
valid_link = true
|
109
|
+
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
110
|
+
puts "Matching against #{pattern.source}" if @debug
|
111
|
+
if link.match(pattern)
|
112
|
+
puts "Matched as internal" if @debug
|
113
|
+
return true
|
114
|
+
end
|
115
|
+
end
|
116
|
+
puts "Didn't match any pattern so marked as not internal" if @debug
|
117
|
+
false
|
118
|
+
end
|
119
|
+
|
120
|
+
def internal_patterns
|
121
|
+
@internal_patterns ||= @redis.smembers("internal_urls")
|
122
|
+
end
|
123
|
+
|
124
|
+
def all_links_from_content(content)
|
125
|
+
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
126
|
+
links.reject!{|link| link.starts_with?("javascript:")}
|
127
|
+
links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
|
128
|
+
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
129
|
+
links.uniq
|
130
|
+
links
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
class String
|
135
|
+
def starts_with?(val)
|
136
|
+
if self.length >= val.length
|
137
|
+
self[0..val.length-1] == val
|
138
|
+
else
|
139
|
+
false
|
140
|
+
end
|
141
|
+
end
|
144
142
|
end
|
data/lib/crawl_job.rb
CHANGED
@@ -14,6 +14,7 @@ class CrawlJob
|
|
14
14
|
|
15
15
|
content_request[:redis_options] = {} unless content_request.has_key? :redis_options
|
16
16
|
@redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
|
17
|
+
@stats = Stats.new(content_request)
|
17
18
|
|
18
19
|
@debug = content_request[:debug]
|
19
20
|
|
@@ -32,7 +33,7 @@ class CrawlJob
|
|
32
33
|
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
33
34
|
|
34
35
|
## update statistics
|
35
|
-
|
36
|
+
@stats.update_statistics(content)
|
36
37
|
|
37
38
|
# set the base url if this is the first page
|
38
39
|
set_base_url @redis, content, content_request
|
@@ -81,9 +82,7 @@ class CrawlJob
|
|
81
82
|
|
82
83
|
def self.finished(content_request)
|
83
84
|
# finished
|
84
|
-
|
85
|
-
Stats.set_totals
|
86
|
-
|
85
|
+
@stats.end_crawl(content_request)
|
87
86
|
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
88
87
|
end
|
89
88
|
|
data/lib/server.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'sinatra'
|
2
|
+
require 'haml'
|
3
|
+
|
4
|
+
class Server < Sinatra::Base
|
5
|
+
|
6
|
+
set :views, settings.root + '/../views'
|
7
|
+
puts "#{settings.root}/../public"
|
8
|
+
set :public_folder, settings.root + '/../public'
|
9
|
+
enable :static
|
10
|
+
|
11
|
+
get '/' do
|
12
|
+
@full_redis = Redis.new
|
13
|
+
|
14
|
+
@colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
|
15
|
+
|
16
|
+
@crawls = []
|
17
|
+
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
18
|
+
redis = NamespacedRedis.new({}, "cobweb-#{crawl_id}")
|
19
|
+
stats = HashUtil.deep_symbolize_keys({
|
20
|
+
:crawl_details => redis.hgetall("crawl_details"),
|
21
|
+
:statistics => redis.hgetall("statistics"),
|
22
|
+
:minute_totals => redis.hgetall("minute_totals")
|
23
|
+
})
|
24
|
+
@crawls << stats
|
25
|
+
end
|
26
|
+
|
27
|
+
haml :home
|
28
|
+
end
|
29
|
+
|
30
|
+
get '/statistics/:crawl_id' do
|
31
|
+
redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
|
32
|
+
|
33
|
+
@statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
|
34
|
+
if @statistics[:status_counts].nil?
|
35
|
+
@statistics[:status_counts]
|
36
|
+
else
|
37
|
+
@statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
|
38
|
+
end
|
39
|
+
if @statistics[:mime_counts].nil?
|
40
|
+
@statistics[:mime_counts]
|
41
|
+
else
|
42
|
+
@statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
|
43
|
+
end
|
44
|
+
@crawl = {
|
45
|
+
:statistics => @statistics,
|
46
|
+
:crawl_details => HashUtil.deep_symbolize_keys(redis.hgetall("crawl_details")),
|
47
|
+
:minute_totals => HashUtil.deep_symbolize_keys(redis.hgetall("minute_totals")),
|
48
|
+
:status_200_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_200_count")),
|
49
|
+
:status_400_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_400_count")),
|
50
|
+
:status_500_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_500_count")),
|
51
|
+
:mime_text_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_text_count")),
|
52
|
+
:mime_image_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_image_count")),
|
53
|
+
:mime_application_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_application_count")),
|
54
|
+
:pages_count => HashUtil.deep_symbolize_keys(redis.hgetall("pages_count")),
|
55
|
+
:assets_count => HashUtil.deep_symbolize_keys(redis.hgetall("assets_count"))
|
56
|
+
}
|
57
|
+
ap @crawl
|
58
|
+
haml :statistics
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.start
|
62
|
+
unless Server.running?
|
63
|
+
thread = Thread.new do
|
64
|
+
puts "Starting Sinatra"
|
65
|
+
Server.run!
|
66
|
+
puts "Stopping crawl..."
|
67
|
+
## we need to manually kill the main thread as sinatra traps the interrupts
|
68
|
+
Thread.main.kill
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
class HashUtil
|
76
|
+
def self.deep_symbolize_keys(hash)
|
77
|
+
hash.keys.each do |key|
|
78
|
+
value = hash[key]
|
79
|
+
hash.delete(key)
|
80
|
+
hash[key.to_sym] = value
|
81
|
+
if hash[key.to_sym].instance_of? Hash
|
82
|
+
hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
|
83
|
+
end
|
84
|
+
end
|
85
|
+
hash
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
class Numeric
|
90
|
+
def to_human
|
91
|
+
units = %w{B KB MB GB TB}
|
92
|
+
ap self
|
93
|
+
e = 0
|
94
|
+
e = (Math.log(self)/Math.log(1024)).floor unless self==0
|
95
|
+
s = "%.3f" % (to_f / 1024**e)
|
96
|
+
s.sub(/\.?0*$/, units[e])
|
97
|
+
end
|
98
|
+
end
|
data/lib/stats.rb
CHANGED
@@ -1,63 +1,68 @@
|
|
1
|
-
|
2
|
-
require 'haml'
|
3
|
-
|
4
|
-
class Stats < Sinatra::Base
|
5
|
-
|
6
|
-
def self.update_statistics(statistics)
|
7
|
-
@@statistics = statistics
|
8
|
-
@@statistics
|
9
|
-
end
|
1
|
+
class Stats
|
10
2
|
|
11
|
-
def
|
12
|
-
|
3
|
+
def initialize(options)
|
4
|
+
@full_redis = Redis.new(options[:redis_options])
|
5
|
+
@redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
|
13
6
|
end
|
14
7
|
|
15
|
-
def
|
16
|
-
|
8
|
+
def start_crawl(options)
|
9
|
+
@full_redis.sadd "cobweb_crawls", options[:crawl_id]
|
10
|
+
options.keys.each do |key|
|
11
|
+
@redis.hset "crawl_details", key, options[key]
|
12
|
+
end
|
13
|
+
@redis.hset "statistics", "current_status", "Crawl Starting..."
|
17
14
|
end
|
18
15
|
|
19
|
-
def
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
stats[:crawl_counter] = @crawl_counter
|
24
|
-
stats[:queue_counter] = @queue_counter
|
25
|
-
stats[:crawled] = @redis.smembers "crawled"
|
26
|
-
|
27
|
-
Stats.update_statistics(stats)
|
16
|
+
def end_crawl(options)
|
17
|
+
@full_redis.srem "cobweb_crawls", options[:crawl_id]
|
18
|
+
@redis.hset "statistics", "current_status", "Crawl Stopped"
|
19
|
+
@redis.del "crawl_details"
|
28
20
|
end
|
29
21
|
|
30
|
-
def
|
31
|
-
|
32
|
-
@redis = redis
|
22
|
+
def update_statistics(content)
|
33
23
|
|
34
|
-
crawl_counter = redis.
|
35
|
-
queue_counter = redis.
|
24
|
+
crawl_counter = @redis.scard("crawled").to_i
|
25
|
+
queue_counter = @redis.scard("queued").to_i
|
36
26
|
|
37
|
-
if redis.hexists "statistics", "average_response_time"
|
38
|
-
redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
|
27
|
+
if @redis.hexists "statistics", "average_response_time"
|
28
|
+
@redis.hset("statistics", "average_response_time", (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
|
39
29
|
else
|
40
|
-
redis.hset("statistics", "average_response_time", content[:response_time].to_f)
|
30
|
+
@redis.hset("statistics", "average_response_time", content[:response_time].to_f)
|
41
31
|
end
|
42
|
-
redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
|
43
|
-
redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
|
44
|
-
if redis.hexists "statistics", "average_length"
|
45
|
-
redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
|
32
|
+
@redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if @redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > @redis.hget("statistics", "maximum_response_time").to_f
|
33
|
+
@redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if @redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < @redis.hget("statistics", "minimum_response_time").to_f
|
34
|
+
if @redis.hexists "statistics", "average_length"
|
35
|
+
@redis.hset("statistics", "average_length", (((@redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
|
46
36
|
else
|
47
|
-
redis.hset("statistics", "average_length", content[:length].to_i)
|
37
|
+
@redis.hset("statistics", "average_length", content[:length].to_i)
|
48
38
|
end
|
49
|
-
redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
|
50
|
-
redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
|
51
|
-
|
39
|
+
@redis.hset "statistics", "maximum_length", content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @redis.hget("statistics", "maximum_length").to_i
|
40
|
+
@redis.hset "statistics", "minimum_length", content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @redis.hget("statistics", "minimum_length").to_i
|
41
|
+
|
42
|
+
|
52
43
|
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
53
|
-
redis.
|
44
|
+
@redis.hset "statistics", "page_count", @redis.hget("statistics", "page_count").to_i + 1
|
45
|
+
@redis.hset "statistics", "page_size", @redis.hget("statistics", "page_size").to_i + content[:length].to_i
|
46
|
+
increment_time_stat("pages_count")
|
54
47
|
else
|
55
|
-
redis.
|
48
|
+
@redis.hset "statistics", "asset_count", @redis.hget("statistics", "asset_count").to_i + 1
|
49
|
+
@redis.hset "statistics", "asset_size", @redis.hget("statistics", "asset_size").to_i + content[:length].to_i
|
50
|
+
increment_time_stat("assets_count")
|
56
51
|
end
|
52
|
+
|
53
|
+
total_redirects = @redis.hget("statistics", "total_redirects").to_i
|
54
|
+
@redis.hset "statistics", "total_redirects", 0 if total_redirects.nil?
|
55
|
+
@redis.hset("statistics", "total_redirects", total_redirects += content[:redirect_through].count) unless content[:redirect_through].nil?
|
56
|
+
|
57
|
+
@redis.hset "statistics", "crawl_counter", crawl_counter
|
58
|
+
@redis.hset "statistics", "queue_counter", queue_counter
|
59
|
+
|
60
|
+
total_length = @redis.hget("statistics", "total_length").to_i
|
61
|
+
@redis.hset "statistics", "total_length", total_length + content[:length].to_i
|
57
62
|
|
58
63
|
mime_counts = {}
|
59
|
-
if redis.hexists "statistics", "mime_counts"
|
60
|
-
mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
|
64
|
+
if @redis.hexists "statistics", "mime_counts"
|
65
|
+
mime_counts = JSON.parse(@redis.hget("statistics", "mime_counts"))
|
61
66
|
if mime_counts.has_key? content[:mime_type]
|
62
67
|
mime_counts[content[:mime_type]] += 1
|
63
68
|
else
|
@@ -66,40 +71,112 @@ class Stats < Sinatra::Base
|
|
66
71
|
else
|
67
72
|
mime_counts = {content[:mime_type] => 1}
|
68
73
|
end
|
69
|
-
redis.hset "statistics", "mime_counts", mime_counts.to_json
|
74
|
+
@redis.hset "statistics", "mime_counts", mime_counts.to_json
|
70
75
|
|
76
|
+
# record mime categories stats
|
77
|
+
if content[:mime_type].starts_with? "text"
|
78
|
+
increment_time_stat("mime_text_count")
|
79
|
+
elsif content[:mime_type].starts_with? "application"
|
80
|
+
increment_time_stat("mime_application_count")
|
81
|
+
elsif content[:mime_type].starts_with? "audio"
|
82
|
+
increment_time_stat("mime_audio_count")
|
83
|
+
elsif content[:mime_type].starts_with? "image"
|
84
|
+
increment_time_stat("mime_image_count")
|
85
|
+
elsif content[:mime_type].starts_with? "message"
|
86
|
+
increment_time_stat("mime_message_count")
|
87
|
+
elsif content[:mime_type].starts_with? "model"
|
88
|
+
increment_time_stat("mime_model_count")
|
89
|
+
elsif content[:mime_type].starts_with? "multipart"
|
90
|
+
increment_time_stat("mime_multipart_count")
|
91
|
+
elsif content[:mime_type].starts_with? "video"
|
92
|
+
increment_time_stat("mime_video_count")
|
93
|
+
end
|
94
|
+
|
71
95
|
status_counts = {}
|
72
|
-
if redis.hexists "statistics", "status_counts"
|
73
|
-
status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
|
74
|
-
|
75
|
-
|
96
|
+
if @redis.hexists "statistics", "status_counts"
|
97
|
+
status_counts = HashUtil.deep_symbolize_keys(JSON.parse(@redis.hget("statistics", "status_counts")))
|
98
|
+
status_code = content[:status_code].to_i.to_s.to_sym
|
99
|
+
if status_counts.has_key? status_code
|
100
|
+
status_counts[status_code] += 1
|
76
101
|
else
|
77
|
-
status_counts[
|
78
|
-
end
|
102
|
+
status_counts[status_code] = 1
|
103
|
+
end
|
79
104
|
else
|
80
|
-
status_counts = {
|
105
|
+
status_counts = {status_code => 1}
|
106
|
+
end
|
107
|
+
|
108
|
+
# record statistics by status type
|
109
|
+
if content[:status_code] >= 200 && content[:status_code] < 300
|
110
|
+
increment_time_stat("status_200_count")
|
111
|
+
elsif content[:status_code] >= 400 && content[:status_code] < 500
|
112
|
+
increment_time_stat("status|_400_count")
|
113
|
+
elsif content[:status_code] >= 500 && content[:status_code] < 600
|
114
|
+
increment_time_stat("status|_500_count")
|
115
|
+
end
|
116
|
+
|
117
|
+
@redis.hset "statistics", "status_counts", status_counts.to_json
|
118
|
+
|
119
|
+
|
120
|
+
## time based statistics
|
121
|
+
increment_time_stat("minute_totals", "minute", 60)
|
122
|
+
|
123
|
+
get_statistics
|
124
|
+
end
|
125
|
+
|
126
|
+
def record_time_stat(stat_name, value, type="minute", duration=60)
|
127
|
+
key = DateTime.now.strftime("%Y-%m-%d %H:%M")
|
128
|
+
if type == "hour"
|
129
|
+
key = DateTime.now.strftime("%Y-%m-%d %H:00")
|
81
130
|
end
|
82
|
-
redis.
|
131
|
+
stat_value = @redis.hget(stat_name, key).to_i
|
132
|
+
stat_count = @redis.hget("#{stat_name}-count", key).to_i
|
83
133
|
|
84
|
-
|
134
|
+
if minute_count.nil?
|
135
|
+
@redis.hset stat_name, key, value
|
136
|
+
@redis.hset "#{stat_name}-count", key, 1
|
137
|
+
else
|
138
|
+
@redis.hset stat_name, key, ((stat_value*stat_count) + value) / (stat_count+1)
|
139
|
+
@redis.hset "#{stat_name}-count", key, stat_count+1
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def increment_time_stat(stat_name, type="minute", duration=60)
|
144
|
+
key = DateTime.now.strftime("%Y-%m-%d %H:%M")
|
145
|
+
if type == "hour"
|
146
|
+
key = DateTime.now.strftime("%Y-%m-%d %H:00")
|
147
|
+
end
|
148
|
+
minute_count = @redis.hget(stat_name, key).to_i
|
149
|
+
if minute_count.nil?
|
150
|
+
@redis.hset stat_name, key, 1
|
151
|
+
else
|
152
|
+
@redis.hset stat_name, key, minute_count + 1
|
153
|
+
end
|
154
|
+
#clear up older data
|
155
|
+
@redis.hgetall(stat_name).keys.each do |key|
|
156
|
+
if DateTime.parse(key) < DateTime.now-(duration/1440.0)
|
157
|
+
puts "Deleting #{stat_name} - #{key}"
|
158
|
+
@redis.hdel(stat_name, key)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def get_statistics
|
164
|
+
@redis.hgetall "statistics"
|
85
165
|
end
|
86
166
|
|
87
|
-
|
167
|
+
def update_status(status)
|
168
|
+
@redis.hset "statistics", "current_status", status
|
169
|
+
end
|
88
170
|
|
89
|
-
|
90
|
-
@statistics
|
91
|
-
@status = @@status
|
92
|
-
haml :statistics
|
171
|
+
def get_status
|
172
|
+
@redis.hget "statistics", "current_status"
|
93
173
|
end
|
94
174
|
|
95
|
-
def
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
## we need to manually kill the main thread as sinatra traps the interrupts
|
100
|
-
Thread.main.kill
|
101
|
-
end
|
175
|
+
def set_totals
|
176
|
+
stats = get_statistics
|
177
|
+
stats[:crawled] = @redis.smembers "crawled"
|
102
178
|
end
|
179
|
+
|
103
180
|
end
|
104
181
|
|
105
182
|
|