cobweb 0.0.38 → 0.0.39

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160)
  1. data/README.textile +1 -1
  2. data/lib/cobweb.rb +3 -1
  3. data/lib/cobweb_crawler.rb +98 -100
  4. data/lib/crawl_job.rb +3 -4
  5. data/lib/server.rb +98 -0
  6. data/lib/stats.rb +141 -64
  7. data/public/css/accordion.css +45 -0
  8. data/public/css/custom.css +13 -0
  9. data/public/css/datatable.css +189 -0
  10. data/public/css/datepicker.css +171 -0
  11. data/public/css/form-buttons.css +180 -0
  12. data/public/css/forms.css +489 -0
  13. data/public/css/jquery.fancybox-1.3.4.css +282 -0
  14. data/public/css/jquery.treeview.css +103 -0
  15. data/public/css/link-buttons.css +187 -0
  16. data/public/css/login.css +110 -0
  17. data/public/css/menu.css +156 -0
  18. data/public/css/messages.css +58 -0
  19. data/public/css/modalbox.css +63 -0
  20. data/public/css/statics.css +57 -0
  21. data/public/css/style.css +497 -0
  22. data/public/css/style_text.css +128 -0
  23. data/public/css/tabs.css +58 -0
  24. data/public/css/wysiwyg-editor.css +18 -0
  25. data/public/css/wysiwyg.css +149 -0
  26. data/public/css/wysiwyg.modal.css +69 -0
  27. data/public/gfx/back-menu.gif +0 -0
  28. data/public/gfx/back-submenu.gif +0 -0
  29. data/public/gfx/background.gif +0 -0
  30. data/public/gfx/box-hide.png +0 -0
  31. data/public/gfx/box-search.png +0 -0
  32. data/public/gfx/box-title.gif +0 -0
  33. data/public/gfx/code.gif +0 -0
  34. data/public/gfx/datepicker-arrows.gif +0 -0
  35. data/public/gfx/fancybox/blank.gif +0 -0
  36. data/public/gfx/fancybox/fancy_close.png +0 -0
  37. data/public/gfx/fancybox/fancy_loading.png +0 -0
  38. data/public/gfx/fancybox/fancy_nav_left.png +0 -0
  39. data/public/gfx/fancybox/fancy_nav_right.png +0 -0
  40. data/public/gfx/fancybox/fancy_title_left.png +0 -0
  41. data/public/gfx/fancybox/fancy_title_main.png +0 -0
  42. data/public/gfx/fancybox/fancy_title_over.png +0 -0
  43. data/public/gfx/fancybox/fancy_title_right.png +0 -0
  44. data/public/gfx/fancybox/fancybox-x.png +0 -0
  45. data/public/gfx/fancybox/fancybox.png +0 -0
  46. data/public/gfx/forms/date-next.gif +0 -0
  47. data/public/gfx/forms/date-prev.gif +0 -0
  48. data/public/gfx/forms/forms-checkbox.gif +0 -0
  49. data/public/gfx/forms/forms-date.gif +0 -0
  50. data/public/gfx/forms/forms-file.gif +0 -0
  51. data/public/gfx/forms/forms-input-big.gif +0 -0
  52. data/public/gfx/forms/forms-input-medium.gif +0 -0
  53. data/public/gfx/forms/forms-input-small.gif +0 -0
  54. data/public/gfx/forms/forms-input-xl.gif +0 -0
  55. data/public/gfx/forms/forms-radio.gif +0 -0
  56. data/public/gfx/forms/forms-selectbox-small.gif +0 -0
  57. data/public/gfx/forms/forms-selectbox.gif +0 -0
  58. data/public/gfx/forms/forms-textarea-big.gif +0 -0
  59. data/public/gfx/forms/forms-textarea-medium.gif +0 -0
  60. data/public/gfx/forms/forms-textarea-small.gif +0 -0
  61. data/public/gfx/forms/forms-textarea-xl.gif +0 -0
  62. data/public/gfx/icon-delete.png +0 -0
  63. data/public/gfx/icon-edit.png +0 -0
  64. data/public/gfx/icon-home.gif +0 -0
  65. data/public/gfx/img-delete.png +0 -0
  66. data/public/gfx/img-hover.png +0 -0
  67. data/public/gfx/img-zoom.png +0 -0
  68. data/public/gfx/jquery.wysiwyg.gif +0 -0
  69. data/public/gfx/label-icons.gif +0 -0
  70. data/public/gfx/label.gif +0 -0
  71. data/public/gfx/li-down.gif +0 -0
  72. data/public/gfx/li.gif +0 -0
  73. data/public/gfx/link-button-big.gif +0 -0
  74. data/public/gfx/link-button-medium.gif +0 -0
  75. data/public/gfx/link-button.gif +0 -0
  76. data/public/gfx/loading-2.gif +0 -0
  77. data/public/gfx/loading.gif +0 -0
  78. data/public/gfx/logo.png +0 -0
  79. data/public/gfx/modal-title.gif +0 -0
  80. data/public/gfx/photos/00.jpg +0 -0
  81. data/public/gfx/photos/01.jpg +0 -0
  82. data/public/gfx/photos/01xl.jpg +0 -0
  83. data/public/gfx/photos/02.jpg +0 -0
  84. data/public/gfx/photos/02xl.jpg +0 -0
  85. data/public/gfx/photos/03.jpg +0 -0
  86. data/public/gfx/photos/03xl.jpg +0 -0
  87. data/public/gfx/photos/04.jpg +0 -0
  88. data/public/gfx/photos/04xl.jpg +0 -0
  89. data/public/gfx/photos/05.jpg +0 -0
  90. data/public/gfx/photos/05xl.jpg +0 -0
  91. data/public/gfx/photos/06.jpg +0 -0
  92. data/public/gfx/photos/06xl.jpg +0 -0
  93. data/public/gfx/photos/07.jpg +0 -0
  94. data/public/gfx/photos/07xl.jpg +0 -0
  95. data/public/gfx/photos/08.jpg +0 -0
  96. data/public/gfx/photos/08xl.jpg +0 -0
  97. data/public/gfx/photos/09.jpg +0 -0
  98. data/public/gfx/photos/09xl.jpg +0 -0
  99. data/public/gfx/photos/10.jpg +0 -0
  100. data/public/gfx/photos/10xl.jpg +0 -0
  101. data/public/gfx/photos/11.jpg +0 -0
  102. data/public/gfx/photos/11xl.jpg +0 -0
  103. data/public/gfx/photos/12.jpg +0 -0
  104. data/public/gfx/photos/12xl.jpg +0 -0
  105. data/public/gfx/photos/13.jpg +0 -0
  106. data/public/gfx/photos/13xl.jpg +0 -0
  107. data/public/gfx/photos/14.jpg +0 -0
  108. data/public/gfx/photos/14xl.jpg +0 -0
  109. data/public/gfx/photos/15.jpg +0 -0
  110. data/public/gfx/photos/15xl.jpg +0 -0
  111. data/public/gfx/search-button.gif +0 -0
  112. data/public/gfx/search-input.gif +0 -0
  113. data/public/gfx/slider-button.gif +0 -0
  114. data/public/gfx/system-messages.gif +0 -0
  115. data/public/gfx/table-asc-arrow.gif +0 -0
  116. data/public/gfx/table-desc-arrow.gif +0 -0
  117. data/public/gfx/table-first.gif +0 -0
  118. data/public/gfx/table-last.gif +0 -0
  119. data/public/gfx/table-next.gif +0 -0
  120. data/public/gfx/table-number.gif +0 -0
  121. data/public/gfx/table-prev.gif +0 -0
  122. data/public/gfx/table-rows.gif +0 -0
  123. data/public/gfx/table-search.gif +0 -0
  124. data/public/gfx/table-thead.gif +0 -0
  125. data/public/gfx/tooltip.gif +0 -0
  126. data/public/gfx/treeview/ajax-loader.gif +0 -0
  127. data/public/gfx/treeview/file.gif +0 -0
  128. data/public/gfx/treeview/folder-closed.gif +0 -0
  129. data/public/gfx/treeview/folder.gif +0 -0
  130. data/public/gfx/treeview/minus.gif +0 -0
  131. data/public/gfx/treeview/plus.gif +0 -0
  132. data/public/gfx/treeview/treeview-default-line.gif +0 -0
  133. data/public/gfx/treeview/treeview-default.gif +0 -0
  134. data/public/js/controls/wysiwyg.image.js +284 -0
  135. data/public/js/controls/wysiwyg.link.js +210 -0
  136. data/public/js/controls/wysiwyg.table.js +151 -0
  137. data/public/js/customInput.jquery.js +68 -0
  138. data/public/js/excanvas.min.js +1 -0
  139. data/public/js/hoverIntent.js +84 -0
  140. data/public/js/inline.js +392 -0
  141. data/public/js/jquery-1.7.1.min.js +4 -0
  142. data/public/js/jquery-ui-select.js +522 -0
  143. data/public/js/jquery-ui-timepicker-addon.js +1299 -0
  144. data/public/js/jquery-ui.js +791 -0
  145. data/public/js/jquery.dataTables.js +7440 -0
  146. data/public/js/jquery.fancybox-1.3.4.js +1156 -0
  147. data/public/js/jquery.filestyle.mini.js +2 -0
  148. data/public/js/jquery.flot.js +2600 -0
  149. data/public/js/jquery.flot.resize.min.js +60 -0
  150. data/public/js/jquery.graphtable-0.2.js +179 -0
  151. data/public/js/jquery.tipsy.js +104 -0
  152. data/public/js/jquery.treeview.js +256 -0
  153. data/public/js/jquery.wysiwyg.js +2454 -0
  154. data/public/js/plugins/wysiwyg.rmFormat.js +348 -0
  155. data/public/js/superfish.js +121 -0
  156. data/public/js/supersubs.js +90 -0
  157. data/views/home.haml +54 -0
  158. data/views/layout.haml +89 -0
  159. data/views/statistics.haml +251 -71
  160. metadata +175 -22
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.38
2
+ h1. Cobweb v0.0.39
3
3
 
4
4
  h2. Intro
5
5
 
data/lib/cobweb.rb CHANGED
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.38"
23
+ "0.0.39"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -67,6 +67,8 @@ class Cobweb
67
67
  @redis.set("crawl-counter", 0)
68
68
  @redis.set("queue-counter", 1)
69
69
 
70
+ @stats = Stats.new(request)
71
+ @stats.start_crawl(request)
70
72
 
71
73
  # add internal_urls into redis
72
74
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
data/lib/cobweb_crawler.rb CHANGED
@@ -11,9 +11,18 @@ class CobwebCrawler
11
11
  @statistic = {}
12
12
 
13
13
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
14
- crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
14
+ @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
15
+ @options[:crawl_id] = @crawl_id
15
16
 
16
- @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{crawl_id}")
17
+ @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{@crawl_id}")
18
+ @options[:internal_urls] = [] if @options[:internal_urls].nil?
19
+ @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
20
+ @debug = @options[:debug]
21
+
22
+ @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
23
+ if @options[:web_statistics]
24
+ Server.start
25
+ end
17
26
 
18
27
  @cobweb = Cobweb.new(@options)
19
28
  end
@@ -21,124 +30,113 @@ class CobwebCrawler
21
30
  def crawl(base_url, crawl_options = {}, &block)
22
31
  @options[:base_url] = base_url unless @options.has_key? :base_url
23
32
 
33
+ @options[:internal_urls] << base_url if @options[:internal_urls].empty?
34
+ @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
35
+
24
36
  @crawl_options = crawl_options
25
37
 
38
+ puts "http://localhost:4567/statistics/#{@crawl_id}"
39
+ puts ""
40
+
26
41
  @redis.sadd "queued", base_url
27
42
  crawl_counter = @redis.scard("crawled").to_i
28
43
  queue_counter = @redis.scard("queued").to_i
29
44
 
30
- while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
31
- thread = Thread.new do
32
-
33
- url = @redis.spop "queued"
34
- crawl_counter = @redis.scard("crawled").to_i
35
- queue_counter = @redis.scard("queued").to_i
45
+ begin
46
+ @stats.start_crawl(@options)
47
+ while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
48
+ thread = Thread.new do
36
49
 
37
- @options[:url] = url
38
- unless @redis.sismember("crawled", url.to_s)
39
- begin
40
- Stats.update_status("Requesting #{url}...")
41
- content = @cobweb.get(url)
42
- Stats.update_status("Processing #{url}...")
43
-
44
- if @statistic[:average_response_time].nil?
45
- @statistic[:average_response_time] = content[:response_time].to_f
46
- else
47
- @statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
48
- end
49
-
50
- @statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
51
- @statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
52
-
53
- if @statistic[:average_length]
54
- @statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
55
- else
56
- @statistic[:average_length] = content[:length].to_i
57
- end
58
-
59
- @statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
60
- @statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
61
- @statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
50
+ url = @redis.spop "queued"
51
+ crawl_counter = @redis.scard("crawled").to_i
52
+ queue_counter = @redis.scard("queued").to_i
53
+
54
+ @options[:url] = url
55
+ unless @redis.sismember("crawled", url.to_s)
56
+ begin
57
+ @stats.update_status("Requesting #{url}...")
58
+ content = @cobweb.get(url)
59
+ @stats.update_status("Processing #{url}...")
62
60
 
63
- if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
64
- @statistic[:page_count] = @statistic[:page_count].to_i + 1
65
- @statistic[:page_size] = @statistic[:page_size].to_i + content[:length].to_i
66
- else
67
- @statistic[:asset_count] = @statistic[:asset_count].to_i + 1
68
- @statistic[:asset_size] = @statistic[:asset_size].to_i + content[:length].to_i
69
- end
70
-
71
- @statistic[:total_redirects] = 0 if @statistic[:total_redirects].nil?
72
- @statistic[:total_redirects] += content[:redirect_through].count unless content[:redirect_through].nil?
61
+ @redis.sadd "crawled", url.to_s
62
+ @redis.incr "crawl-counter"
73
63
 
74
- @statistic[:crawl_counter] = crawl_counter
75
- @statistic[:queue_counter] = queue_counter
64
+ internal_links = all_links_from_content(content).map{|link| link.to_s}
65
+
66
+ # reject the link if we've crawled it or queued it
67
+ internal_links.reject!{|link| @redis.sismember("crawled", link)}
68
+ internal_links.reject!{|link| @redis.sismember("queued", link)}
76
69
 
77
- mime_counts = {}
78
- if @statistic.has_key? :mime_counts
79
- mime_counts = @statistic[:mime_counts]
80
- if mime_counts.has_key? content[:mime_type]
81
- mime_counts[content[:mime_type]] += 1
82
- else
83
- mime_counts[content[:mime_type]] = 1
84
- end
85
- else
86
- mime_counts = {content[:mime_type] => 1}
87
- end
88
- @statistic[:mime_counts] = mime_counts
89
70
 
90
- status_counts = {}
91
-
92
- if @statistic.has_key? :status_counts
93
- status_counts = @statistic[:status_counts]
94
- if status_counts.has_key? content[:status_code].to_i
95
- status_counts[content[:status_code].to_i] += 1
96
- else
97
- status_counts[content[:status_code].to_i] = 1
98
- end
99
- else
100
- status_counts = {content[:status_code].to_i => 1}
101
- end
102
- @statistic[:status_counts] = status_counts
71
+ # select the link if its internal
72
+ internal_links.select!{|link| internal_link?(link)}
103
73
 
104
- @redis.sadd "crawled", url.to_s
105
- @redis.incr "crawl-counter"
106
-
107
- content[:links].keys.map{|key| content[:links][key]}.flatten.each do |content_link|
108
- link = content_link.to_s
109
- unless @redis.sismember("crawled", link)
110
- puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
111
- if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
112
- puts "Matched as #{link} as internal" if @options[:debug]
113
- unless @redis.sismember("crawled", link) || @redis.sismember("queued", link)
114
- puts "Added #{link.to_s} to queue" if @options[:debug]
115
- @redis.sadd "queued", link
116
- crawl_counter = @redis.scard("crawled").to_i
117
- queue_counter = @redis.scard("queued").to_i
118
- end
119
- end
74
+ internal_links.each do |link|
75
+ puts "Added #{link.to_s} to queue" if @debug
76
+ @redis.sadd "queued", link
120
77
  end
121
- end
122
78
 
123
- crawl_counter = @redis.scard("crawled").to_i
124
- queue_counter = @redis.scard("queued").to_i
125
- Stats.update_statistics(@statistic)
126
- Stats.update_status("Completed #{url}.")
127
- puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @options[:debug]
79
+ crawl_counter = @redis.scard("crawled").to_i
80
+ queue_counter = @redis.scard("queued").to_i
81
+
82
+ @stats.update_statistics(content)
83
+ @stats.update_status("Completed #{url}.")
84
+ puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
128
85
 
129
- yield content, @statistic if block_given?
86
+ yield content, @statistic if block_given?
130
87
 
131
- rescue => e
132
- puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
133
- ap e
134
- ap e.backtrace
88
+ rescue => e
89
+ puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
90
+ ap e
91
+ ap e.backtrace
92
+ end
93
+ else
94
+ puts "Already crawled #{@options[:url]}" if @debug
135
95
  end
136
- else
137
- puts "Already crawled #{@options[:url]}" if @options[:debug]
138
96
  end
97
+ thread.join
139
98
  end
140
- thread.join
99
+ ensure
100
+ @stats.end_crawl(@options)
141
101
  end
142
102
  @statistic
143
103
  end
104
+
105
+
106
+ def internal_link?(link)
107
+ puts "Checking internal link for: #{link}" if @debug
108
+ valid_link = true
109
+ internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
110
+ puts "Matching against #{pattern.source}" if @debug
111
+ if link.match(pattern)
112
+ puts "Matched as internal" if @debug
113
+ return true
114
+ end
115
+ end
116
+ puts "Didn't match any pattern so marked as not internal" if @debug
117
+ false
118
+ end
119
+
120
+ def internal_patterns
121
+ @internal_patterns ||= @redis.smembers("internal_urls")
122
+ end
123
+
124
+ def all_links_from_content(content)
125
+ links = content[:links].keys.map{|key| content[:links][key]}.flatten
126
+ links.reject!{|link| link.starts_with?("javascript:")}
127
+ links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
128
+ links.select!{|link| link.scheme == "http" || link.scheme == "https"}
129
+ links.uniq
130
+ links
131
+ end
132
+ end
133
+
134
+ class String
135
+ def starts_with?(val)
136
+ if self.length >= val.length
137
+ self[0..val.length-1] == val
138
+ else
139
+ false
140
+ end
141
+ end
144
142
  end
data/lib/crawl_job.rb CHANGED
@@ -14,6 +14,7 @@ class CrawlJob
14
14
 
15
15
  content_request[:redis_options] = {} unless content_request.has_key? :redis_options
16
16
  @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
17
+ @stats = Stats.new(content_request)
17
18
 
18
19
  @debug = content_request[:debug]
19
20
 
@@ -32,7 +33,7 @@ class CrawlJob
32
33
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
34
 
34
35
  ## update statistics
35
- Stats.set_statistics_in_redis(@redis, content)
36
+ @stats.update_statistics(content)
36
37
 
37
38
  # set the base url if this is the first page
38
39
  set_base_url @redis, content, content_request
@@ -81,9 +82,7 @@ class CrawlJob
81
82
 
82
83
  def self.finished(content_request)
83
84
  # finished
84
-
85
- Stats.set_totals
86
-
85
+ @stats.end_crawl(content_request)
87
86
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
88
87
  end
89
88
 
data/lib/server.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'sinatra'
2
+ require 'haml'
3
+
4
+ class Server < Sinatra::Base
5
+
6
+ set :views, settings.root + '/../views'
7
+ puts "#{settings.root}/../public"
8
+ set :public_folder, settings.root + '/../public'
9
+ enable :static
10
+
11
+ get '/' do
12
+ @full_redis = Redis.new
13
+
14
+ @colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
15
+
16
+ @crawls = []
17
+ @full_redis.smembers("cobweb_crawls").each do |crawl_id|
18
+ redis = NamespacedRedis.new({}, "cobweb-#{crawl_id}")
19
+ stats = HashUtil.deep_symbolize_keys({
20
+ :crawl_details => redis.hgetall("crawl_details"),
21
+ :statistics => redis.hgetall("statistics"),
22
+ :minute_totals => redis.hgetall("minute_totals")
23
+ })
24
+ @crawls << stats
25
+ end
26
+
27
+ haml :home
28
+ end
29
+
30
+ get '/statistics/:crawl_id' do
31
+ redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
32
+
33
+ @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
34
+ if @statistics[:status_counts].nil?
35
+ @statistics[:status_counts]
36
+ else
37
+ @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
38
+ end
39
+ if @statistics[:mime_counts].nil?
40
+ @statistics[:mime_counts]
41
+ else
42
+ @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
43
+ end
44
+ @crawl = {
45
+ :statistics => @statistics,
46
+ :crawl_details => HashUtil.deep_symbolize_keys(redis.hgetall("crawl_details")),
47
+ :minute_totals => HashUtil.deep_symbolize_keys(redis.hgetall("minute_totals")),
48
+ :status_200_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_200_count")),
49
+ :status_400_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_400_count")),
50
+ :status_500_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_500_count")),
51
+ :mime_text_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_text_count")),
52
+ :mime_image_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_image_count")),
53
+ :mime_application_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_application_count")),
54
+ :pages_count => HashUtil.deep_symbolize_keys(redis.hgetall("pages_count")),
55
+ :assets_count => HashUtil.deep_symbolize_keys(redis.hgetall("assets_count"))
56
+ }
57
+ ap @crawl
58
+ haml :statistics
59
+ end
60
+
61
+ def self.start
62
+ unless Server.running?
63
+ thread = Thread.new do
64
+ puts "Starting Sinatra"
65
+ Server.run!
66
+ puts "Stopping crawl..."
67
+ ## we need to manually kill the main thread as sinatra traps the interrupts
68
+ Thread.main.kill
69
+ end
70
+ end
71
+ end
72
+
73
+ end
74
+
75
+ class HashUtil
76
+ def self.deep_symbolize_keys(hash)
77
+ hash.keys.each do |key|
78
+ value = hash[key]
79
+ hash.delete(key)
80
+ hash[key.to_sym] = value
81
+ if hash[key.to_sym].instance_of? Hash
82
+ hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
83
+ end
84
+ end
85
+ hash
86
+ end
87
+ end
88
+
89
+ class Numeric
90
+ def to_human
91
+ units = %w{B KB MB GB TB}
92
+ ap self
93
+ e = 0
94
+ e = (Math.log(self)/Math.log(1024)).floor unless self==0
95
+ s = "%.3f" % (to_f / 1024**e)
96
+ s.sub(/\.?0*$/, units[e])
97
+ end
98
+ end
data/lib/stats.rb CHANGED
@@ -1,63 +1,68 @@
1
- require 'sinatra'
2
- require 'haml'
3
-
4
- class Stats < Sinatra::Base
5
-
6
- def self.update_statistics(statistics)
7
- @@statistics = statistics
8
- @@statistics
9
- end
1
+ class Stats
10
2
 
11
- def self.get_statistics
12
- @@statistics ||= {}
3
+ def initialize(options)
4
+ @full_redis = Redis.new(options[:redis_options])
5
+ @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
13
6
  end
14
7
 
15
- def self.update_status(status)
16
- @@status = status
8
+ def start_crawl(options)
9
+ @full_redis.sadd "cobweb_crawls", options[:crawl_id]
10
+ options.keys.each do |key|
11
+ @redis.hset "crawl_details", key, options[key]
12
+ end
13
+ @redis.hset "statistics", "current_status", "Crawl Starting..."
17
14
  end
18
15
 
19
- def self.set_totals
20
- stats = @redis.hgetall "statistics"
21
- stats[:total_pages] = @redis.get("total_pages").to_i
22
- stats[:total_assets] = @redis.get("total_assets").to_i
23
- stats[:crawl_counter] = @crawl_counter
24
- stats[:queue_counter] = @queue_counter
25
- stats[:crawled] = @redis.smembers "crawled"
26
-
27
- Stats.update_statistics(stats)
16
+ def end_crawl(options)
17
+ @full_redis.srem "cobweb_crawls", options[:crawl_id]
18
+ @redis.hset "statistics", "current_status", "Crawl Stopped"
19
+ @redis.del "crawl_details"
28
20
  end
29
21
 
30
- def self.set_statistics_in_redis(redis, content)
31
-
32
- @redis = redis
22
+ def update_statistics(content)
33
23
 
34
- crawl_counter = redis.get("crawl-counter").to_i
35
- queue_counter = redis.get("queue-counter").to_i
24
+ crawl_counter = @redis.scard("crawled").to_i
25
+ queue_counter = @redis.scard("queued").to_i
36
26
 
37
- if redis.hexists "statistics", "average_response_time"
38
- redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
27
+ if @redis.hexists "statistics", "average_response_time"
28
+ @redis.hset("statistics", "average_response_time", (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
39
29
  else
40
- redis.hset("statistics", "average_response_time", content[:response_time].to_f)
30
+ @redis.hset("statistics", "average_response_time", content[:response_time].to_f)
41
31
  end
42
- redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
43
- redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
44
- if redis.hexists "statistics", "average_length"
45
- redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
32
+ @redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if @redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > @redis.hget("statistics", "maximum_response_time").to_f
33
+ @redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if @redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < @redis.hget("statistics", "minimum_response_time").to_f
34
+ if @redis.hexists "statistics", "average_length"
35
+ @redis.hset("statistics", "average_length", (((@redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
46
36
  else
47
- redis.hset("statistics", "average_length", content[:length].to_i)
37
+ @redis.hset("statistics", "average_length", content[:length].to_i)
48
38
  end
49
- redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
50
- redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
51
-
39
+ @redis.hset "statistics", "maximum_length", content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @redis.hget("statistics", "maximum_length").to_i
40
+ @redis.hset "statistics", "minimum_length", content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @redis.hget("statistics", "minimum_length").to_i
41
+
42
+
52
43
  if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
53
- redis.incr "total_pages"
44
+ @redis.hset "statistics", "page_count", @redis.hget("statistics", "page_count").to_i + 1
45
+ @redis.hset "statistics", "page_size", @redis.hget("statistics", "page_size").to_i + content[:length].to_i
46
+ increment_time_stat("pages_count")
54
47
  else
55
- redis.incr "total_assets"
48
+ @redis.hset "statistics", "asset_count", @redis.hget("statistics", "asset_count").to_i + 1
49
+ @redis.hset "statistics", "asset_size", @redis.hget("statistics", "asset_size").to_i + content[:length].to_i
50
+ increment_time_stat("assets_count")
56
51
  end
52
+
53
+ total_redirects = @redis.hget("statistics", "total_redirects").to_i
54
+ @redis.hset "statistics", "total_redirects", 0 if total_redirects.nil?
55
+ @redis.hset("statistics", "total_redirects", total_redirects += content[:redirect_through].count) unless content[:redirect_through].nil?
56
+
57
+ @redis.hset "statistics", "crawl_counter", crawl_counter
58
+ @redis.hset "statistics", "queue_counter", queue_counter
59
+
60
+ total_length = @redis.hget("statistics", "total_length").to_i
61
+ @redis.hset "statistics", "total_length", total_length + content[:length].to_i
57
62
 
58
63
  mime_counts = {}
59
- if redis.hexists "statistics", "mime_counts"
60
- mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
64
+ if @redis.hexists "statistics", "mime_counts"
65
+ mime_counts = JSON.parse(@redis.hget("statistics", "mime_counts"))
61
66
  if mime_counts.has_key? content[:mime_type]
62
67
  mime_counts[content[:mime_type]] += 1
63
68
  else
@@ -66,40 +71,112 @@ class Stats < Sinatra::Base
66
71
  else
67
72
  mime_counts = {content[:mime_type] => 1}
68
73
  end
69
- redis.hset "statistics", "mime_counts", mime_counts.to_json
74
+ @redis.hset "statistics", "mime_counts", mime_counts.to_json
70
75
 
76
+ # record mime categories stats
77
+ if content[:mime_type].starts_with? "text"
78
+ increment_time_stat("mime_text_count")
79
+ elsif content[:mime_type].starts_with? "application"
80
+ increment_time_stat("mime_application_count")
81
+ elsif content[:mime_type].starts_with? "audio"
82
+ increment_time_stat("mime_audio_count")
83
+ elsif content[:mime_type].starts_with? "image"
84
+ increment_time_stat("mime_image_count")
85
+ elsif content[:mime_type].starts_with? "message"
86
+ increment_time_stat("mime_message_count")
87
+ elsif content[:mime_type].starts_with? "model"
88
+ increment_time_stat("mime_model_count")
89
+ elsif content[:mime_type].starts_with? "multipart"
90
+ increment_time_stat("mime_multipart_count")
91
+ elsif content[:mime_type].starts_with? "video"
92
+ increment_time_stat("mime_video_count")
93
+ end
94
+
71
95
  status_counts = {}
72
- if redis.hexists "statistics", "status_counts"
73
- status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
74
- if status_counts.has_key? content[:status_code].to_i
75
- status_counts[content[:status_code].to_i] += 1
96
+ if @redis.hexists "statistics", "status_counts"
97
+ status_counts = HashUtil.deep_symbolize_keys(JSON.parse(@redis.hget("statistics", "status_counts")))
98
+ status_code = content[:status_code].to_i.to_s.to_sym
99
+ if status_counts.has_key? status_code
100
+ status_counts[status_code] += 1
76
101
  else
77
- status_counts[content[:status_code].to_i] = 1
78
- end
102
+ status_counts[status_code] = 1
103
+ end
79
104
  else
80
- status_counts = {content[:status_code].to_i => 1}
105
+ status_counts = {status_code => 1}
106
+ end
107
+
108
+ # record statistics by status type
109
+ if content[:status_code] >= 200 && content[:status_code] < 300
110
+ increment_time_stat("status_200_count")
111
+ elsif content[:status_code] >= 400 && content[:status_code] < 500
112
+ increment_time_stat("status|_400_count")
113
+ elsif content[:status_code] >= 500 && content[:status_code] < 600
114
+ increment_time_stat("status|_500_count")
115
+ end
116
+
117
+ @redis.hset "statistics", "status_counts", status_counts.to_json
118
+
119
+
120
+ ## time based statistics
121
+ increment_time_stat("minute_totals", "minute", 60)
122
+
123
+ get_statistics
124
+ end
125
+
126
+ def record_time_stat(stat_name, value, type="minute", duration=60)
127
+ key = DateTime.now.strftime("%Y-%m-%d %H:%M")
128
+ if type == "hour"
129
+ key = DateTime.now.strftime("%Y-%m-%d %H:00")
81
130
  end
82
- redis.hset "statistics", "status_counts", status_counts.to_json
131
+ stat_value = @redis.hget(stat_name, key).to_i
132
+ stat_count = @redis.hget("#{stat_name}-count", key).to_i
83
133
 
84
- @@statistics = @redis.hgetall "statistics"
134
+ if minute_count.nil?
135
+ @redis.hset stat_name, key, value
136
+ @redis.hset "#{stat_name}-count", key, 1
137
+ else
138
+ @redis.hset stat_name, key, ((stat_value*stat_count) + value) / (stat_count+1)
139
+ @redis.hset "#{stat_name}-count", key, stat_count+1
140
+ end
141
+ end
142
+
143
+ def increment_time_stat(stat_name, type="minute", duration=60)
144
+ key = DateTime.now.strftime("%Y-%m-%d %H:%M")
145
+ if type == "hour"
146
+ key = DateTime.now.strftime("%Y-%m-%d %H:00")
147
+ end
148
+ minute_count = @redis.hget(stat_name, key).to_i
149
+ if minute_count.nil?
150
+ @redis.hset stat_name, key, 1
151
+ else
152
+ @redis.hset stat_name, key, minute_count + 1
153
+ end
154
+ #clear up older data
155
+ @redis.hgetall(stat_name).keys.each do |key|
156
+ if DateTime.parse(key) < DateTime.now-(duration/1440.0)
157
+ puts "Deleting #{stat_name} - #{key}"
158
+ @redis.hdel(stat_name, key)
159
+ end
160
+ end
161
+ end
162
+
163
+ def get_statistics
164
+ @redis.hgetall "statistics"
85
165
  end
86
166
 
87
- set :views, settings.root + '/../views'
167
+ def update_status(status)
168
+ @redis.hset "statistics", "current_status", status
169
+ end
88
170
 
89
- get '/' do
90
- @statistics = @@statistics
91
- @status = @@status
92
- haml :statistics
171
+ def get_status
172
+ @redis.hget "statistics", "current_status"
93
173
  end
94
174
 
95
- def self.start
96
- thread = Thread.new do
97
- Stats.run!
98
-
99
- ## we need to manually kill the main thread as sinatra traps the interrupts
100
- Thread.main.kill
101
- end
175
+ def set_totals
176
+ stats = get_statistics
177
+ stats[:crawled] = @redis.smembers "crawled"
102
178
  end
179
+
103
180
  end
104
181
 
105
182