cobweb 0.0.38 → 0.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. data/README.textile +1 -1
  2. data/lib/cobweb.rb +3 -1
  3. data/lib/cobweb_crawler.rb +98 -100
  4. data/lib/crawl_job.rb +3 -4
  5. data/lib/server.rb +98 -0
  6. data/lib/stats.rb +141 -64
  7. data/public/css/accordion.css +45 -0
  8. data/public/css/custom.css +13 -0
  9. data/public/css/datatable.css +189 -0
  10. data/public/css/datepicker.css +171 -0
  11. data/public/css/form-buttons.css +180 -0
  12. data/public/css/forms.css +489 -0
  13. data/public/css/jquery.fancybox-1.3.4.css +282 -0
  14. data/public/css/jquery.treeview.css +103 -0
  15. data/public/css/link-buttons.css +187 -0
  16. data/public/css/login.css +110 -0
  17. data/public/css/menu.css +156 -0
  18. data/public/css/messages.css +58 -0
  19. data/public/css/modalbox.css +63 -0
  20. data/public/css/statics.css +57 -0
  21. data/public/css/style.css +497 -0
  22. data/public/css/style_text.css +128 -0
  23. data/public/css/tabs.css +58 -0
  24. data/public/css/wysiwyg-editor.css +18 -0
  25. data/public/css/wysiwyg.css +149 -0
  26. data/public/css/wysiwyg.modal.css +69 -0
  27. data/public/gfx/back-menu.gif +0 -0
  28. data/public/gfx/back-submenu.gif +0 -0
  29. data/public/gfx/background.gif +0 -0
  30. data/public/gfx/box-hide.png +0 -0
  31. data/public/gfx/box-search.png +0 -0
  32. data/public/gfx/box-title.gif +0 -0
  33. data/public/gfx/code.gif +0 -0
  34. data/public/gfx/datepicker-arrows.gif +0 -0
  35. data/public/gfx/fancybox/blank.gif +0 -0
  36. data/public/gfx/fancybox/fancy_close.png +0 -0
  37. data/public/gfx/fancybox/fancy_loading.png +0 -0
  38. data/public/gfx/fancybox/fancy_nav_left.png +0 -0
  39. data/public/gfx/fancybox/fancy_nav_right.png +0 -0
  40. data/public/gfx/fancybox/fancy_title_left.png +0 -0
  41. data/public/gfx/fancybox/fancy_title_main.png +0 -0
  42. data/public/gfx/fancybox/fancy_title_over.png +0 -0
  43. data/public/gfx/fancybox/fancy_title_right.png +0 -0
  44. data/public/gfx/fancybox/fancybox-x.png +0 -0
  45. data/public/gfx/fancybox/fancybox.png +0 -0
  46. data/public/gfx/forms/date-next.gif +0 -0
  47. data/public/gfx/forms/date-prev.gif +0 -0
  48. data/public/gfx/forms/forms-checkbox.gif +0 -0
  49. data/public/gfx/forms/forms-date.gif +0 -0
  50. data/public/gfx/forms/forms-file.gif +0 -0
  51. data/public/gfx/forms/forms-input-big.gif +0 -0
  52. data/public/gfx/forms/forms-input-medium.gif +0 -0
  53. data/public/gfx/forms/forms-input-small.gif +0 -0
  54. data/public/gfx/forms/forms-input-xl.gif +0 -0
  55. data/public/gfx/forms/forms-radio.gif +0 -0
  56. data/public/gfx/forms/forms-selectbox-small.gif +0 -0
  57. data/public/gfx/forms/forms-selectbox.gif +0 -0
  58. data/public/gfx/forms/forms-textarea-big.gif +0 -0
  59. data/public/gfx/forms/forms-textarea-medium.gif +0 -0
  60. data/public/gfx/forms/forms-textarea-small.gif +0 -0
  61. data/public/gfx/forms/forms-textarea-xl.gif +0 -0
  62. data/public/gfx/icon-delete.png +0 -0
  63. data/public/gfx/icon-edit.png +0 -0
  64. data/public/gfx/icon-home.gif +0 -0
  65. data/public/gfx/img-delete.png +0 -0
  66. data/public/gfx/img-hover.png +0 -0
  67. data/public/gfx/img-zoom.png +0 -0
  68. data/public/gfx/jquery.wysiwyg.gif +0 -0
  69. data/public/gfx/label-icons.gif +0 -0
  70. data/public/gfx/label.gif +0 -0
  71. data/public/gfx/li-down.gif +0 -0
  72. data/public/gfx/li.gif +0 -0
  73. data/public/gfx/link-button-big.gif +0 -0
  74. data/public/gfx/link-button-medium.gif +0 -0
  75. data/public/gfx/link-button.gif +0 -0
  76. data/public/gfx/loading-2.gif +0 -0
  77. data/public/gfx/loading.gif +0 -0
  78. data/public/gfx/logo.png +0 -0
  79. data/public/gfx/modal-title.gif +0 -0
  80. data/public/gfx/photos/00.jpg +0 -0
  81. data/public/gfx/photos/01.jpg +0 -0
  82. data/public/gfx/photos/01xl.jpg +0 -0
  83. data/public/gfx/photos/02.jpg +0 -0
  84. data/public/gfx/photos/02xl.jpg +0 -0
  85. data/public/gfx/photos/03.jpg +0 -0
  86. data/public/gfx/photos/03xl.jpg +0 -0
  87. data/public/gfx/photos/04.jpg +0 -0
  88. data/public/gfx/photos/04xl.jpg +0 -0
  89. data/public/gfx/photos/05.jpg +0 -0
  90. data/public/gfx/photos/05xl.jpg +0 -0
  91. data/public/gfx/photos/06.jpg +0 -0
  92. data/public/gfx/photos/06xl.jpg +0 -0
  93. data/public/gfx/photos/07.jpg +0 -0
  94. data/public/gfx/photos/07xl.jpg +0 -0
  95. data/public/gfx/photos/08.jpg +0 -0
  96. data/public/gfx/photos/08xl.jpg +0 -0
  97. data/public/gfx/photos/09.jpg +0 -0
  98. data/public/gfx/photos/09xl.jpg +0 -0
  99. data/public/gfx/photos/10.jpg +0 -0
  100. data/public/gfx/photos/10xl.jpg +0 -0
  101. data/public/gfx/photos/11.jpg +0 -0
  102. data/public/gfx/photos/11xl.jpg +0 -0
  103. data/public/gfx/photos/12.jpg +0 -0
  104. data/public/gfx/photos/12xl.jpg +0 -0
  105. data/public/gfx/photos/13.jpg +0 -0
  106. data/public/gfx/photos/13xl.jpg +0 -0
  107. data/public/gfx/photos/14.jpg +0 -0
  108. data/public/gfx/photos/14xl.jpg +0 -0
  109. data/public/gfx/photos/15.jpg +0 -0
  110. data/public/gfx/photos/15xl.jpg +0 -0
  111. data/public/gfx/search-button.gif +0 -0
  112. data/public/gfx/search-input.gif +0 -0
  113. data/public/gfx/slider-button.gif +0 -0
  114. data/public/gfx/system-messages.gif +0 -0
  115. data/public/gfx/table-asc-arrow.gif +0 -0
  116. data/public/gfx/table-desc-arrow.gif +0 -0
  117. data/public/gfx/table-first.gif +0 -0
  118. data/public/gfx/table-last.gif +0 -0
  119. data/public/gfx/table-next.gif +0 -0
  120. data/public/gfx/table-number.gif +0 -0
  121. data/public/gfx/table-prev.gif +0 -0
  122. data/public/gfx/table-rows.gif +0 -0
  123. data/public/gfx/table-search.gif +0 -0
  124. data/public/gfx/table-thead.gif +0 -0
  125. data/public/gfx/tooltip.gif +0 -0
  126. data/public/gfx/treeview/ajax-loader.gif +0 -0
  127. data/public/gfx/treeview/file.gif +0 -0
  128. data/public/gfx/treeview/folder-closed.gif +0 -0
  129. data/public/gfx/treeview/folder.gif +0 -0
  130. data/public/gfx/treeview/minus.gif +0 -0
  131. data/public/gfx/treeview/plus.gif +0 -0
  132. data/public/gfx/treeview/treeview-default-line.gif +0 -0
  133. data/public/gfx/treeview/treeview-default.gif +0 -0
  134. data/public/js/controls/wysiwyg.image.js +284 -0
  135. data/public/js/controls/wysiwyg.link.js +210 -0
  136. data/public/js/controls/wysiwyg.table.js +151 -0
  137. data/public/js/customInput.jquery.js +68 -0
  138. data/public/js/excanvas.min.js +1 -0
  139. data/public/js/hoverIntent.js +84 -0
  140. data/public/js/inline.js +392 -0
  141. data/public/js/jquery-1.7.1.min.js +4 -0
  142. data/public/js/jquery-ui-select.js +522 -0
  143. data/public/js/jquery-ui-timepicker-addon.js +1299 -0
  144. data/public/js/jquery-ui.js +791 -0
  145. data/public/js/jquery.dataTables.js +7440 -0
  146. data/public/js/jquery.fancybox-1.3.4.js +1156 -0
  147. data/public/js/jquery.filestyle.mini.js +2 -0
  148. data/public/js/jquery.flot.js +2600 -0
  149. data/public/js/jquery.flot.resize.min.js +60 -0
  150. data/public/js/jquery.graphtable-0.2.js +179 -0
  151. data/public/js/jquery.tipsy.js +104 -0
  152. data/public/js/jquery.treeview.js +256 -0
  153. data/public/js/jquery.wysiwyg.js +2454 -0
  154. data/public/js/plugins/wysiwyg.rmFormat.js +348 -0
  155. data/public/js/superfish.js +121 -0
  156. data/public/js/supersubs.js +90 -0
  157. data/views/home.haml +54 -0
  158. data/views/layout.haml +89 -0
  159. data/views/statistics.haml +251 -71
  160. metadata +175 -22
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.38
2
+ h1. Cobweb v0.0.39
3
3
 
4
4
  h2. Intro
5
5
 
data/lib/cobweb.rb CHANGED
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.38"
23
+ "0.0.39"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -67,6 +67,8 @@ class Cobweb
67
67
  @redis.set("crawl-counter", 0)
68
68
  @redis.set("queue-counter", 1)
69
69
 
70
+ @stats = Stats.new(request)
71
+ @stats.start_crawl(request)
70
72
 
71
73
  # add internal_urls into redis
72
74
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
@@ -11,9 +11,18 @@ class CobwebCrawler
11
11
  @statistic = {}
12
12
 
13
13
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
14
- crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
14
+ @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
15
+ @options[:crawl_id] = @crawl_id
15
16
 
16
- @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{crawl_id}")
17
+ @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{@crawl_id}")
18
+ @options[:internal_urls] = [] if @options[:internal_urls].nil?
19
+ @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
20
+ @debug = @options[:debug]
21
+
22
+ @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
23
+ if @options[:web_statistics]
24
+ Server.start
25
+ end
17
26
 
18
27
  @cobweb = Cobweb.new(@options)
19
28
  end
@@ -21,124 +30,113 @@ class CobwebCrawler
21
30
  def crawl(base_url, crawl_options = {}, &block)
22
31
  @options[:base_url] = base_url unless @options.has_key? :base_url
23
32
 
33
+ @options[:internal_urls] << base_url if @options[:internal_urls].empty?
34
+ @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
35
+
24
36
  @crawl_options = crawl_options
25
37
 
38
+ puts "http://localhost:4567/statistics/#{@crawl_id}"
39
+ puts ""
40
+
26
41
  @redis.sadd "queued", base_url
27
42
  crawl_counter = @redis.scard("crawled").to_i
28
43
  queue_counter = @redis.scard("queued").to_i
29
44
 
30
- while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
31
- thread = Thread.new do
32
-
33
- url = @redis.spop "queued"
34
- crawl_counter = @redis.scard("crawled").to_i
35
- queue_counter = @redis.scard("queued").to_i
45
+ begin
46
+ @stats.start_crawl(@options)
47
+ while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
48
+ thread = Thread.new do
36
49
 
37
- @options[:url] = url
38
- unless @redis.sismember("crawled", url.to_s)
39
- begin
40
- Stats.update_status("Requesting #{url}...")
41
- content = @cobweb.get(url)
42
- Stats.update_status("Processing #{url}...")
43
-
44
- if @statistic[:average_response_time].nil?
45
- @statistic[:average_response_time] = content[:response_time].to_f
46
- else
47
- @statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
48
- end
49
-
50
- @statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
51
- @statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
52
-
53
- if @statistic[:average_length]
54
- @statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
55
- else
56
- @statistic[:average_length] = content[:length].to_i
57
- end
58
-
59
- @statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
60
- @statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
61
- @statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
50
+ url = @redis.spop "queued"
51
+ crawl_counter = @redis.scard("crawled").to_i
52
+ queue_counter = @redis.scard("queued").to_i
53
+
54
+ @options[:url] = url
55
+ unless @redis.sismember("crawled", url.to_s)
56
+ begin
57
+ @stats.update_status("Requesting #{url}...")
58
+ content = @cobweb.get(url)
59
+ @stats.update_status("Processing #{url}...")
62
60
 
63
- if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
64
- @statistic[:page_count] = @statistic[:page_count].to_i + 1
65
- @statistic[:page_size] = @statistic[:page_size].to_i + content[:length].to_i
66
- else
67
- @statistic[:asset_count] = @statistic[:asset_count].to_i + 1
68
- @statistic[:asset_size] = @statistic[:asset_size].to_i + content[:length].to_i
69
- end
70
-
71
- @statistic[:total_redirects] = 0 if @statistic[:total_redirects].nil?
72
- @statistic[:total_redirects] += content[:redirect_through].count unless content[:redirect_through].nil?
61
+ @redis.sadd "crawled", url.to_s
62
+ @redis.incr "crawl-counter"
73
63
 
74
- @statistic[:crawl_counter] = crawl_counter
75
- @statistic[:queue_counter] = queue_counter
64
+ internal_links = all_links_from_content(content).map{|link| link.to_s}
65
+
66
+ # reject the link if we've crawled it or queued it
67
+ internal_links.reject!{|link| @redis.sismember("crawled", link)}
68
+ internal_links.reject!{|link| @redis.sismember("queued", link)}
76
69
 
77
- mime_counts = {}
78
- if @statistic.has_key? :mime_counts
79
- mime_counts = @statistic[:mime_counts]
80
- if mime_counts.has_key? content[:mime_type]
81
- mime_counts[content[:mime_type]] += 1
82
- else
83
- mime_counts[content[:mime_type]] = 1
84
- end
85
- else
86
- mime_counts = {content[:mime_type] => 1}
87
- end
88
- @statistic[:mime_counts] = mime_counts
89
70
 
90
- status_counts = {}
91
-
92
- if @statistic.has_key? :status_counts
93
- status_counts = @statistic[:status_counts]
94
- if status_counts.has_key? content[:status_code].to_i
95
- status_counts[content[:status_code].to_i] += 1
96
- else
97
- status_counts[content[:status_code].to_i] = 1
98
- end
99
- else
100
- status_counts = {content[:status_code].to_i => 1}
101
- end
102
- @statistic[:status_counts] = status_counts
71
+ # select the link if its internal
72
+ internal_links.select!{|link| internal_link?(link)}
103
73
 
104
- @redis.sadd "crawled", url.to_s
105
- @redis.incr "crawl-counter"
106
-
107
- content[:links].keys.map{|key| content[:links][key]}.flatten.each do |content_link|
108
- link = content_link.to_s
109
- unless @redis.sismember("crawled", link)
110
- puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
111
- if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
112
- puts "Matched as #{link} as internal" if @options[:debug]
113
- unless @redis.sismember("crawled", link) || @redis.sismember("queued", link)
114
- puts "Added #{link.to_s} to queue" if @options[:debug]
115
- @redis.sadd "queued", link
116
- crawl_counter = @redis.scard("crawled").to_i
117
- queue_counter = @redis.scard("queued").to_i
118
- end
119
- end
74
+ internal_links.each do |link|
75
+ puts "Added #{link.to_s} to queue" if @debug
76
+ @redis.sadd "queued", link
120
77
  end
121
- end
122
78
 
123
- crawl_counter = @redis.scard("crawled").to_i
124
- queue_counter = @redis.scard("queued").to_i
125
- Stats.update_statistics(@statistic)
126
- Stats.update_status("Completed #{url}.")
127
- puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @options[:debug]
79
+ crawl_counter = @redis.scard("crawled").to_i
80
+ queue_counter = @redis.scard("queued").to_i
81
+
82
+ @stats.update_statistics(content)
83
+ @stats.update_status("Completed #{url}.")
84
+ puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
128
85
 
129
- yield content, @statistic if block_given?
86
+ yield content, @statistic if block_given?
130
87
 
131
- rescue => e
132
- puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
133
- ap e
134
- ap e.backtrace
88
+ rescue => e
89
+ puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
90
+ ap e
91
+ ap e.backtrace
92
+ end
93
+ else
94
+ puts "Already crawled #{@options[:url]}" if @debug
135
95
  end
136
- else
137
- puts "Already crawled #{@options[:url]}" if @options[:debug]
138
96
  end
97
+ thread.join
139
98
  end
140
- thread.join
99
+ ensure
100
+ @stats.end_crawl(@options)
141
101
  end
142
102
  @statistic
143
103
  end
104
+
105
+
106
+ def internal_link?(link)
107
+ puts "Checking internal link for: #{link}" if @debug
108
+ valid_link = true
109
+ internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
110
+ puts "Matching against #{pattern.source}" if @debug
111
+ if link.match(pattern)
112
+ puts "Matched as internal" if @debug
113
+ return true
114
+ end
115
+ end
116
+ puts "Didn't match any pattern so marked as not internal" if @debug
117
+ false
118
+ end
119
+
120
+ def internal_patterns
121
+ @internal_patterns ||= @redis.smembers("internal_urls")
122
+ end
123
+
124
+ def all_links_from_content(content)
125
+ links = content[:links].keys.map{|key| content[:links][key]}.flatten
126
+ links.reject!{|link| link.starts_with?("javascript:")}
127
+ links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
128
+ links.select!{|link| link.scheme == "http" || link.scheme == "https"}
129
+ links.uniq
130
+ links
131
+ end
132
+ end
133
+
134
+ class String
135
+ def starts_with?(val)
136
+ if self.length >= val.length
137
+ self[0..val.length-1] == val
138
+ else
139
+ false
140
+ end
141
+ end
144
142
  end
data/lib/crawl_job.rb CHANGED
@@ -14,6 +14,7 @@ class CrawlJob
14
14
 
15
15
  content_request[:redis_options] = {} unless content_request.has_key? :redis_options
16
16
  @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
17
+ @stats = Stats.new(content_request)
17
18
 
18
19
  @debug = content_request[:debug]
19
20
 
@@ -32,7 +33,7 @@ class CrawlJob
32
33
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
34
 
34
35
  ## update statistics
35
- Stats.set_statistics_in_redis(@redis, content)
36
+ @stats.update_statistics(content)
36
37
 
37
38
  # set the base url if this is the first page
38
39
  set_base_url @redis, content, content_request
@@ -81,9 +82,7 @@ class CrawlJob
81
82
 
82
83
  def self.finished(content_request)
83
84
  # finished
84
-
85
- Stats.set_totals
86
-
85
+ @stats.end_crawl(content_request)
87
86
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
88
87
  end
89
88
 
data/lib/server.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'sinatra'
2
+ require 'haml'
3
+
4
+ class Server < Sinatra::Base
5
+
6
+ set :views, settings.root + '/../views'
7
+ puts "#{settings.root}/../public"
8
+ set :public_folder, settings.root + '/../public'
9
+ enable :static
10
+
11
+ get '/' do
12
+ @full_redis = Redis.new
13
+
14
+ @colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
15
+
16
+ @crawls = []
17
+ @full_redis.smembers("cobweb_crawls").each do |crawl_id|
18
+ redis = NamespacedRedis.new({}, "cobweb-#{crawl_id}")
19
+ stats = HashUtil.deep_symbolize_keys({
20
+ :crawl_details => redis.hgetall("crawl_details"),
21
+ :statistics => redis.hgetall("statistics"),
22
+ :minute_totals => redis.hgetall("minute_totals")
23
+ })
24
+ @crawls << stats
25
+ end
26
+
27
+ haml :home
28
+ end
29
+
30
+ get '/statistics/:crawl_id' do
31
+ redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
32
+
33
+ @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
34
+ if @statistics[:status_counts].nil?
35
+ @statistics[:status_counts]
36
+ else
37
+ @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
38
+ end
39
+ if @statistics[:mime_counts].nil?
40
+ @statistics[:mime_counts]
41
+ else
42
+ @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
43
+ end
44
+ @crawl = {
45
+ :statistics => @statistics,
46
+ :crawl_details => HashUtil.deep_symbolize_keys(redis.hgetall("crawl_details")),
47
+ :minute_totals => HashUtil.deep_symbolize_keys(redis.hgetall("minute_totals")),
48
+ :status_200_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_200_count")),
49
+ :status_400_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_400_count")),
50
+ :status_500_count => HashUtil.deep_symbolize_keys(redis.hgetall("status_500_count")),
51
+ :mime_text_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_text_count")),
52
+ :mime_image_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_image_count")),
53
+ :mime_application_count => HashUtil.deep_symbolize_keys(redis.hgetall("mime_application_count")),
54
+ :pages_count => HashUtil.deep_symbolize_keys(redis.hgetall("pages_count")),
55
+ :assets_count => HashUtil.deep_symbolize_keys(redis.hgetall("assets_count"))
56
+ }
57
+ ap @crawl
58
+ haml :statistics
59
+ end
60
+
61
+ def self.start
62
+ unless Server.running?
63
+ thread = Thread.new do
64
+ puts "Starting Sinatra"
65
+ Server.run!
66
+ puts "Stopping crawl..."
67
+ ## we need to manually kill the main thread as sinatra traps the interrupts
68
+ Thread.main.kill
69
+ end
70
+ end
71
+ end
72
+
73
+ end
74
+
75
+ class HashUtil
76
+ def self.deep_symbolize_keys(hash)
77
+ hash.keys.each do |key|
78
+ value = hash[key]
79
+ hash.delete(key)
80
+ hash[key.to_sym] = value
81
+ if hash[key.to_sym].instance_of? Hash
82
+ hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
83
+ end
84
+ end
85
+ hash
86
+ end
87
+ end
88
+
89
+ class Numeric
90
+ def to_human
91
+ units = %w{B KB MB GB TB}
92
+ ap self
93
+ e = 0
94
+ e = (Math.log(self)/Math.log(1024)).floor unless self==0
95
+ s = "%.3f" % (to_f / 1024**e)
96
+ s.sub(/\.?0*$/, units[e])
97
+ end
98
+ end
data/lib/stats.rb CHANGED
@@ -1,63 +1,68 @@
1
- require 'sinatra'
2
- require 'haml'
3
-
4
- class Stats < Sinatra::Base
5
-
6
- def self.update_statistics(statistics)
7
- @@statistics = statistics
8
- @@statistics
9
- end
1
+ class Stats
10
2
 
11
- def self.get_statistics
12
- @@statistics ||= {}
3
+ def initialize(options)
4
+ @full_redis = Redis.new(options[:redis_options])
5
+ @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
13
6
  end
14
7
 
15
- def self.update_status(status)
16
- @@status = status
8
+ def start_crawl(options)
9
+ @full_redis.sadd "cobweb_crawls", options[:crawl_id]
10
+ options.keys.each do |key|
11
+ @redis.hset "crawl_details", key, options[key]
12
+ end
13
+ @redis.hset "statistics", "current_status", "Crawl Starting..."
17
14
  end
18
15
 
19
- def self.set_totals
20
- stats = @redis.hgetall "statistics"
21
- stats[:total_pages] = @redis.get("total_pages").to_i
22
- stats[:total_assets] = @redis.get("total_assets").to_i
23
- stats[:crawl_counter] = @crawl_counter
24
- stats[:queue_counter] = @queue_counter
25
- stats[:crawled] = @redis.smembers "crawled"
26
-
27
- Stats.update_statistics(stats)
16
+ def end_crawl(options)
17
+ @full_redis.srem "cobweb_crawls", options[:crawl_id]
18
+ @redis.hset "statistics", "current_status", "Crawl Stopped"
19
+ @redis.del "crawl_details"
28
20
  end
29
21
 
30
- def self.set_statistics_in_redis(redis, content)
31
-
32
- @redis = redis
22
+ def update_statistics(content)
33
23
 
34
- crawl_counter = redis.get("crawl-counter").to_i
35
- queue_counter = redis.get("queue-counter").to_i
24
+ crawl_counter = @redis.scard("crawled").to_i
25
+ queue_counter = @redis.scard("queued").to_i
36
26
 
37
- if redis.hexists "statistics", "average_response_time"
38
- redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
27
+ if @redis.hexists "statistics", "average_response_time"
28
+ @redis.hset("statistics", "average_response_time", (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
39
29
  else
40
- redis.hset("statistics", "average_response_time", content[:response_time].to_f)
30
+ @redis.hset("statistics", "average_response_time", content[:response_time].to_f)
41
31
  end
42
- redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
43
- redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
44
- if redis.hexists "statistics", "average_length"
45
- redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
32
+ @redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if @redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > @redis.hget("statistics", "maximum_response_time").to_f
33
+ @redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if @redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < @redis.hget("statistics", "minimum_response_time").to_f
34
+ if @redis.hexists "statistics", "average_length"
35
+ @redis.hset("statistics", "average_length", (((@redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
46
36
  else
47
- redis.hset("statistics", "average_length", content[:length].to_i)
37
+ @redis.hset("statistics", "average_length", content[:length].to_i)
48
38
  end
49
- redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
50
- redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
51
-
39
+ @redis.hset "statistics", "maximum_length", content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @redis.hget("statistics", "maximum_length").to_i
40
+ @redis.hset "statistics", "minimum_length", content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @redis.hget("statistics", "minimum_length").to_i
41
+
42
+
52
43
  if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
53
- redis.incr "total_pages"
44
+ @redis.hset "statistics", "page_count", @redis.hget("statistics", "page_count").to_i + 1
45
+ @redis.hset "statistics", "page_size", @redis.hget("statistics", "page_size").to_i + content[:length].to_i
46
+ increment_time_stat("pages_count")
54
47
  else
55
- redis.incr "total_assets"
48
+ @redis.hset "statistics", "asset_count", @redis.hget("statistics", "asset_count").to_i + 1
49
+ @redis.hset "statistics", "asset_size", @redis.hget("statistics", "asset_size").to_i + content[:length].to_i
50
+ increment_time_stat("assets_count")
56
51
  end
52
+
53
+ total_redirects = @redis.hget("statistics", "total_redirects").to_i
54
+ @redis.hset "statistics", "total_redirects", 0 if total_redirects.nil?
55
+ @redis.hset("statistics", "total_redirects", total_redirects += content[:redirect_through].count) unless content[:redirect_through].nil?
56
+
57
+ @redis.hset "statistics", "crawl_counter", crawl_counter
58
+ @redis.hset "statistics", "queue_counter", queue_counter
59
+
60
+ total_length = @redis.hget("statistics", "total_length").to_i
61
+ @redis.hset "statistics", "total_length", total_length + content[:length].to_i
57
62
 
58
63
  mime_counts = {}
59
- if redis.hexists "statistics", "mime_counts"
60
- mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
64
+ if @redis.hexists "statistics", "mime_counts"
65
+ mime_counts = JSON.parse(@redis.hget("statistics", "mime_counts"))
61
66
  if mime_counts.has_key? content[:mime_type]
62
67
  mime_counts[content[:mime_type]] += 1
63
68
  else
@@ -66,40 +71,112 @@ class Stats < Sinatra::Base
66
71
  else
67
72
  mime_counts = {content[:mime_type] => 1}
68
73
  end
69
- redis.hset "statistics", "mime_counts", mime_counts.to_json
74
+ @redis.hset "statistics", "mime_counts", mime_counts.to_json
70
75
 
76
+ # record mime categories stats
77
+ if content[:mime_type].starts_with? "text"
78
+ increment_time_stat("mime_text_count")
79
+ elsif content[:mime_type].starts_with? "application"
80
+ increment_time_stat("mime_application_count")
81
+ elsif content[:mime_type].starts_with? "audio"
82
+ increment_time_stat("mime_audio_count")
83
+ elsif content[:mime_type].starts_with? "image"
84
+ increment_time_stat("mime_image_count")
85
+ elsif content[:mime_type].starts_with? "message"
86
+ increment_time_stat("mime_message_count")
87
+ elsif content[:mime_type].starts_with? "model"
88
+ increment_time_stat("mime_model_count")
89
+ elsif content[:mime_type].starts_with? "multipart"
90
+ increment_time_stat("mime_multipart_count")
91
+ elsif content[:mime_type].starts_with? "video"
92
+ increment_time_stat("mime_video_count")
93
+ end
94
+
71
95
  status_counts = {}
72
- if redis.hexists "statistics", "status_counts"
73
- status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
74
- if status_counts.has_key? content[:status_code].to_i
75
- status_counts[content[:status_code].to_i] += 1
96
+ if @redis.hexists "statistics", "status_counts"
97
+ status_counts = HashUtil.deep_symbolize_keys(JSON.parse(@redis.hget("statistics", "status_counts")))
98
+ status_code = content[:status_code].to_i.to_s.to_sym
99
+ if status_counts.has_key? status_code
100
+ status_counts[status_code] += 1
76
101
  else
77
- status_counts[content[:status_code].to_i] = 1
78
- end
102
+ status_counts[status_code] = 1
103
+ end
79
104
  else
80
- status_counts = {content[:status_code].to_i => 1}
105
+ status_counts = {status_code => 1}
106
+ end
107
+
108
+ # record statistics by status type
109
+ if content[:status_code] >= 200 && content[:status_code] < 300
110
+ increment_time_stat("status_200_count")
111
+ elsif content[:status_code] >= 400 && content[:status_code] < 500
112
+ increment_time_stat("status|_400_count")
113
+ elsif content[:status_code] >= 500 && content[:status_code] < 600
114
+ increment_time_stat("status|_500_count")
115
+ end
116
+
117
+ @redis.hset "statistics", "status_counts", status_counts.to_json
118
+
119
+
120
+ ## time based statistics
121
+ increment_time_stat("minute_totals", "minute", 60)
122
+
123
+ get_statistics
124
+ end
125
+
126
+ def record_time_stat(stat_name, value, type="minute", duration=60)
127
+ key = DateTime.now.strftime("%Y-%m-%d %H:%M")
128
+ if type == "hour"
129
+ key = DateTime.now.strftime("%Y-%m-%d %H:00")
81
130
  end
82
- redis.hset "statistics", "status_counts", status_counts.to_json
131
+ stat_value = @redis.hget(stat_name, key).to_i
132
+ stat_count = @redis.hget("#{stat_name}-count", key).to_i
83
133
 
84
- @@statistics = @redis.hgetall "statistics"
134
+ if minute_count.nil?
135
+ @redis.hset stat_name, key, value
136
+ @redis.hset "#{stat_name}-count", key, 1
137
+ else
138
+ @redis.hset stat_name, key, ((stat_value*stat_count) + value) / (stat_count+1)
139
+ @redis.hset "#{stat_name}-count", key, stat_count+1
140
+ end
141
+ end
142
+
143
+ def increment_time_stat(stat_name, type="minute", duration=60)
144
+ key = DateTime.now.strftime("%Y-%m-%d %H:%M")
145
+ if type == "hour"
146
+ key = DateTime.now.strftime("%Y-%m-%d %H:00")
147
+ end
148
+ minute_count = @redis.hget(stat_name, key).to_i
149
+ if minute_count.nil?
150
+ @redis.hset stat_name, key, 1
151
+ else
152
+ @redis.hset stat_name, key, minute_count + 1
153
+ end
154
+ #clear up older data
155
+ @redis.hgetall(stat_name).keys.each do |key|
156
+ if DateTime.parse(key) < DateTime.now-(duration/1440.0)
157
+ puts "Deleting #{stat_name} - #{key}"
158
+ @redis.hdel(stat_name, key)
159
+ end
160
+ end
161
+ end
162
+
163
+ def get_statistics
164
+ @redis.hgetall "statistics"
85
165
  end
86
166
 
87
- set :views, settings.root + '/../views'
167
+ def update_status(status)
168
+ @redis.hset "statistics", "current_status", status
169
+ end
88
170
 
89
- get '/' do
90
- @statistics = @@statistics
91
- @status = @@status
92
- haml :statistics
171
+ def get_status
172
+ @redis.hget "statistics", "current_status"
93
173
  end
94
174
 
95
- def self.start
96
- thread = Thread.new do
97
- Stats.run!
98
-
99
- ## we need to manually kill the main thread as sinatra traps the interrupts
100
- Thread.main.kill
101
- end
175
+ def set_totals
176
+ stats = get_statistics
177
+ stats[:crawled] = @redis.smembers "crawled"
102
178
  end
179
+
103
180
  end
104
181
 
105
182