flapjack 0.7.28 → 0.7.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +10 -0
  3. data/features/notification_rules.feature +25 -25
  4. data/features/rollup.feature +38 -18
  5. data/features/steps/events_steps.rb +10 -5
  6. data/features/steps/notifications_steps.rb +8 -4
  7. data/lib/flapjack/data/alert.rb +207 -0
  8. data/lib/flapjack/data/contact.rb +14 -7
  9. data/lib/flapjack/data/entity_check.rb +4 -3
  10. data/lib/flapjack/data/notification.rb +28 -27
  11. data/lib/flapjack/gateways/api/contact_methods.rb +32 -12
  12. data/lib/flapjack/gateways/email.rb +49 -53
  13. data/lib/flapjack/gateways/email/alert.html.erb +15 -15
  14. data/lib/flapjack/gateways/email/alert.text.erb +15 -15
  15. data/lib/flapjack/gateways/email/alert_subject.text.erb +3 -13
  16. data/lib/flapjack/gateways/email/rollup.html.erb +6 -6
  17. data/lib/flapjack/gateways/email/rollup.text.erb +7 -7
  18. data/lib/flapjack/gateways/email/rollup_subject.text.erb +1 -19
  19. data/lib/flapjack/gateways/jabber.rb +57 -47
  20. data/lib/flapjack/gateways/jabber/alert.text.erb +12 -0
  21. data/lib/flapjack/gateways/jabber/rollup.text.erb +2 -0
  22. data/lib/flapjack/gateways/pagerduty.rb +60 -30
  23. data/lib/flapjack/gateways/pagerduty/alert.text.erb +10 -0
  24. data/lib/flapjack/gateways/sms_messagenet.rb +29 -36
  25. data/lib/flapjack/gateways/sms_messagenet/alert.text.erb +4 -14
  26. data/lib/flapjack/gateways/sms_messagenet/rollup.text.erb +2 -34
  27. data/lib/flapjack/gateways/web.rb +23 -14
  28. data/lib/flapjack/gateways/web/views/check.html.erb +16 -11
  29. data/lib/flapjack/gateways/web/views/contact.html.erb +58 -16
  30. data/lib/flapjack/gateways/web/views/self_stats.html.erb +80 -71
  31. data/lib/flapjack/notifier.rb +8 -2
  32. data/lib/flapjack/pikelet.rb +17 -3
  33. data/lib/flapjack/processor.rb +0 -1
  34. data/lib/flapjack/redis_pool.rb +1 -1
  35. data/lib/flapjack/utility.rb +13 -0
  36. data/lib/flapjack/version.rb +1 -1
  37. data/spec/lib/flapjack/data/contact_spec.rb +44 -29
  38. data/spec/lib/flapjack/gateways/api/contact_methods_spec.rb +24 -4
  39. data/spec/lib/flapjack/gateways/email_spec.rb +0 -5
  40. data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -1
  41. data/spec/lib/flapjack/gateways/pagerduty_spec.rb +5 -2
  42. data/spec/lib/flapjack/gateways/{sms_messagenet.spec.rb → sms_messagenet_spec.rb} +16 -12
  43. data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
  44. data/spec/spec_helper.rb +28 -6
  45. metadata +43 -89
@@ -1,15 +1,5 @@
1
- <% case @notification_type -%>
2
- <% when "problem" -%>
3
- <%= "Problem: " -%>
4
- <% when "recovery" -%>
5
- <%= "Recovery: " -%>
6
- <% when "acknowledgement" -%>
7
- <%= "Ack: " -%>
8
- <% when "test" -%>
9
- <%= "Test notification: " -%>
1
+ <%= @alert.type_sentence_case %>: '<%= @alert.check %>' on <%= @alert.entity -%>
2
+ <% unless ['acknowledgement', 'test'].include?(@alert.notification_type) -%>
3
+ is <%= @alert.state_title_case -%>
10
4
  <% end -%>
11
- '<%= @check %>' on <%= @entity_name -%>
12
- <% unless ['acknowledgement', 'test'].include?(@notification_type) -%>
13
- is <%= ['ok'].include?(@state) ? @state.upcase : @state.titleize -%>
14
- <% end -%>
15
- at <%= Time.at(@time).strftime('%-d %b %H:%M') %>, <%= @summary -%>
5
+ at <%= Time.at(@alert.time).strftime('%-d %b %H:%M') %>, <%= @alert.summary -%>
@@ -1,34 +1,2 @@
1
- <%
2
- state_counts = @rollup_alerts.inject({}) do |memo, alert|
3
- memo[alert[1]['state']] = (memo[alert[1]['state']] || 0) + 1
4
- memo
5
- end
6
-
7
- states_summary = ['critical', 'warning', 'unknown'].inject([]) do |memo, state|
8
- next memo unless state_counts[state]
9
- memo << "#{state.titleize}: #{state_counts[state]}"
10
- memo
11
- end.join(', ')
12
-
13
- states_detail = ['critical', 'warning', 'unknown'].inject([]) do |memo, state|
14
-
15
- alerts = @rollup_alerts.find_all {|alert| alert[1]['state'] == state}
16
- next memo if alerts.to_a.empty?
17
-
18
- memo << "#{state.titleize}: " + alerts.inject([]) do |ret, alert|
19
- entity, check = alert[0].split(':', 2)
20
- ret << "'#{check}' on #{entity}"
21
- ret
22
- end.join(', ')
23
-
24
- memo
25
- end.join('; ')
26
-
27
- -%>
28
- <% case @rollup -%>
29
- <% when "problem" -%>
30
- <%= "Problem summary: " -%>
31
- <% when "recovery" -%>
32
- <%= "Problem summaries finishing: " -%>
33
- <% end -%>
34
- <%= states_summary %> (<%= states_detail -%>)
1
+ <%= @alert.type_sentence_case %>: <%= @alert.rollup_states_summary -%>
2
+ (<%= @alert.rollup_states_detail_text(:max_checks_per_state => 3) -%>)
@@ -371,20 +371,29 @@ module Flapjack
371
371
  def self_stats
372
372
  @fqdn = `/bin/hostname -f`.chomp
373
373
  @pid = Process.pid
374
- @instance_id = "#{@fqdn}:#{@pid}"
375
-
376
- @dbsize = redis.dbsize
377
- @executive_instances = redis.keys("executive_instance:*").map {|i|
378
- [ i.match(/executive_instance:(.*)/)[1], redis.hget(i, 'boot_time').to_i ]
379
- }.sort {|a, b| b[1] <=> a[1]}
380
- @event_counters = redis.hgetall('event_counters')
381
- @event_counters_instance = redis.hgetall("event_counters:#{@instance_id}")
382
- @boot_time = Time.at(redis.hget("executive_instance:#{@instance_id}", 'boot_time').to_i)
383
- @uptime = Time.now.to_i - @boot_time.to_i
384
- @uptime_string = time_period_in_words(@uptime)
385
- @event_rate_all = (@uptime > 0) ?
386
- (@event_counters_instance['all'].to_f / @uptime) : 0
387
- @events_queued = redis.llen('events')
374
+
375
+ @dbsize = redis.dbsize
376
+ @executive_instances = redis.keys("executive_instance:*").inject({}) do |memo, i|
377
+ instance_id = i.match(/executive_instance:(.*)/)[1]
378
+ boot_time = redis.hget(i, 'boot_time').to_i
379
+ uptime = Time.now.to_i - boot_time
380
+ uptime_string = ChronicDuration.output(uptime, :format => :short, :keep_zero => true, :units => 2)
381
+ event_counters = redis.hgetall("event_counters:#{instance_id}")
382
+ event_rates = event_counters.inject({}) do |er, ec|
383
+ er[ec[0]] = uptime && uptime > 0 ? (ec[1].to_f / uptime).round : nil
384
+ er
385
+ end
386
+ memo[instance_id] = {
387
+ 'boot_time' => boot_time,
388
+ 'uptime' => uptime,
389
+ 'uptime_string' => uptime_string,
390
+ 'event_counters' => event_counters,
391
+ 'event_rates' => event_rates
392
+ }
393
+ memo
394
+ end
395
+ @event_counters = redis.hgetall('event_counters')
396
+ @events_queued = redis.llen('events')
388
397
  end
389
398
 
390
399
  def entity_stats
@@ -19,6 +19,18 @@
19
19
  </div>
20
20
  <% state_qualifier = @check_enabled ? '' : "DISABLED. Last " %>
21
21
  <h3><%= state_qualifier %>State: <%= @check_state ? h(@check_state.upcase) : '' %></h3>
22
+ <% if @current_unscheduled_maintenance %>
23
+ <% ack_msg = "#{@current_unscheduled_maintenance[:summary]}" %>
24
+ <% ack_msg = ack_msg.length > 0 ? ack_msg : 'no summary given' %>
25
+ <h4>Acknowledged (<%= h ack_msg %>)</h4>
26
+ <% start = Time.at(@current_unscheduled_maintenance[:start_time]) %>
27
+ <% finish = Time.at(@current_unscheduled_maintenance[:start_time] + @current_unscheduled_maintenance[:duration]) %>
28
+ <% remain = time_period_in_words( (finish - current_time).ceil ) %>
29
+ <p><%= h start.to_s %> -&gt; <%= h finish.to_s %> (<%= h remain %> remaining)</p>
30
+ <form action="/end_unscheduled_maintenance/<%= check_path_escaped %>" method="post">
31
+ <input type="submit" value="End Unscheduled Maintenance (Unacknowledge)" class="button">
32
+ </form>
33
+ <% end %>
22
34
  <% if (['warning', 'critical', 'unknown'].include?(@check_state) and !@current_scheduled_maintenance) %>
23
35
  <form action="/acknowledgements/<%= check_path_escaped %>" method="post" class="form-inline">
24
36
  <input type="hidden" name="acknowledgement_id" value="<%= @acknowledgement_id %>">
@@ -31,18 +43,10 @@
31
43
  e.g. "5 hours"
32
44
  </form>
33
45
  <% end %>
34
- <% if @current_unscheduled_maintenance %>
35
- <h3>(Acknowledged - <%= h @current_unscheduled_maintenance[:summary] %>)</h3>
36
- <% start = Time.at(@current_unscheduled_maintenance[:start_time]) %>
37
- <% finish = Time.at(@current_unscheduled_maintenance[:start_time] + @current_unscheduled_maintenance[:duration]) %>
38
- <% remain = time_period_in_words( (finish - current_time).ceil ) %>
39
- <p><%= h start.to_s %> -&gt; <%= h finish.to_s %> (<%= h remain %> remaining)</p>
40
- <form action="/end_unscheduled_maintenance/<%= check_path_escaped %>" method="post">
41
- <input type="submit" value="End Unscheduled Maintenance (Unacknowledge)" class="button">
42
- </form>
43
- <% end %>
44
46
  <% if @current_scheduled_maintenance %>
45
- <h4>(Scheduled Maintenance - <%= h @current_scheduled_maintenance[:summary] %></h4>
47
+ <% maint_msg = "#{@current_scheduled_maintenance[:summary]}" %>
48
+ <% maint_msg = maint_msg.length > 0 ? maint_msg : 'no summary given' %>
49
+ <h4>In <a href="#scheduled_maintenance_periods">Scheduled Maintenance</a> (<%= h maint_msg %>)</h4>
46
50
  <% start = Time.at(@current_scheduled_maintenance[:start_time]) %>
47
51
  <% finish = Time.at(@current_scheduled_maintenance[:start_time] + @current_scheduled_maintenance[:duration]) %>
48
52
  <% remain = time_period_in_words( (finish - current_time).ceil ) %>
@@ -84,6 +88,7 @@
84
88
  Disabled
85
89
  <% end %>
86
90
 
91
+ <a name="scheduled_maintenance_periods"></a>
87
92
  <h3>Scheduled Maintenance Periods</h3>
88
93
  <% if @scheduled_maintenances && !@scheduled_maintenances.empty? %>
89
94
  <table class="table table-bordered table-hover table-condensed">
@@ -15,6 +15,8 @@
15
15
  <h2><%= h @contact.name %></h2>
16
16
  </div>
17
17
 
18
+ <% alerting = {} %>
19
+
18
20
  <h3>Contact Media</h3>
19
21
  <% if !@contact.media || @contact.media.empty? %>
20
22
  <p>No media</p>
@@ -24,9 +26,11 @@
24
26
  <th>Media</th>
25
27
  <th>Address</th>
26
28
  <th>Interval</th>
27
- <th>Rollup Threshold</th>
29
+ <th>Summary Mode</th>
30
+ <th>Summary Threshold</th>
28
31
  </tr>
29
32
  <% @contact.media.each_pair do |mk, mv| %>
33
+ <% alerting[mk] = @contact.alerting_checks_for_media(mk) %>
30
34
  <tr>
31
35
  <% if 'pagerduty'.eql?(mk) %>
32
36
  <td>PagerDuty</td>
@@ -35,31 +39,69 @@
35
39
  <p><%= 'password'.eql?(pk) ? h("#{pk}: ...") : h("#{pk}: #{pv}") %></p>
36
40
  <% end %>
37
41
  </td>
42
+ <td></td>
43
+ <td></td>
44
+ <td></td>
38
45
  <% else %>
39
46
  <td><%= h mk.capitalize %></td>
40
47
  <td><%= h mv %></td>
48
+ <td>
49
+ <% if @contact.media_intervals[mk] %>
50
+ <%= h @contact.media_intervals[mk] %> seconds
51
+ <% else %>
52
+ no custom interval
53
+ <% end %>
54
+ </td>
55
+ <td>
56
+ <% if alerting[mk].length >= @contact.media_rollup_thresholds[mk].to_i %>
57
+ Yes -
58
+ <% else %>
59
+ No -
60
+ <% end %>
61
+ <%= alerting[mk].length %> alerting
62
+ </td>
63
+ <td>
64
+ <% if @contact.media_rollup_thresholds[mk] %>
65
+ <%= h @contact.media_rollup_thresholds[mk] %>
66
+ <% else %>
67
+ -
68
+ <% end %>
69
+ </td>
41
70
  <% end %>
71
+ </tr>
72
+ <% end %>
73
+ </table>
74
+ <% end %>
42
75
 
76
+ <h3>Alerting Checks</h3>
77
+ <p>These failing checks are currently alerting because they are not acknowledged, not in scheduled maintenance, and currently allowed by this contact's notification rules.</p>
78
+
79
+ <table class="table table-bordered table-hover table-condensed">
80
+ <tr>
81
+ <th>Media</th>
82
+ <th>Alerting Checks</th>
83
+ </tr>
84
+ <% alerting.each_pair do |media, checks| %>
85
+ <% if checks.length > 0 %>
86
+ <tr>
87
+ <td><%= h media.capitalize %></td>
43
88
  <td>
44
- <% if @contact.media_intervals[mk] %>
45
- <%= h @contact.media_intervals[mk] %> seconds
46
- <% else %>
47
- no custom interval
48
- <% end %>
49
- </td>
50
- <td>
51
- <% if @contact.media_rollup_thresholds[mk] %>
52
- <%= h @contact.media_rollup_thresholds[mk] %> failing checks
53
- <% else %>
54
- -
89
+ <% checks.each do |entity_check| %>
90
+ <% entity, check = entity_check.split(':', 2) %>
91
+ <% check_link = "<a href=\"/check?entity=#{u(entity)}&amp;check=#{u(check)}\" title=\"check status\">" +
92
+ h(check) + "</a>"%>
93
+ <a href="/entity/<%= u(entity) %>" title="entity status"><%= h entity %></a> ::
94
+ <%= check_link %> <br />
55
95
  <% end %>
56
96
  </td>
57
97
  </tr>
98
+ <% else %>
99
+ <tr><td colspan="2">No alerting checks, yay!</td></tr>
58
100
  <% end %>
59
- </table>
60
- <% end %>
101
+ <% end %>
102
+ </table>
61
103
 
62
- <h3>Entities and Checks</h3>
104
+ <h3>All Entities and Checks</h3>
63
105
  <% if !@entities_and_checks || @entities_and_checks.empty? %>
64
106
  <p>No entities</p>
65
107
  <% else %>
@@ -77,7 +119,7 @@
77
119
  <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
78
120
  <td>
79
121
  <% checks.each do |check| %>
80
- <a href="/check?entity=<%= u(entity.name) %>&amp;check=<%= u(check) %>" title="check status"><%= h check %></a>
122
+ <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
81
123
  <% end %>
82
124
  </td>
83
125
  </tr>
@@ -15,76 +15,85 @@
15
15
  <h2>Internal Statistics</h2>
16
16
  </div>
17
17
 
18
- <table class="table table-bordered table-hover table-condensed">
19
- <tr>
20
- <td>Events queued:</td>
21
- <td><%= h @events_queued %></td>
22
- </tr>
23
- <tr>
24
- <td>Number of entities:</td>
25
- <td><%= h @count_all_entities %></td>
26
- </tr>
27
- <tr>
28
- <td>Number of failing entities:</td>
29
- <td><%= h @count_failing_entities %></td>
30
- </tr>
31
- <tr>
32
- <td>Number of checks:</td>
33
- <td><%= h @count_all_checks %></td>
34
- </tr>
35
- <tr>
36
- <td>Number of failing checks:</td>
37
- <td><%= h @count_failing_checks %></td>
38
- </tr>
39
- <tr>
40
- <td>Events processed (all time)</td>
41
- <td><%= h @event_counters['all'] %> (ok: <%= h @event_counters['ok'] %>, failure: <%= h @event_counters['failure'] %>, action: <%= h @event_counters['action'] %>)</td>
42
- </tr>
43
- <tr>
44
- <td>Events processed (this instance)</td>
45
- <td><%= h @event_counters_instance['all'] %> (ok: <%= h @event_counters_instance['ok'] %>, failure: <%= h @event_counters_instance['failure'] %>, action: <%= h @event_counters_instance['action'] %>)</td>
46
- </tr>
47
- <tr>
48
- <td>Average rate (this instance)</td>
49
- <td><%= h @event_rate_all %></td> events per second
50
- </tr>
51
- <tr>
52
- <td>Total keys in redis</td>
53
- <td><%= h @dbsize %></td>
54
- </tr>
55
- <tr>
56
- <td>Uptime</td>
57
- <td><%= h @uptime_string %></td>
58
- </tr>
59
- <tr>
60
- <td>Boot Time</td>
61
- <td><%= h @boot_time %></td>
62
- </tr>
63
- <tr>
64
- <td>Current time</td>
65
- <td><%= h Time.now.to_s %></td>
66
- </tr>
67
- </table>
68
-
69
- <h4>Executive Instances:</h4>
70
- <table class="table table-bordered table-hover">
71
- <tr>
72
- <th>Hostname</th>
73
- <th>PID</th>
74
- <th>Started</th>
75
- </tr>
76
- <% @executive_instances.each do |i| %>
77
- <%
78
- hostname, pid = i[0].split(':')
79
- started = "#{relative_time_ago(Time.at(i[1].to_i))} ago"
80
- %>
81
- <tr>
82
- <td><%= h hostname %></td>
83
- <td><%= h pid %></td>
84
- <td><%= h started %></td>
85
- </tr>
86
- <% end %>
87
- </table>
18
+ <div class="row">
19
+ <div class="span5">
20
+ <h4>Global Statistics:</h4>
21
+ <table class="table table-bordered table-hover table-condensed">
22
+ <tr>
23
+ <td>Events queued:</td>
24
+ <td><%= h @events_queued %></td>
25
+ </tr>
26
+ <tr>
27
+ <td>Number of entities:</td>
28
+ <td><%= h @count_all_entities %></td>
29
+ </tr>
30
+ <tr>
31
+ <td>Number of failing entities:</td>
32
+ <td><%= h @count_failing_entities %></td>
33
+ </tr>
34
+ <tr>
35
+ <td>Number of checks:</td>
36
+ <td><%= h @count_all_checks %></td>
37
+ </tr>
38
+ <tr>
39
+ <td>Number of failing checks:</td>
40
+ <td><%= h @count_failing_checks %></td>
41
+ </tr>
42
+ <tr>
43
+ <td>Events processed (all time)</td>
44
+ <td>
45
+ <ul>
46
+ <li>all: <%= h @event_counters['all'] %> events</li>
47
+ <li>ok: <%= h @event_counters['ok'] %> events</li>
48
+ <li>failure: <%= h @event_counters['failure'] %> events</li>
49
+ <li>action: <%= h @event_counters['action'] %> events</li>
50
+ </ul>
51
+ </td>
52
+ </tr>
53
+ <tr>
54
+ <td>Total keys in redis</td>
55
+ <td><%= h @dbsize %></td>
56
+ </tr>
57
+ <tr>
58
+ <td>Current time</td>
59
+ <td><%= h Time.now.to_s %></td>
60
+ </tr>
61
+ </table>
62
+ </div>
63
+ <div class="span7">
64
+ <h4>Processor Instances:</h4>
65
+ <table class="table table-bordered table-hover">
66
+ <tr>
67
+ <th>Hostname</th>
68
+ <th>PID</th>
69
+ <th>Uptime</th>
70
+ <th>Events Processed</th>
71
+ </tr>
72
+ <% @executive_instances.sort_by {|i, d| d['uptime']}.each do |ei| %>
73
+ <%
74
+ instance_id, details = ei
75
+ hostname, pid = instance_id.split(':')
76
+ started = details['uptime_string']
77
+ event_counters = details['event_counters']
78
+ event_rates = details['event_rates']
79
+ %>
80
+ <tr>
81
+ <td><%= h hostname %></td>
82
+ <td><%= h pid %></td>
83
+ <td><%= h started %></td>
84
+ <td>
85
+ <ul>
86
+ <li>all: <%= h event_counters['all'] %> (<%= h event_rates['all'] %> events/s)</li>
87
+ <li>ok: <%= h event_counters['ok'] %> (<%= h event_rates['ok'] %> events/s)</li>
88
+ <li>failure: <%= h event_counters['failure'] %> (<%= h event_rates['failure'] %> events/s)</li>
89
+ <li>action: <%= h event_counters['action'] %> (<%= h event_rates['action'] %> events/s)</li>
90
+ </ul>
91
+ </td>
92
+ </tr>
93
+ <% end %>
94
+ </table>
95
+ </div>
96
+ </div>
88
97
 
89
98
  <p><a href="/self_stats.json">Instrument as JSON</a></p>
90
99
  </div>
@@ -94,4 +103,4 @@
94
103
  <%= foot %>
95
104
  </div>
96
105
  </body>
97
- </html>
106
+ </html>
@@ -92,7 +92,7 @@ module Flapjack
92
92
 
93
93
  timestamp = Time.now
94
94
  event_id = notification.event_id
95
- entity_check = Flapjack::Data::EntityCheck.for_event_id(event_id, :redis => @redis)
95
+ entity_check = Flapjack::Data::EntityCheck.for_event_id(event_id, :redis => @redis, :logger => @logger)
96
96
  contacts = entity_check.contacts
97
97
 
98
98
  if contacts.empty?
@@ -106,13 +106,16 @@ module Flapjack
106
106
 
107
107
  notification_contents = notification.contents
108
108
 
109
+ in_unscheduled_maintenance = entity_check.in_scheduled_maintenance?
110
+ in_scheduled_maintenance = entity_check.in_unscheduled_maintenance?
111
+
109
112
  messages.each do |message|
110
113
  media_type = message.medium
111
114
  address = message.address
112
115
  contents = message.contents.merge(notification_contents)
113
116
 
114
117
  if message.rollup
115
- contents['rollup_alerts'] = message.contact.alerting_checks_for_media(media_type).inject({}) do |memo, alert|
118
+ rollup_alerts = message.contact.alerting_checks_for_media(media_type).inject({}) do |memo, alert|
116
119
  ec = Flapjack::Data::EntityCheck.for_event_id(alert, :redis => @redis)
117
120
  last_change = ec.last_change
118
121
  memo[alert] = {
@@ -121,7 +124,10 @@ module Flapjack
121
124
  }
122
125
  memo
123
126
  end
127
+ contents['rollup_alerts'] = rollup_alerts
128
+
124
129
  contents['rollup_threshold'] = message.contact.rollup_threshold_for_media(media_type)
130
+
125
131
  end
126
132
 
127
133
  @notifylog.info("#{event_id} | " +