flapjack 0.7.28 → 0.7.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +10 -0
  3. data/features/notification_rules.feature +25 -25
  4. data/features/rollup.feature +38 -18
  5. data/features/steps/events_steps.rb +10 -5
  6. data/features/steps/notifications_steps.rb +8 -4
  7. data/lib/flapjack/data/alert.rb +207 -0
  8. data/lib/flapjack/data/contact.rb +14 -7
  9. data/lib/flapjack/data/entity_check.rb +4 -3
  10. data/lib/flapjack/data/notification.rb +28 -27
  11. data/lib/flapjack/gateways/api/contact_methods.rb +32 -12
  12. data/lib/flapjack/gateways/email.rb +49 -53
  13. data/lib/flapjack/gateways/email/alert.html.erb +15 -15
  14. data/lib/flapjack/gateways/email/alert.text.erb +15 -15
  15. data/lib/flapjack/gateways/email/alert_subject.text.erb +3 -13
  16. data/lib/flapjack/gateways/email/rollup.html.erb +6 -6
  17. data/lib/flapjack/gateways/email/rollup.text.erb +7 -7
  18. data/lib/flapjack/gateways/email/rollup_subject.text.erb +1 -19
  19. data/lib/flapjack/gateways/jabber.rb +57 -47
  20. data/lib/flapjack/gateways/jabber/alert.text.erb +12 -0
  21. data/lib/flapjack/gateways/jabber/rollup.text.erb +2 -0
  22. data/lib/flapjack/gateways/pagerduty.rb +60 -30
  23. data/lib/flapjack/gateways/pagerduty/alert.text.erb +10 -0
  24. data/lib/flapjack/gateways/sms_messagenet.rb +29 -36
  25. data/lib/flapjack/gateways/sms_messagenet/alert.text.erb +4 -14
  26. data/lib/flapjack/gateways/sms_messagenet/rollup.text.erb +2 -34
  27. data/lib/flapjack/gateways/web.rb +23 -14
  28. data/lib/flapjack/gateways/web/views/check.html.erb +16 -11
  29. data/lib/flapjack/gateways/web/views/contact.html.erb +58 -16
  30. data/lib/flapjack/gateways/web/views/self_stats.html.erb +80 -71
  31. data/lib/flapjack/notifier.rb +8 -2
  32. data/lib/flapjack/pikelet.rb +17 -3
  33. data/lib/flapjack/processor.rb +0 -1
  34. data/lib/flapjack/redis_pool.rb +1 -1
  35. data/lib/flapjack/utility.rb +13 -0
  36. data/lib/flapjack/version.rb +1 -1
  37. data/spec/lib/flapjack/data/contact_spec.rb +44 -29
  38. data/spec/lib/flapjack/gateways/api/contact_methods_spec.rb +24 -4
  39. data/spec/lib/flapjack/gateways/email_spec.rb +0 -5
  40. data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -1
  41. data/spec/lib/flapjack/gateways/pagerduty_spec.rb +5 -2
  42. data/spec/lib/flapjack/gateways/{sms_messagenet.spec.rb → sms_messagenet_spec.rb} +16 -12
  43. data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
  44. data/spec/spec_helper.rb +28 -6
  45. metadata +43 -89
@@ -1,15 +1,5 @@
1
- <% case @notification_type -%>
2
- <% when "problem" -%>
3
- <%= "Problem: " -%>
4
- <% when "recovery" -%>
5
- <%= "Recovery: " -%>
6
- <% when "acknowledgement" -%>
7
- <%= "Ack: " -%>
8
- <% when "test" -%>
9
- <%= "Test notification: " -%>
1
+ <%= @alert.type_sentence_case %>: '<%= @alert.check %>' on <%= @alert.entity -%>
2
+ <% unless ['acknowledgement', 'test'].include?(@alert.notification_type) -%>
3
+ is <%= @alert.state_title_case -%>
10
4
  <% end -%>
11
- '<%= @check %>' on <%= @entity_name -%>
12
- <% unless ['acknowledgement', 'test'].include?(@notification_type) -%>
13
- is <%= ['ok'].include?(@state) ? @state.upcase : @state.titleize -%>
14
- <% end -%>
15
- at <%= Time.at(@time).strftime('%-d %b %H:%M') %>, <%= @summary -%>
5
+ at <%= Time.at(@alert.time).strftime('%-d %b %H:%M') %>, <%= @alert.summary -%>
@@ -1,34 +1,2 @@
1
- <%
2
- state_counts = @rollup_alerts.inject({}) do |memo, alert|
3
- memo[alert[1]['state']] = (memo[alert[1]['state']] || 0) + 1
4
- memo
5
- end
6
-
7
- states_summary = ['critical', 'warning', 'unknown'].inject([]) do |memo, state|
8
- next memo unless state_counts[state]
9
- memo << "#{state.titleize}: #{state_counts[state]}"
10
- memo
11
- end.join(', ')
12
-
13
- states_detail = ['critical', 'warning', 'unknown'].inject([]) do |memo, state|
14
-
15
- alerts = @rollup_alerts.find_all {|alert| alert[1]['state'] == state}
16
- next memo if alerts.to_a.empty?
17
-
18
- memo << "#{state.titleize}: " + alerts.inject([]) do |ret, alert|
19
- entity, check = alert[0].split(':', 2)
20
- ret << "'#{check}' on #{entity}"
21
- ret
22
- end.join(', ')
23
-
24
- memo
25
- end.join('; ')
26
-
27
- -%>
28
- <% case @rollup -%>
29
- <% when "problem" -%>
30
- <%= "Problem summary: " -%>
31
- <% when "recovery" -%>
32
- <%= "Problem summaries finishing: " -%>
33
- <% end -%>
34
- <%= states_summary %> (<%= states_detail -%>)
1
+ <%= @alert.type_sentence_case %>: <%= @alert.rollup_states_summary -%>
2
+ (<%= @alert.rollup_states_detail_text(:max_checks_per_state => 3) -%>)
@@ -371,20 +371,29 @@ module Flapjack
371
371
  def self_stats
372
372
  @fqdn = `/bin/hostname -f`.chomp
373
373
  @pid = Process.pid
374
- @instance_id = "#{@fqdn}:#{@pid}"
375
-
376
- @dbsize = redis.dbsize
377
- @executive_instances = redis.keys("executive_instance:*").map {|i|
378
- [ i.match(/executive_instance:(.*)/)[1], redis.hget(i, 'boot_time').to_i ]
379
- }.sort {|a, b| b[1] <=> a[1]}
380
- @event_counters = redis.hgetall('event_counters')
381
- @event_counters_instance = redis.hgetall("event_counters:#{@instance_id}")
382
- @boot_time = Time.at(redis.hget("executive_instance:#{@instance_id}", 'boot_time').to_i)
383
- @uptime = Time.now.to_i - @boot_time.to_i
384
- @uptime_string = time_period_in_words(@uptime)
385
- @event_rate_all = (@uptime > 0) ?
386
- (@event_counters_instance['all'].to_f / @uptime) : 0
387
- @events_queued = redis.llen('events')
374
+
375
+ @dbsize = redis.dbsize
376
+ @executive_instances = redis.keys("executive_instance:*").inject({}) do |memo, i|
377
+ instance_id = i.match(/executive_instance:(.*)/)[1]
378
+ boot_time = redis.hget(i, 'boot_time').to_i
379
+ uptime = Time.now.to_i - boot_time
380
+ uptime_string = ChronicDuration.output(uptime, :format => :short, :keep_zero => true, :units => 2)
381
+ event_counters = redis.hgetall("event_counters:#{instance_id}")
382
+ event_rates = event_counters.inject({}) do |er, ec|
383
+ er[ec[0]] = uptime && uptime > 0 ? (ec[1].to_f / uptime).round : nil
384
+ er
385
+ end
386
+ memo[instance_id] = {
387
+ 'boot_time' => boot_time,
388
+ 'uptime' => uptime,
389
+ 'uptime_string' => uptime_string,
390
+ 'event_counters' => event_counters,
391
+ 'event_rates' => event_rates
392
+ }
393
+ memo
394
+ end
395
+ @event_counters = redis.hgetall('event_counters')
396
+ @events_queued = redis.llen('events')
388
397
  end
389
398
 
390
399
  def entity_stats
@@ -19,6 +19,18 @@
19
19
  </div>
20
20
  <% state_qualifier = @check_enabled ? '' : "DISABLED. Last " %>
21
21
  <h3><%= state_qualifier %>State: <%= @check_state ? h(@check_state.upcase) : '' %></h3>
22
+ <% if @current_unscheduled_maintenance %>
23
+ <% ack_msg = "#{@current_unscheduled_maintenance[:summary]}" %>
24
+ <% ack_msg = ack_msg.length > 0 ? ack_msg : 'no summary given' %>
25
+ <h4>Acknowledged (<%= h ack_msg %>)</h4>
26
+ <% start = Time.at(@current_unscheduled_maintenance[:start_time]) %>
27
+ <% finish = Time.at(@current_unscheduled_maintenance[:start_time] + @current_unscheduled_maintenance[:duration]) %>
28
+ <% remain = time_period_in_words( (finish - current_time).ceil ) %>
29
+ <p><%= h start.to_s %> -&gt; <%= h finish.to_s %> (<%= h remain %> remaining)</p>
30
+ <form action="/end_unscheduled_maintenance/<%= check_path_escaped %>" method="post">
31
+ <input type="submit" value="End Unscheduled Maintenance (Unacknowledge)" class="button">
32
+ </form>
33
+ <% end %>
22
34
  <% if (['warning', 'critical', 'unknown'].include?(@check_state) and !@current_scheduled_maintenance) %>
23
35
  <form action="/acknowledgements/<%= check_path_escaped %>" method="post" class="form-inline">
24
36
  <input type="hidden" name="acknowledgement_id" value="<%= @acknowledgement_id %>">
@@ -31,18 +43,10 @@
31
43
  e.g. "5 hours"
32
44
  </form>
33
45
  <% end %>
34
- <% if @current_unscheduled_maintenance %>
35
- <h3>(Acknowledged - <%= h @current_unscheduled_maintenance[:summary] %>)</h3>
36
- <% start = Time.at(@current_unscheduled_maintenance[:start_time]) %>
37
- <% finish = Time.at(@current_unscheduled_maintenance[:start_time] + @current_unscheduled_maintenance[:duration]) %>
38
- <% remain = time_period_in_words( (finish - current_time).ceil ) %>
39
- <p><%= h start.to_s %> -&gt; <%= h finish.to_s %> (<%= h remain %> remaining)</p>
40
- <form action="/end_unscheduled_maintenance/<%= check_path_escaped %>" method="post">
41
- <input type="submit" value="End Unscheduled Maintenance (Unacknowledge)" class="button">
42
- </form>
43
- <% end %>
44
46
  <% if @current_scheduled_maintenance %>
45
- <h4>(Scheduled Maintenance - <%= h @current_scheduled_maintenance[:summary] %></h4>
47
+ <% maint_msg = "#{@current_scheduled_maintenance[:summary]}" %>
48
+ <% maint_msg = maint_msg.length > 0 ? maint_msg : 'no summary given' %>
49
+ <h4>In <a href="#scheduled_maintenance_periods">Scheduled Maintenance</a> (<%= h maint_msg %>)</h4>
46
50
  <% start = Time.at(@current_scheduled_maintenance[:start_time]) %>
47
51
  <% finish = Time.at(@current_scheduled_maintenance[:start_time] + @current_scheduled_maintenance[:duration]) %>
48
52
  <% remain = time_period_in_words( (finish - current_time).ceil ) %>
@@ -84,6 +88,7 @@
84
88
  Disabled
85
89
  <% end %>
86
90
 
91
+ <a name="scheduled_maintenance_periods"></a>
87
92
  <h3>Scheduled Maintenance Periods</h3>
88
93
  <% if @scheduled_maintenances && !@scheduled_maintenances.empty? %>
89
94
  <table class="table table-bordered table-hover table-condensed">
@@ -15,6 +15,8 @@
15
15
  <h2><%= h @contact.name %></h2>
16
16
  </div>
17
17
 
18
+ <% alerting = {} %>
19
+
18
20
  <h3>Contact Media</h3>
19
21
  <% if !@contact.media || @contact.media.empty? %>
20
22
  <p>No media</p>
@@ -24,9 +26,11 @@
24
26
  <th>Media</th>
25
27
  <th>Address</th>
26
28
  <th>Interval</th>
27
- <th>Rollup Threshold</th>
29
+ <th>Summary Mode</th>
30
+ <th>Summary Threshold</th>
28
31
  </tr>
29
32
  <% @contact.media.each_pair do |mk, mv| %>
33
+ <% alerting[mk] = @contact.alerting_checks_for_media(mk) %>
30
34
  <tr>
31
35
  <% if 'pagerduty'.eql?(mk) %>
32
36
  <td>PagerDuty</td>
@@ -35,31 +39,69 @@
35
39
  <p><%= 'password'.eql?(pk) ? h("#{pk}: ...") : h("#{pk}: #{pv}") %></p>
36
40
  <% end %>
37
41
  </td>
42
+ <td></td>
43
+ <td></td>
44
+ <td></td>
38
45
  <% else %>
39
46
  <td><%= h mk.capitalize %></td>
40
47
  <td><%= h mv %></td>
48
+ <td>
49
+ <% if @contact.media_intervals[mk] %>
50
+ <%= h @contact.media_intervals[mk] %> seconds
51
+ <% else %>
52
+ no custom interval
53
+ <% end %>
54
+ </td>
55
+ <td>
56
+ <% if alerting[mk].length >= @contact.media_rollup_thresholds[mk].to_i %>
57
+ Yes -
58
+ <% else %>
59
+ No -
60
+ <% end %>
61
+ <%= alerting[mk].length %> alerting
62
+ </td>
63
+ <td>
64
+ <% if @contact.media_rollup_thresholds[mk] %>
65
+ <%= h @contact.media_rollup_thresholds[mk] %>
66
+ <% else %>
67
+ -
68
+ <% end %>
69
+ </td>
41
70
  <% end %>
71
+ </tr>
72
+ <% end %>
73
+ </table>
74
+ <% end %>
42
75
 
76
+ <h3>Alerting Checks</h3>
77
+ <p>These failing checks are currently alerting because they are not acknowledged, not in scheduled maintenance, and currently allowed by this contact's notification rules.</p>
78
+
79
+ <table class="table table-bordered table-hover table-condensed">
80
+ <tr>
81
+ <th>Media</th>
82
+ <th>Alerting Checks</th>
83
+ </tr>
84
+ <% alerting.each_pair do |media, checks| %>
85
+ <% if checks.length > 0 %>
86
+ <tr>
87
+ <td><%= h media.capitalize %></td>
43
88
  <td>
44
- <% if @contact.media_intervals[mk] %>
45
- <%= h @contact.media_intervals[mk] %> seconds
46
- <% else %>
47
- no custom interval
48
- <% end %>
49
- </td>
50
- <td>
51
- <% if @contact.media_rollup_thresholds[mk] %>
52
- <%= h @contact.media_rollup_thresholds[mk] %> failing checks
53
- <% else %>
54
- -
89
+ <% checks.each do |entity_check| %>
90
+ <% entity, check = entity_check.split(':', 2) %>
91
+ <% check_link = "<a href=\"/check?entity=#{u(entity)}&amp;check=#{u(check)}\" title=\"check status\">" +
92
+ h(check) + "</a>"%>
93
+ <a href="/entity/<%= u(entity) %>" title="entity status"><%= h entity %></a> ::
94
+ <%= check_link %> <br />
55
95
  <% end %>
56
96
  </td>
57
97
  </tr>
98
+ <% else %>
99
+ <tr><td colspan="2">No alerting checks, yay!</td></tr>
58
100
  <% end %>
59
- </table>
60
- <% end %>
101
+ <% end %>
102
+ </table>
61
103
 
62
- <h3>Entities and Checks</h3>
104
+ <h3>All Entities and Checks</h3>
63
105
  <% if !@entities_and_checks || @entities_and_checks.empty? %>
64
106
  <p>No entities</p>
65
107
  <% else %>
@@ -77,7 +119,7 @@
77
119
  <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
78
120
  <td>
79
121
  <% checks.each do |check| %>
80
- <a href="/check?entity=<%= u(entity.name) %>&amp;check=<%= u(check) %>" title="check status"><%= h check %></a>
122
+ <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
81
123
  <% end %>
82
124
  </td>
83
125
  </tr>
@@ -15,76 +15,85 @@
15
15
  <h2>Internal Statistics</h2>
16
16
  </div>
17
17
 
18
- <table class="table table-bordered table-hover table-condensed">
19
- <tr>
20
- <td>Events queued:</td>
21
- <td><%= h @events_queued %></td>
22
- </tr>
23
- <tr>
24
- <td>Number of entities:</td>
25
- <td><%= h @count_all_entities %></td>
26
- </tr>
27
- <tr>
28
- <td>Number of failing entities:</td>
29
- <td><%= h @count_failing_entities %></td>
30
- </tr>
31
- <tr>
32
- <td>Number of checks:</td>
33
- <td><%= h @count_all_checks %></td>
34
- </tr>
35
- <tr>
36
- <td>Number of failing checks:</td>
37
- <td><%= h @count_failing_checks %></td>
38
- </tr>
39
- <tr>
40
- <td>Events processed (all time)</td>
41
- <td><%= h @event_counters['all'] %> (ok: <%= h @event_counters['ok'] %>, failure: <%= h @event_counters['failure'] %>, action: <%= h @event_counters['action'] %>)</td>
42
- </tr>
43
- <tr>
44
- <td>Events processed (this instance)</td>
45
- <td><%= h @event_counters_instance['all'] %> (ok: <%= h @event_counters_instance['ok'] %>, failure: <%= h @event_counters_instance['failure'] %>, action: <%= h @event_counters_instance['action'] %>)</td>
46
- </tr>
47
- <tr>
48
- <td>Average rate (this instance)</td>
49
- <td><%= h @event_rate_all %></td> events per second
50
- </tr>
51
- <tr>
52
- <td>Total keys in redis</td>
53
- <td><%= h @dbsize %></td>
54
- </tr>
55
- <tr>
56
- <td>Uptime</td>
57
- <td><%= h @uptime_string %></td>
58
- </tr>
59
- <tr>
60
- <td>Boot Time</td>
61
- <td><%= h @boot_time %></td>
62
- </tr>
63
- <tr>
64
- <td>Current time</td>
65
- <td><%= h Time.now.to_s %></td>
66
- </tr>
67
- </table>
68
-
69
- <h4>Executive Instances:</h4>
70
- <table class="table table-bordered table-hover">
71
- <tr>
72
- <th>Hostname</th>
73
- <th>PID</th>
74
- <th>Started</th>
75
- </tr>
76
- <% @executive_instances.each do |i| %>
77
- <%
78
- hostname, pid = i[0].split(':')
79
- started = "#{relative_time_ago(Time.at(i[1].to_i))} ago"
80
- %>
81
- <tr>
82
- <td><%= h hostname %></td>
83
- <td><%= h pid %></td>
84
- <td><%= h started %></td>
85
- </tr>
86
- <% end %>
87
- </table>
18
+ <div class="row">
19
+ <div class="span5">
20
+ <h4>Global Statistics:</h4>
21
+ <table class="table table-bordered table-hover table-condensed">
22
+ <tr>
23
+ <td>Events queued:</td>
24
+ <td><%= h @events_queued %></td>
25
+ </tr>
26
+ <tr>
27
+ <td>Number of entities:</td>
28
+ <td><%= h @count_all_entities %></td>
29
+ </tr>
30
+ <tr>
31
+ <td>Number of failing entities:</td>
32
+ <td><%= h @count_failing_entities %></td>
33
+ </tr>
34
+ <tr>
35
+ <td>Number of checks:</td>
36
+ <td><%= h @count_all_checks %></td>
37
+ </tr>
38
+ <tr>
39
+ <td>Number of failing checks:</td>
40
+ <td><%= h @count_failing_checks %></td>
41
+ </tr>
42
+ <tr>
43
+ <td>Events processed (all time)</td>
44
+ <td>
45
+ <ul>
46
+ <li>all: <%= h @event_counters['all'] %> events</li>
47
+ <li>ok: <%= h @event_counters['ok'] %> events</li>
48
+ <li>failure: <%= h @event_counters['failure'] %> events</li>
49
+ <li>action: <%= h @event_counters['action'] %> events</li>
50
+ </ul>
51
+ </td>
52
+ </tr>
53
+ <tr>
54
+ <td>Total keys in redis</td>
55
+ <td><%= h @dbsize %></td>
56
+ </tr>
57
+ <tr>
58
+ <td>Current time</td>
59
+ <td><%= h Time.now.to_s %></td>
60
+ </tr>
61
+ </table>
62
+ </div>
63
+ <div class="span7">
64
+ <h4>Processor Instances:</h4>
65
+ <table class="table table-bordered table-hover">
66
+ <tr>
67
+ <th>Hostname</th>
68
+ <th>PID</th>
69
+ <th>Uptime</th>
70
+ <th>Events Processed</th>
71
+ </tr>
72
+ <% @executive_instances.sort_by {|i, d| d['uptime']}.each do |ei| %>
73
+ <%
74
+ instance_id, details = ei
75
+ hostname, pid = instance_id.split(':')
76
+ started = details['uptime_string']
77
+ event_counters = details['event_counters']
78
+ event_rates = details['event_rates']
79
+ %>
80
+ <tr>
81
+ <td><%= h hostname %></td>
82
+ <td><%= h pid %></td>
83
+ <td><%= h started %></td>
84
+ <td>
85
+ <ul>
86
+ <li>all: <%= h event_counters['all'] %> (<%= h event_rates['all'] %> events/s)</li>
87
+ <li>ok: <%= h event_counters['ok'] %> (<%= h event_rates['ok'] %> events/s)</li>
88
+ <li>failure: <%= h event_counters['failure'] %> (<%= h event_rates['failure'] %> events/s)</li>
89
+ <li>action: <%= h event_counters['action'] %> (<%= h event_rates['action'] %> events/s)</li>
90
+ </ul>
91
+ </td>
92
+ </tr>
93
+ <% end %>
94
+ </table>
95
+ </div>
96
+ </div>
88
97
 
89
98
  <p><a href="/self_stats.json">Instrument as JSON</a></p>
90
99
  </div>
@@ -94,4 +103,4 @@
94
103
  <%= foot %>
95
104
  </div>
96
105
  </body>
97
- </html>
106
+ </html>
@@ -92,7 +92,7 @@ module Flapjack
92
92
 
93
93
  timestamp = Time.now
94
94
  event_id = notification.event_id
95
- entity_check = Flapjack::Data::EntityCheck.for_event_id(event_id, :redis => @redis)
95
+ entity_check = Flapjack::Data::EntityCheck.for_event_id(event_id, :redis => @redis, :logger => @logger)
96
96
  contacts = entity_check.contacts
97
97
 
98
98
  if contacts.empty?
@@ -106,13 +106,16 @@ module Flapjack
106
106
 
107
107
  notification_contents = notification.contents
108
108
 
109
+ in_unscheduled_maintenance = entity_check.in_scheduled_maintenance?
110
+ in_scheduled_maintenance = entity_check.in_unscheduled_maintenance?
111
+
109
112
  messages.each do |message|
110
113
  media_type = message.medium
111
114
  address = message.address
112
115
  contents = message.contents.merge(notification_contents)
113
116
 
114
117
  if message.rollup
115
- contents['rollup_alerts'] = message.contact.alerting_checks_for_media(media_type).inject({}) do |memo, alert|
118
+ rollup_alerts = message.contact.alerting_checks_for_media(media_type).inject({}) do |memo, alert|
116
119
  ec = Flapjack::Data::EntityCheck.for_event_id(alert, :redis => @redis)
117
120
  last_change = ec.last_change
118
121
  memo[alert] = {
@@ -121,7 +124,10 @@ module Flapjack
121
124
  }
122
125
  memo
123
126
  end
127
+ contents['rollup_alerts'] = rollup_alerts
128
+
124
129
  contents['rollup_threshold'] = message.contact.rollup_threshold_for_media(media_type)
130
+
125
131
  end
126
132
 
127
133
  @notifylog.info("#{event_id} | " +