flapjack 0.7.14 → 0.7.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/CHANGELOG.md +10 -0
  2. data/etc/flapjack_config.yaml.example +1 -0
  3. data/features/events.feature +5 -0
  4. data/features/notification_rules.feature +1 -1
  5. data/features/steps/events_steps.rb +28 -13
  6. data/features/steps/notifications_steps.rb +1 -1
  7. data/lib/flapjack/coordinator.rb +3 -1
  8. data/lib/flapjack/data/contact.rb +8 -6
  9. data/lib/flapjack/data/entity_check.rb +78 -113
  10. data/lib/flapjack/data/event.rb +54 -65
  11. data/lib/flapjack/data/notification.rb +5 -1
  12. data/lib/flapjack/executive.rb +42 -38
  13. data/lib/flapjack/filters/acknowledgement.rb +5 -5
  14. data/lib/flapjack/filters/base.rb +2 -2
  15. data/lib/flapjack/filters/delays.rb +11 -11
  16. data/lib/flapjack/filters/detect_mass_client_failures.rb +8 -8
  17. data/lib/flapjack/filters/ok.rb +6 -6
  18. data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
  19. data/lib/flapjack/filters/unscheduled_maintenance.rb +3 -2
  20. data/lib/flapjack/gateways/api.rb +374 -277
  21. data/lib/flapjack/gateways/api/entity_check_presenter.rb +52 -21
  22. data/lib/flapjack/gateways/api/entity_presenter.rb +14 -9
  23. data/lib/flapjack/gateways/email.rb +7 -0
  24. data/lib/flapjack/gateways/email/alert.html.haml +13 -1
  25. data/lib/flapjack/gateways/email/alert.text.erb +5 -4
  26. data/lib/flapjack/gateways/jabber.rb +90 -34
  27. data/lib/flapjack/gateways/pagerduty.rb +6 -2
  28. data/lib/flapjack/gateways/web.rb +13 -8
  29. data/lib/flapjack/gateways/web/views/check.haml +70 -45
  30. data/lib/flapjack/gateways/web/views/checks.haml +1 -1
  31. data/lib/flapjack/gateways/web/views/entity.haml +1 -1
  32. data/lib/flapjack/patches.rb +9 -2
  33. data/lib/flapjack/pikelet.rb +14 -10
  34. data/lib/flapjack/utility.rb +10 -4
  35. data/lib/flapjack/version.rb +1 -1
  36. data/spec/lib/flapjack/coordinator_spec.rb +19 -5
  37. data/spec/lib/flapjack/data/entity_check_spec.rb +3 -30
  38. data/spec/lib/flapjack/data/event_spec.rb +96 -1
  39. data/spec/lib/flapjack/executive_spec.rb +5 -11
  40. data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +22 -3
  41. data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +30 -15
  42. data/spec/lib/flapjack/gateways/api_spec.rb +552 -186
  43. data/spec/lib/flapjack/gateways/email_spec.rb +2 -0
  44. data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -4
  45. data/spec/lib/flapjack/gateways/pagerduty_spec.rb +3 -2
  46. data/spec/lib/flapjack/gateways/web_spec.rb +17 -12
  47. data/spec/lib/flapjack/pikelet_spec.rb +5 -2
  48. metadata +4 -5
  49. data/config.ru +0 -11
@@ -18,6 +18,19 @@ module Flapjack
18
18
  @entity_check = entity_check
19
19
  end
20
20
 
21
# Builds a snapshot of the wrapped check's current status.
#
# Returns a Hash with String keys:
#   'name', 'state', 'summary', 'details', 'last_update' - read straight
#     from the corresponding methods on @entity_check
#   'in_unscheduled_maintenance', 'in_scheduled_maintenance' - results of
#     the matching predicate methods on @entity_check
#   'last_problem_notification', 'last_recovery_notification',
#   'last_acknowledgement_notification' - the :timestamp entry of
#     last_notification_for_state for :problem, :recovery and
#     :acknowledgement respectively
def status
  # Every lookup goes through @entity_check. A bare `entity_check` is not a
  # local or parameter of this method, so it would only resolve if an
  # accessor of that name happens to exist on the class.
  # NOTE(review): assumes last_notification_for_state always returns
  # something indexable by :timestamp -- confirm against EntityCheck.
  {'name' => @entity_check.check,
   'state' => @entity_check.state,
   'summary' => @entity_check.summary,
   'details' => @entity_check.details,
   'in_unscheduled_maintenance' => @entity_check.in_unscheduled_maintenance?,
   'in_scheduled_maintenance' => @entity_check.in_scheduled_maintenance?,
   'last_update' => @entity_check.last_update,
   'last_problem_notification' => @entity_check.last_notification_for_state(:problem)[:timestamp],
   'last_recovery_notification' => @entity_check.last_notification_for_state(:recovery)[:timestamp],
   'last_acknowledgement_notification' => @entity_check.last_notification_for_state(:acknowledgement)[:timestamp]}
end
33
+
21
34
  def outages(start_time, end_time, options = {})
22
35
  # hist_states is an array of hashes, with [state, timestamp, summary] keys
23
36
  hist_states = @entity_check.historical_states(start_time, end_time)
@@ -26,32 +39,50 @@ module Flapjack
26
39
  initial = @entity_check.historical_state_before(hist_states.first[:timestamp])
27
40
  hist_states.unshift(initial) if initial
28
41
 
42
+ # TODO the following works, but isn't the neatest
29
43
  num_states = hist_states.size
30
44
 
31
- hist_states.each_with_index do |obj, index|
32
- ts = obj.delete(:timestamp)
33
- if index == (num_states - 1)
34
- # last (even if the only one)
35
- obj[:start_time] = start_time ? [ts, start_time].max : ts
36
- obj[:end_time] = end_time
37
- elsif (index == 0)
38
- # initial
39
- obj[:start_time] = start_time ? [ts, start_time].max : ts
40
- obj[:end_time] = hist_states[index + 1][:timestamp]
41
- else
42
- # except for first and last
43
- obj[:start_time] = ts
44
- obj[:end_time] = hist_states[index + 1][:timestamp]
45
+ index = 0
46
+ result = []
47
+ obj = nil
48
+
49
+ while index < num_states do
50
+ last_obj = obj
51
+ obj = hist_states[index]
52
+ index += 1
53
+
54
+ if last_obj && (last_obj[:state] == obj[:state])
55
+ # TODO maybe build up arrays of these instead, and leave calling
56
+ # classes to join them together if needed?
57
+ result.last[:summary] << " / #{obj[:summary]}"
58
+ result.last[:details] << " / #{obj[:details]}"
59
+ next
45
60
  end
46
- obj[:duration] = obj[:end_time] ? (obj[:end_time] - obj[:start_time]) : nil
47
- end
48
61
 
49
- # p hist_states
62
+ next if obj[:state] == 'ok'
63
+
64
+ ts = obj[:timestamp]
65
+
66
+ obj_st = (last_obj || !start_time) ? ts : [ts, start_time].max
67
+
68
+ next_ts_obj = hist_states[index..-1].detect {|hs| hs[:state] != obj[:state] }
69
+ obj_et = next_ts_obj ? next_ts_obj[:timestamp] : end_time
70
+
71
+ obj_dur = obj_et ? obj_et - obj_st : nil
72
+
73
+ result << {:state => obj[:state],
74
+ :start_time => obj_st,
75
+ :end_time => obj_et,
76
+ :duration => obj_dur,
77
+ :summary => obj[:summary] || '',
78
+ :details => obj[:details] || ''
79
+ }
80
+ end
50
81
 
51
- hist_states.reject {|obj| obj[:state] == 'ok'}
82
+ result
52
83
  end
53
84
 
54
- def unscheduled_maintenance(start_time, end_time)
85
+ def unscheduled_maintenances(start_time, end_time)
55
86
  # unsched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
56
87
  unsched_maintenance = @entity_check.maintenances(start_time, end_time,
57
88
  :scheduled => false)
@@ -66,7 +97,7 @@ module Flapjack
66
97
  start_in_unsched + unsched_maintenance
67
98
  end
68
99
 
69
- def scheduled_maintenance(start_time, end_time)
100
+ def scheduled_maintenances(start_time, end_time)
70
101
  # sched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
71
102
  sched_maintenance = @entity_check.maintenances(start_time, end_time,
72
103
  :scheduled => true)
@@ -87,7 +118,7 @@ module Flapjack
87
118
  #
88
119
  # TODO test performance with larger data sets
89
120
  def downtime(start_time, end_time)
90
- sched_maintenances = scheduled_maintenance(start_time, end_time)
121
+ sched_maintenances = scheduled_maintenances(start_time, end_time)
91
122
 
92
123
  outs = outages(start_time, end_time)
93
124
 
@@ -22,29 +22,34 @@ module Flapjack
22
22
  @redis = options[:redis]
23
23
  end
24
24
 
25
# Collects the current status for each element of +checks+.
#
# Returns an Array with one Hash per element, carrying @entity under
# :entity, the element itself under :check, and the result of calling
# #status on that element's check presenter under :status.
def status
  checks.map do |check_name|
    presenter = check_presenter(check_name)
    {:entity => @entity, :check => check_name, :status => presenter.status}
  end
end
29
+
25
30
  def outages(start_time, end_time)
26
31
  checks.collect {|c|
27
- {:check => c, :outages => check_presenter(c).outages(start_time, end_time)}
32
+ {:entity => @entity, :check => c, :outages => check_presenter(c).outages(start_time, end_time)}
28
33
  }
29
34
  end
30
35
 
31
- def unscheduled_maintenance(start_time, end_time)
36
+ def unscheduled_maintenances(start_time, end_time)
32
37
  checks.collect {|c|
33
- {:check => c, :unscheduled_maintenance =>
34
- check_presenter(c).unscheduled_maintenance(start_time, end_time)}
38
+ {:entity => @entity, :check => c, :unscheduled_maintenances =>
39
+ check_presenter(c).unscheduled_maintenances(start_time, end_time)}
35
40
  }
36
41
  end
37
42
 
38
- def scheduled_maintenance(start_time, end_time)
43
+ def scheduled_maintenances(start_time, end_time)
39
44
  checks.collect {|c|
40
- {:check => c, :scheduled_maintenance =>
41
- check_presenter(c).scheduled_maintenance(start_time, end_time)}
45
+ {:entity => @entity, :check => c, :scheduled_maintenances =>
46
+ check_presenter(c).scheduled_maintenances(start_time, end_time)}
42
47
  }
43
48
  end
44
49
 
45
50
  def downtime(start_time, end_time)
46
51
  checks.collect {|c|
47
- {:check => c, :downtime =>
52
+ {:entity => @entity, :check => c, :downtime =>
48
53
  check_presenter(c).downtime(start_time, end_time)}
49
54
  }
50
55
  end
@@ -52,7 +57,7 @@ module Flapjack
52
57
  private
53
58
 
54
59
  def checks
55
- @check_list ||= @entity.check_list
60
+ @check_list ||= @entity.check_list.sort
56
61
  end
57
62
 
58
63
  def check_presenter(check)
@@ -8,6 +8,8 @@ require 'socket'
8
8
  require 'em-synchrony'
9
9
  require 'em/protocols/smtpclient'
10
10
 
11
+ require 'flapjack/utility'
12
+
11
13
  require 'flapjack/data/entity_check'
12
14
 
13
15
  module Flapjack
@@ -17,6 +19,8 @@ module Flapjack
17
19
 
18
20
  class << self
19
21
 
22
+ include Flapjack::Utility
23
+
20
24
  def start
21
25
  @logger.info("starting")
22
26
  @logger.debug("new email gateway pikelet with the following options: #{@config.inspect}")
@@ -32,8 +36,11 @@ module Flapjack
32
36
  @contact_last_name = notification['contact_last_name']
33
37
  @state = notification['state']
34
38
  @summary = notification['summary']
39
+ @last_state = notification['last_state']
40
+ @last_summary = notification['last_summary']
35
41
  @details = notification['details']
36
42
  @time = notification['time']
43
+ @relative = relative_time_ago(Time.at(@time))
37
44
  @entity_name, @check = notification['event_id'].split(':', 2)
38
45
 
39
46
  entity_check = Flapjack::Data::EntityCheck.for_event_id(notification['event_id'],
@@ -41,7 +41,19 @@
41
41
  %tr
42
42
  %td
43
43
  %strong Time
44
- %td= Time.at(@time.to_i).to_s
44
+ %td
45
+ = Time.at(@time.to_i).to_s
46
+ ( #{@relative} ago)
47
+
48
+ - if @last_state
49
+ %tr
50
+ %td Previous state
51
+ %td= @last_state
52
+
53
+ - if @last_summary
54
+ %tr
55
+ %td Previous summary
56
+ %td= @last_summary
45
57
 
46
58
  %p Cheers,
47
59
  %p Flapjack
@@ -2,11 +2,12 @@ Hi <%= @contact_first_name %>,
2
2
 
3
3
  Monitoring has detected the following:
4
4
 
5
- Entity: <%= @entity_name %>
6
- Check: <%= @check %>
7
- State: <%= @state %>
5
+ Entity: <%= @entity_name %>
6
+ Check: <%= @check %>
7
+ State: <%= @state %>
8
8
  Summary: <%= @summary %>
9
- Time: <%= Time.at(@time.to_i).to_s %>
9
+ Time: <%= Time.at(@time.to_i).to_s %> (<%= @relative %> ago)
10
+ <%= @last_state ? "\nPrevious state: #{@last_state}" : '' %><%= @last_summary ? "\nPrevious summary: #{@last_summary}" : '' %>
10
11
 
11
12
  Cheers,
12
13
  Flapjack
@@ -36,6 +36,8 @@ module Flapjack
36
36
  def initialize(opts = {})
37
37
  @config = opts[:config]
38
38
  @redis_config = opts[:redis_config]
39
+ @boot_time = opts[:boot_time]
40
+
39
41
  @redis = Flapjack::RedisPool.new(:config => @redis_config, :size => 2) # first will block
40
42
 
41
43
  @logger = opts[:logger]
@@ -146,56 +148,110 @@ module Flapjack
146
148
  error = "unknown entity" if entity_check.nil?
147
149
  end
148
150
 
149
- if entity_check && entity_check.in_unscheduled_maintenance?
150
- error = "#{event_id} is already acknowledged"
151
- end
152
-
153
151
  if error
154
152
  msg = "ERROR - couldn't ACK #{ackid} - #{error}"
155
153
  else
156
- msg = "ACKing #{entity_check.check} on #{entity_check.entity_name} (#{ackid})"
154
+ entity_name, check = event_id.split(':', 2)
155
+
156
+ if entity_check.in_unscheduled_maintenance?
157
+ # ack = entity_check.current_maintenance(:unscheduled => true)
158
+ # FIXME details from current?
159
+ msg = "Changing ACK for #{check} on #{entity_name} (#{ackid})"
160
+ else
161
+ msg = "ACKing #{check} on #{entity_name} (#{ackid})"
162
+ end
157
163
  action = Proc.new {
158
- entity_check.create_acknowledgement('summary' => (comment || ''),
159
- 'acknowledgement_id' => ackid, 'duration' => duration)
164
+ Flapjack::Data::Event.create_acknowledgement(
165
+ entity_name, check,
166
+ :summary => (comment || ''),
167
+ :acknowledgement_id => ackid,
168
+ :duration => duration,
169
+ :redis => @redis
170
+ )
160
171
  }
161
172
  end
162
173
 
163
174
  when command =~ /^help$/
164
- msg = "commands: \n"
165
- msg += " ACKID <id> <comment> [duration: <time spec>] \n"
166
- msg += " find entities matching /pattern/ \n"
167
- msg += " test notifications for <entity>[:<check>] \n"
168
- msg += " identify \n"
169
- msg += " help \n"
175
+ msg = "commands: \n" +
176
+ " ACKID <id> <comment> [duration: <time spec>] \n" +
177
+ " find entities matching /pattern/ \n" +
178
+ " test notifications for <entity>[:<check>] \n" +
179
+ " tell me about <entity>[:<check>]" +
180
+ " identify \n" +
181
+ " help \n"
170
182
 
171
183
  when command =~ /^identify$/
172
- t = Process.times
173
- fqdn = `/bin/hostname -f`.chomp
174
- pid = Process.pid
175
- instance_id = "#{@fqdn}:#{@pid}"
176
- boot_time = Time.at(@redis.hget("executive_instance:#{instance_id}", 'boot_time').to_i)
177
- msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n"
178
- msg += "Boot time: #{boot_time}\n"
179
- msg += "User CPU Time: #{t.utime}\n"
180
- msg += "System CPU Time: #{t.stime}\n"
181
- msg += `uname -a`.chomp + "\n"
182
-
183
- when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(:(.+))?$/i
184
+ t = Process.times
185
+ fqdn = `/bin/hostname -f`.chomp
186
+ pid = Process.pid
187
+ msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n" +
188
+ "Boot time: #{@boot_time}\n" +
189
+ "User CPU Time: #{t.utime}\n" +
190
+ "System CPU Time: #{t.stime}\n" +
191
+ `uname -a`.chomp + "\n"
192
+
193
+ when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(?::(.+))?$/i
194
+ entity_name = $1
195
+ check_name = $2 || 'test'
196
+
197
+ if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
198
+ msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
199
+
200
+ summary = "Testing notifications to all contacts interested in entity: #{entity_name}, check: #{check_name}"
201
+ Flapjack::Data::Event.test_notifications(entity_name, check_name, :summary => summary, :redis => @redis)
202
+ else
203
+ msg = "yeah, no I can't see #{entity_name} in my systems"
204
+ end
205
+
206
+ when command =~ /^tell me about\s+([a-z0-9\-\.]+)(?::(.+))?$+/
184
207
  entity_name = $1
185
- check_name = $3 ? $3 : 'test'
208
+ check_name = $2
186
209
 
187
- msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
210
+ if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
211
+ check_str = check_name.nil? ? '' : ", check: #{check_name}"
212
+ msg = "so you'd like details on entity: #{entity_name}#{check_str} hmm? ... OK!\n"
188
213
 
189
- entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
190
- if entity
191
- summary = "Testing notifications to all contacts interested in entity: #{entity.name}, check: #{check_name}"
214
+ current_time = Time.now
192
215
 
193
- entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check_name, :redis => @redis)
194
- puts entity_check.inspect
195
- entity_check.test_notifications('summary' => summary)
216
+ get_details = proc {|entity_check|
217
+ sched = entity_check.current_maintenance(:scheduled => true)
218
+ unsched = entity_check.current_maintenance(:unscheduled => true)
196
219
 
220
+ if (sched || unsched) && check_name.nil?
221
+ check = entity_check.check
222
+ msg += "---\n#{entity_name}:#{check}\n"
223
+ end
224
+
225
+ unless sched.nil?
226
+ start = Time.at(sched[:start_time])
227
+ finish = Time.at(sched[:start_time] + sched[:duration])
228
+ remain = time_period_in_words( (finish - current_time).ceil )
229
+ # TODO a simpler time format?
230
+ msg += "Currently in scheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
231
+ end
232
+
233
+ unless unsched.nil?
234
+ start = Time.at(unsched[:start_time])
235
+ finish = Time.at(unsched[:start_time] + unsched[:duration])
236
+ remain = time_period_in_words( (finish - current_time).ceil )
237
+ # TODO a simpler time format?
238
+ msg += "Currently in unscheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
239
+ end
240
+ }
241
+
242
+ check_names = check_name.nil? ? entity.check_list.sort : [check_name]
243
+
244
+ if check_names.empty?
245
+ msg += "I couldn't find any checks for entity: #{entity_name}"
246
+ else
247
+ check_names.each do |check|
248
+ entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check, :redis => @redis)
249
+ next if entity_check.nil?
250
+ get_details.call(entity_check)
251
+ end
252
+ end
197
253
  else
198
- msg = "yeah, no i can't see #{entity_name} in my systems"
254
+ msg = "hmmm, I can't see #{entity_name} in my systems"
199
255
  end
200
256
 
201
257
  when command =~ /^(find )?entities matching\s+\/(.*)\/.*$/i
@@ -177,12 +177,16 @@ module Flapjack
177
177
  end
178
178
 
179
179
  pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
180
- @logger.info "#{entity_check.entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
180
+ entity_name = entity_check.entity_name
181
+ @logger.info "#{entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
181
182
  who_text = ""
182
183
  if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
183
184
  who_text = " by #{pg_acknowledged_by['name']}"
184
185
  end
185
- entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
186
+ Flapjack::Data::Event.create_acknowledgement(
187
+ entity_name, check,
188
+ :summary => "Acknowledged on PagerDuty" + who_text,
189
+ :redis => @redis)
186
190
  end
187
191
 
188
192
  end
@@ -53,7 +53,6 @@ module Flapjack
53
53
  use Flapjack::CommonLogger, access_logger
54
54
  end
55
55
 
56
-
57
56
  end
58
57
  end
59
58
 
@@ -191,6 +190,9 @@ module Flapjack
191
190
 
192
191
  @contacts = entity_check.contacts
193
192
 
193
+ @state_changes = entity_check.historical_states(nil, Time.now.to_i,
194
+ :order => 'desc', :limit => 20)
195
+
194
196
  haml :check
195
197
  end
196
198
 
@@ -203,11 +205,14 @@ module Flapjack
203
205
  dur = ChronicDuration.parse(params[:duration] || '')
204
206
  @duration = (dur.nil? || (dur <= 0)) ? (4 * 60 * 60) : dur
205
207
 
206
- entity_check = get_entity_check(@entity, @check)
207
- return 404 if entity_check.nil?
208
+ return 404 if get_entity_check(@entity, @check).nil?
208
209
 
209
- ack = entity_check.create_acknowledgement('summary' => (@summary || ''),
210
- 'acknowledgement_id' => @acknowledgement_id, 'duration' => @duration)
210
+ ack = Flapjack::Data::Event.create_acknowledgement(
211
+ @entity, @check,
212
+ :summary => (@summary || ''),
213
+ :acknowledgement_id => @acknowledgement_id,
214
+ :duration => @duration,
215
+ :redis => redis)
211
216
 
212
217
  redirect back
213
218
  end
@@ -321,9 +326,9 @@ module Flapjack
321
326
  entity_check = Flapjack::Data::EntityCheck.for_entity(entity,
322
327
  check, :redis => redis)
323
328
  latest_notif =
324
- {:problem => entity_check.last_problem_notification,
325
- :recovery => entity_check.last_recovery_notification,
326
- :acknowledgement => entity_check.last_acknowledgement_notification
329
+ {:problem => entity_check.last_notification_for_state(:problem)[:timestamp],
330
+ :recovery => entity_check.last_notification_for_state(:recovery)[:timestamp],
331
+ :acknowledgement => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]
327
332
  }.max_by {|n| n[1] || 0}
328
333
  [(entity_check.state || '-'),
329
334
  (entity_check.last_change || '-'),