flapjack 0.7.14 → 0.7.15

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49)
  1. data/CHANGELOG.md +10 -0
  2. data/etc/flapjack_config.yaml.example +1 -0
  3. data/features/events.feature +5 -0
  4. data/features/notification_rules.feature +1 -1
  5. data/features/steps/events_steps.rb +28 -13
  6. data/features/steps/notifications_steps.rb +1 -1
  7. data/lib/flapjack/coordinator.rb +3 -1
  8. data/lib/flapjack/data/contact.rb +8 -6
  9. data/lib/flapjack/data/entity_check.rb +78 -113
  10. data/lib/flapjack/data/event.rb +54 -65
  11. data/lib/flapjack/data/notification.rb +5 -1
  12. data/lib/flapjack/executive.rb +42 -38
  13. data/lib/flapjack/filters/acknowledgement.rb +5 -5
  14. data/lib/flapjack/filters/base.rb +2 -2
  15. data/lib/flapjack/filters/delays.rb +11 -11
  16. data/lib/flapjack/filters/detect_mass_client_failures.rb +8 -8
  17. data/lib/flapjack/filters/ok.rb +6 -6
  18. data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
  19. data/lib/flapjack/filters/unscheduled_maintenance.rb +3 -2
  20. data/lib/flapjack/gateways/api.rb +374 -277
  21. data/lib/flapjack/gateways/api/entity_check_presenter.rb +52 -21
  22. data/lib/flapjack/gateways/api/entity_presenter.rb +14 -9
  23. data/lib/flapjack/gateways/email.rb +7 -0
  24. data/lib/flapjack/gateways/email/alert.html.haml +13 -1
  25. data/lib/flapjack/gateways/email/alert.text.erb +5 -4
  26. data/lib/flapjack/gateways/jabber.rb +90 -34
  27. data/lib/flapjack/gateways/pagerduty.rb +6 -2
  28. data/lib/flapjack/gateways/web.rb +13 -8
  29. data/lib/flapjack/gateways/web/views/check.haml +70 -45
  30. data/lib/flapjack/gateways/web/views/checks.haml +1 -1
  31. data/lib/flapjack/gateways/web/views/entity.haml +1 -1
  32. data/lib/flapjack/patches.rb +9 -2
  33. data/lib/flapjack/pikelet.rb +14 -10
  34. data/lib/flapjack/utility.rb +10 -4
  35. data/lib/flapjack/version.rb +1 -1
  36. data/spec/lib/flapjack/coordinator_spec.rb +19 -5
  37. data/spec/lib/flapjack/data/entity_check_spec.rb +3 -30
  38. data/spec/lib/flapjack/data/event_spec.rb +96 -1
  39. data/spec/lib/flapjack/executive_spec.rb +5 -11
  40. data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +22 -3
  41. data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +30 -15
  42. data/spec/lib/flapjack/gateways/api_spec.rb +552 -186
  43. data/spec/lib/flapjack/gateways/email_spec.rb +2 -0
  44. data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -4
  45. data/spec/lib/flapjack/gateways/pagerduty_spec.rb +3 -2
  46. data/spec/lib/flapjack/gateways/web_spec.rb +17 -12
  47. data/spec/lib/flapjack/pikelet_spec.rb +5 -2
  48. metadata +4 -5
  49. data/config.ru +0 -11
@@ -18,6 +18,19 @@ module Flapjack
18
18
  @entity_check = entity_check
19
19
  end
20
20
 
21
+ def status
22
+ {'name' => @entity_check.check,
23
+ 'state' => @entity_check.state,
24
+ 'summary' => @entity_check.summary,
25
+ 'details' => @entity_check.details,
26
+ 'in_unscheduled_maintenance' => @entity_check.in_unscheduled_maintenance?,
27
+ 'in_scheduled_maintenance' => @entity_check.in_scheduled_maintenance?,
28
+ 'last_update' => @entity_check.last_update,
29
+ 'last_problem_notification' => entity_check.last_notification_for_state(:problem)[:timestamp],
30
+ 'last_recovery_notification' => entity_check.last_notification_for_state(:recovery)[:timestamp],
31
+ 'last_acknowledgement_notification' => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]}
32
+ end
33
+
21
34
  def outages(start_time, end_time, options = {})
22
35
  # hist_states is an array of hashes, with [state, timestamp, summary] keys
23
36
  hist_states = @entity_check.historical_states(start_time, end_time)
@@ -26,32 +39,50 @@ module Flapjack
26
39
  initial = @entity_check.historical_state_before(hist_states.first[:timestamp])
27
40
  hist_states.unshift(initial) if initial
28
41
 
42
+ # TODO the following works, but isn't the neatest
29
43
  num_states = hist_states.size
30
44
 
31
- hist_states.each_with_index do |obj, index|
32
- ts = obj.delete(:timestamp)
33
- if index == (num_states - 1)
34
- # last (even if the only one)
35
- obj[:start_time] = start_time ? [ts, start_time].max : ts
36
- obj[:end_time] = end_time
37
- elsif (index == 0)
38
- # initial
39
- obj[:start_time] = start_time ? [ts, start_time].max : ts
40
- obj[:end_time] = hist_states[index + 1][:timestamp]
41
- else
42
- # except for first and last
43
- obj[:start_time] = ts
44
- obj[:end_time] = hist_states[index + 1][:timestamp]
45
+ index = 0
46
+ result = []
47
+ obj = nil
48
+
49
+ while index < num_states do
50
+ last_obj = obj
51
+ obj = hist_states[index]
52
+ index += 1
53
+
54
+ if last_obj && (last_obj[:state] == obj[:state])
55
+ # TODO maybe build up arrays of these instead, and leave calling
56
+ # classes to join them together if needed?
57
+ result.last[:summary] << " / #{obj[:summary]}"
58
+ result.last[:details] << " / #{obj[:details]}"
59
+ next
45
60
  end
46
- obj[:duration] = obj[:end_time] ? (obj[:end_time] - obj[:start_time]) : nil
47
- end
48
61
 
49
- # p hist_states
62
+ next if obj[:state] == 'ok'
63
+
64
+ ts = obj[:timestamp]
65
+
66
+ obj_st = (last_obj || !start_time) ? ts : [ts, start_time].max
67
+
68
+ next_ts_obj = hist_states[index..-1].detect {|hs| hs[:state] != obj[:state] }
69
+ obj_et = next_ts_obj ? next_ts_obj[:timestamp] : end_time
70
+
71
+ obj_dur = obj_et ? obj_et - obj_st : nil
72
+
73
+ result << {:state => obj[:state],
74
+ :start_time => obj_st,
75
+ :end_time => obj_et,
76
+ :duration => obj_dur,
77
+ :summary => obj[:summary] || '',
78
+ :details => obj[:details] || ''
79
+ }
80
+ end
50
81
 
51
- hist_states.reject {|obj| obj[:state] == 'ok'}
82
+ result
52
83
  end
53
84
 
54
- def unscheduled_maintenance(start_time, end_time)
85
+ def unscheduled_maintenances(start_time, end_time)
55
86
  # unsched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
56
87
  unsched_maintenance = @entity_check.maintenances(start_time, end_time,
57
88
  :scheduled => false)
@@ -66,7 +97,7 @@ module Flapjack
66
97
  start_in_unsched + unsched_maintenance
67
98
  end
68
99
 
69
- def scheduled_maintenance(start_time, end_time)
100
+ def scheduled_maintenances(start_time, end_time)
70
101
  # sched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
71
102
  sched_maintenance = @entity_check.maintenances(start_time, end_time,
72
103
  :scheduled => true)
@@ -87,7 +118,7 @@ module Flapjack
87
118
  #
88
119
  # TODO test performance with larger data sets
89
120
  def downtime(start_time, end_time)
90
- sched_maintenances = scheduled_maintenance(start_time, end_time)
121
+ sched_maintenances = scheduled_maintenances(start_time, end_time)
91
122
 
92
123
  outs = outages(start_time, end_time)
93
124
 
@@ -22,29 +22,34 @@ module Flapjack
22
22
  @redis = options[:redis]
23
23
  end
24
24
 
25
+ def status
26
+ checks.collect {|c| {:entity => @entity, :check => c,
27
+ :status => check_presenter(c).status } }
28
+ end
29
+
25
30
  def outages(start_time, end_time)
26
31
  checks.collect {|c|
27
- {:check => c, :outages => check_presenter(c).outages(start_time, end_time)}
32
+ {:entity => @entity, :check => c, :outages => check_presenter(c).outages(start_time, end_time)}
28
33
  }
29
34
  end
30
35
 
31
- def unscheduled_maintenance(start_time, end_time)
36
+ def unscheduled_maintenances(start_time, end_time)
32
37
  checks.collect {|c|
33
- {:check => c, :unscheduled_maintenance =>
34
- check_presenter(c).unscheduled_maintenance(start_time, end_time)}
38
+ {:entity => @entity, :check => c, :unscheduled_maintenances =>
39
+ check_presenter(c).unscheduled_maintenances(start_time, end_time)}
35
40
  }
36
41
  end
37
42
 
38
- def scheduled_maintenance(start_time, end_time)
43
+ def scheduled_maintenances(start_time, end_time)
39
44
  checks.collect {|c|
40
- {:check => c, :scheduled_maintenance =>
41
- check_presenter(c).scheduled_maintenance(start_time, end_time)}
45
+ {:entity => @entity, :check => c, :scheduled_maintenances =>
46
+ check_presenter(c).scheduled_maintenances(start_time, end_time)}
42
47
  }
43
48
  end
44
49
 
45
50
  def downtime(start_time, end_time)
46
51
  checks.collect {|c|
47
- {:check => c, :downtime =>
52
+ {:entity => @entity, :check => c, :downtime =>
48
53
  check_presenter(c).downtime(start_time, end_time)}
49
54
  }
50
55
  end
@@ -52,7 +57,7 @@ module Flapjack
52
57
  private
53
58
 
54
59
  def checks
55
- @check_list ||= @entity.check_list
60
+ @check_list ||= @entity.check_list.sort
56
61
  end
57
62
 
58
63
  def check_presenter(check)
@@ -8,6 +8,8 @@ require 'socket'
8
8
  require 'em-synchrony'
9
9
  require 'em/protocols/smtpclient'
10
10
 
11
+ require 'flapjack/utility'
12
+
11
13
  require 'flapjack/data/entity_check'
12
14
 
13
15
  module Flapjack
@@ -17,6 +19,8 @@ module Flapjack
17
19
 
18
20
  class << self
19
21
 
22
+ include Flapjack::Utility
23
+
20
24
  def start
21
25
  @logger.info("starting")
22
26
  @logger.debug("new email gateway pikelet with the following options: #{@config.inspect}")
@@ -32,8 +36,11 @@ module Flapjack
32
36
  @contact_last_name = notification['contact_last_name']
33
37
  @state = notification['state']
34
38
  @summary = notification['summary']
39
+ @last_state = notification['last_state']
40
+ @last_summary = notification['last_summary']
35
41
  @details = notification['details']
36
42
  @time = notification['time']
43
+ @relative = relative_time_ago(Time.at(@time))
37
44
  @entity_name, @check = notification['event_id'].split(':', 2)
38
45
 
39
46
  entity_check = Flapjack::Data::EntityCheck.for_event_id(notification['event_id'],
@@ -41,7 +41,19 @@
41
41
  %tr
42
42
  %td
43
43
  %strong Time
44
- %td= Time.at(@time.to_i).to_s
44
+ %td
45
+ = Time.at(@time.to_i).to_s
46
+ ( #{@relative} ago)
47
+
48
+ - if @last_state
49
+ %tr
50
+ %td Previous state
51
+ %td= @last_state
52
+
53
+ - if @last_summary
54
+ %tr
55
+ %td Previous summary
56
+ %td= @last_summary
45
57
 
46
58
  %p Cheers,
47
59
  %p Flapjack
@@ -2,11 +2,12 @@ Hi <%= @contact_first_name %>,
2
2
 
3
3
  Monitoring has detected the following:
4
4
 
5
- Entity: <%= @entity_name %>
6
- Check: <%= @check %>
7
- State: <%= @state %>
5
+ Entity: <%= @entity_name %>
6
+ Check: <%= @check %>
7
+ State: <%= @state %>
8
8
  Summary: <%= @summary %>
9
- Time: <%= Time.at(@time.to_i).to_s %>
9
+ Time: <%= Time.at(@time.to_i).to_s %> (<%= @relative %> ago)
10
+ <%= @last_state ? "\nPrevious state: #{@last_state}" : '' %><%= @last_summary ? "\nPrevious summary: #{@last_summary}" : '' %>
10
11
 
11
12
  Cheers,
12
13
  Flapjack
@@ -36,6 +36,8 @@ module Flapjack
36
36
  def initialize(opts = {})
37
37
  @config = opts[:config]
38
38
  @redis_config = opts[:redis_config]
39
+ @boot_time = opts[:boot_time]
40
+
39
41
  @redis = Flapjack::RedisPool.new(:config => @redis_config, :size => 2) # first will block
40
42
 
41
43
  @logger = opts[:logger]
@@ -146,56 +148,110 @@ module Flapjack
146
148
  error = "unknown entity" if entity_check.nil?
147
149
  end
148
150
 
149
- if entity_check && entity_check.in_unscheduled_maintenance?
150
- error = "#{event_id} is already acknowledged"
151
- end
152
-
153
151
  if error
154
152
  msg = "ERROR - couldn't ACK #{ackid} - #{error}"
155
153
  else
156
- msg = "ACKing #{entity_check.check} on #{entity_check.entity_name} (#{ackid})"
154
+ entity_name, check = event_id.split(':', 2)
155
+
156
+ if entity_check.in_unscheduled_maintenance?
157
+ # ack = entity_check.current_maintenance(:unscheduled => true)
158
+ # FIXME details from current?
159
+ msg = "Changing ACK for #{check} on #{entity_name} (#{ackid})"
160
+ else
161
+ msg = "ACKing #{check} on #{entity_name} (#{ackid})"
162
+ end
157
163
  action = Proc.new {
158
- entity_check.create_acknowledgement('summary' => (comment || ''),
159
- 'acknowledgement_id' => ackid, 'duration' => duration)
164
+ Flapjack::Data::Event.create_acknowledgement(
165
+ entity_name, check,
166
+ :summary => (comment || ''),
167
+ :acknowledgement_id => ackid,
168
+ :duration => duration,
169
+ :redis => @redis
170
+ )
160
171
  }
161
172
  end
162
173
 
163
174
  when command =~ /^help$/
164
- msg = "commands: \n"
165
- msg += " ACKID <id> <comment> [duration: <time spec>] \n"
166
- msg += " find entities matching /pattern/ \n"
167
- msg += " test notifications for <entity>[:<check>] \n"
168
- msg += " identify \n"
169
- msg += " help \n"
175
+ msg = "commands: \n" +
176
+ " ACKID <id> <comment> [duration: <time spec>] \n" +
177
+ " find entities matching /pattern/ \n" +
178
+ " test notifications for <entity>[:<check>] \n" +
179
+ " tell me about <entity>[:<check>]" +
180
+ " identify \n" +
181
+ " help \n"
170
182
 
171
183
  when command =~ /^identify$/
172
- t = Process.times
173
- fqdn = `/bin/hostname -f`.chomp
174
- pid = Process.pid
175
- instance_id = "#{@fqdn}:#{@pid}"
176
- boot_time = Time.at(@redis.hget("executive_instance:#{instance_id}", 'boot_time').to_i)
177
- msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n"
178
- msg += "Boot time: #{boot_time}\n"
179
- msg += "User CPU Time: #{t.utime}\n"
180
- msg += "System CPU Time: #{t.stime}\n"
181
- msg += `uname -a`.chomp + "\n"
182
-
183
- when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(:(.+))?$/i
184
+ t = Process.times
185
+ fqdn = `/bin/hostname -f`.chomp
186
+ pid = Process.pid
187
+ msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n" +
188
+ "Boot time: #{@boot_time}\n" +
189
+ "User CPU Time: #{t.utime}\n" +
190
+ "System CPU Time: #{t.stime}\n" +
191
+ `uname -a`.chomp + "\n"
192
+
193
+ when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(?::(.+))?$/i
194
+ entity_name = $1
195
+ check_name = $2 || 'test'
196
+
197
+ if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
198
+ msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
199
+
200
+ summary = "Testing notifications to all contacts interested in entity: #{entity_name}, check: #{check_name}"
201
+ Flapjack::Data::Event.test_notifications(entity_name, check_name, :summary => summary, :redis => @redis)
202
+ else
203
+ msg = "yeah, no I can't see #{entity_name} in my systems"
204
+ end
205
+
206
+ when command =~ /^tell me about\s+([a-z0-9\-\.]+)(?::(.+))?$+/
184
207
  entity_name = $1
185
- check_name = $3 ? $3 : 'test'
208
+ check_name = $2
186
209
 
187
- msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
210
+ if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
211
+ check_str = check_name.nil? ? '' : ", check: #{check_name}"
212
+ msg = "so you'd like details on entity: #{entity_name}#{check_str} hmm? ... OK!\n"
188
213
 
189
- entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
190
- if entity
191
- summary = "Testing notifications to all contacts interested in entity: #{entity.name}, check: #{check_name}"
214
+ current_time = Time.now
192
215
 
193
- entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check_name, :redis => @redis)
194
- puts entity_check.inspect
195
- entity_check.test_notifications('summary' => summary)
216
+ get_details = proc {|entity_check|
217
+ sched = entity_check.current_maintenance(:scheduled => true)
218
+ unsched = entity_check.current_maintenance(:unscheduled => true)
196
219
 
220
+ if (sched || unsched) && check_name.nil?
221
+ check = entity_check.check
222
+ msg += "---\n#{entity_name}:#{check}\n"
223
+ end
224
+
225
+ unless sched.nil?
226
+ start = Time.at(sched[:start_time])
227
+ finish = Time.at(sched[:start_time] + sched[:duration])
228
+ remain = time_period_in_words( (finish - current_time).ceil )
229
+ # TODO a simpler time format?
230
+ msg += "Currently in scheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
231
+ end
232
+
233
+ unless unsched.nil?
234
+ start = Time.at(unsched[:start_time])
235
+ finish = Time.at(unsched[:start_time] + unsched[:duration])
236
+ remain = time_period_in_words( (finish - current_time).ceil )
237
+ # TODO a simpler time format?
238
+ msg += "Currently in unscheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
239
+ end
240
+ }
241
+
242
+ check_names = check_name.nil? ? entity.check_list.sort : [check_name]
243
+
244
+ if check_names.empty?
245
+ msg += "I couldn't find any checks for entity: #{entity_name}"
246
+ else
247
+ check_names.each do |check|
248
+ entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check, :redis => @redis)
249
+ next if entity_check.nil?
250
+ get_details.call(entity_check)
251
+ end
252
+ end
197
253
  else
198
- msg = "yeah, no i can't see #{entity_name} in my systems"
254
+ msg = "hmmm, I can't see #{entity_name} in my systems"
199
255
  end
200
256
 
201
257
  when command =~ /^(find )?entities matching\s+\/(.*)\/.*$/i
@@ -177,12 +177,16 @@ module Flapjack
177
177
  end
178
178
 
179
179
  pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
180
- @logger.info "#{entity_check.entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
180
+ entity_name = entity_check.entity_name
181
+ @logger.info "#{entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
181
182
  who_text = ""
182
183
  if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
183
184
  who_text = " by #{pg_acknowledged_by['name']}"
184
185
  end
185
- entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
186
+ Flapjack::Data::Event.create_acknowledgement(
187
+ entity_name, check,
188
+ :summary => "Acknowledged on PagerDuty" + who_text,
189
+ :redis => @redis)
186
190
  end
187
191
 
188
192
  end
@@ -53,7 +53,6 @@ module Flapjack
53
53
  use Flapjack::CommonLogger, access_logger
54
54
  end
55
55
 
56
-
57
56
  end
58
57
  end
59
58
 
@@ -191,6 +190,9 @@ module Flapjack
191
190
 
192
191
  @contacts = entity_check.contacts
193
192
 
193
+ @state_changes = entity_check.historical_states(nil, Time.now.to_i,
194
+ :order => 'desc', :limit => 20)
195
+
194
196
  haml :check
195
197
  end
196
198
 
@@ -203,11 +205,14 @@ module Flapjack
203
205
  dur = ChronicDuration.parse(params[:duration] || '')
204
206
  @duration = (dur.nil? || (dur <= 0)) ? (4 * 60 * 60) : dur
205
207
 
206
- entity_check = get_entity_check(@entity, @check)
207
- return 404 if entity_check.nil?
208
+ return 404 if get_entity_check(@entity, @check).nil?
208
209
 
209
- ack = entity_check.create_acknowledgement('summary' => (@summary || ''),
210
- 'acknowledgement_id' => @acknowledgement_id, 'duration' => @duration)
210
+ ack = Flapjack::Data::Event.create_acknowledgement(
211
+ @entity, @check,
212
+ :summary => (@summary || ''),
213
+ :acknowledgement_id => @acknowledgement_id,
214
+ :duration => @duration,
215
+ :redis => redis)
211
216
 
212
217
  redirect back
213
218
  end
@@ -321,9 +326,9 @@ module Flapjack
321
326
  entity_check = Flapjack::Data::EntityCheck.for_entity(entity,
322
327
  check, :redis => redis)
323
328
  latest_notif =
324
- {:problem => entity_check.last_problem_notification,
325
- :recovery => entity_check.last_recovery_notification,
326
- :acknowledgement => entity_check.last_acknowledgement_notification
329
+ {:problem => entity_check.last_notification_for_state(:problem)[:timestamp],
330
+ :recovery => entity_check.last_notification_for_state(:recovery)[:timestamp],
331
+ :acknowledgement => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]
327
332
  }.max_by {|n| n[1] || 0}
328
333
  [(entity_check.state || '-'),
329
334
  (entity_check.last_change || '-'),