flapjack 0.7.14 → 0.7.15
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +10 -0
- data/etc/flapjack_config.yaml.example +1 -0
- data/features/events.feature +5 -0
- data/features/notification_rules.feature +1 -1
- data/features/steps/events_steps.rb +28 -13
- data/features/steps/notifications_steps.rb +1 -1
- data/lib/flapjack/coordinator.rb +3 -1
- data/lib/flapjack/data/contact.rb +8 -6
- data/lib/flapjack/data/entity_check.rb +78 -113
- data/lib/flapjack/data/event.rb +54 -65
- data/lib/flapjack/data/notification.rb +5 -1
- data/lib/flapjack/executive.rb +42 -38
- data/lib/flapjack/filters/acknowledgement.rb +5 -5
- data/lib/flapjack/filters/base.rb +2 -2
- data/lib/flapjack/filters/delays.rb +11 -11
- data/lib/flapjack/filters/detect_mass_client_failures.rb +8 -8
- data/lib/flapjack/filters/ok.rb +6 -6
- data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
- data/lib/flapjack/filters/unscheduled_maintenance.rb +3 -2
- data/lib/flapjack/gateways/api.rb +374 -277
- data/lib/flapjack/gateways/api/entity_check_presenter.rb +52 -21
- data/lib/flapjack/gateways/api/entity_presenter.rb +14 -9
- data/lib/flapjack/gateways/email.rb +7 -0
- data/lib/flapjack/gateways/email/alert.html.haml +13 -1
- data/lib/flapjack/gateways/email/alert.text.erb +5 -4
- data/lib/flapjack/gateways/jabber.rb +90 -34
- data/lib/flapjack/gateways/pagerduty.rb +6 -2
- data/lib/flapjack/gateways/web.rb +13 -8
- data/lib/flapjack/gateways/web/views/check.haml +70 -45
- data/lib/flapjack/gateways/web/views/checks.haml +1 -1
- data/lib/flapjack/gateways/web/views/entity.haml +1 -1
- data/lib/flapjack/patches.rb +9 -2
- data/lib/flapjack/pikelet.rb +14 -10
- data/lib/flapjack/utility.rb +10 -4
- data/lib/flapjack/version.rb +1 -1
- data/spec/lib/flapjack/coordinator_spec.rb +19 -5
- data/spec/lib/flapjack/data/entity_check_spec.rb +3 -30
- data/spec/lib/flapjack/data/event_spec.rb +96 -1
- data/spec/lib/flapjack/executive_spec.rb +5 -11
- data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +22 -3
- data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +30 -15
- data/spec/lib/flapjack/gateways/api_spec.rb +552 -186
- data/spec/lib/flapjack/gateways/email_spec.rb +2 -0
- data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -4
- data/spec/lib/flapjack/gateways/pagerduty_spec.rb +3 -2
- data/spec/lib/flapjack/gateways/web_spec.rb +17 -12
- data/spec/lib/flapjack/pikelet_spec.rb +5 -2
- metadata +4 -5
- data/config.ru +0 -11
@@ -18,6 +18,19 @@ module Flapjack
|
|
18
18
|
@entity_check = entity_check
|
19
19
|
end
|
20
20
|
|
21
|
+
def status
|
22
|
+
{'name' => @entity_check.check,
|
23
|
+
'state' => @entity_check.state,
|
24
|
+
'summary' => @entity_check.summary,
|
25
|
+
'details' => @entity_check.details,
|
26
|
+
'in_unscheduled_maintenance' => @entity_check.in_unscheduled_maintenance?,
|
27
|
+
'in_scheduled_maintenance' => @entity_check.in_scheduled_maintenance?,
|
28
|
+
'last_update' => @entity_check.last_update,
|
29
|
+
'last_problem_notification' => entity_check.last_notification_for_state(:problem)[:timestamp],
|
30
|
+
'last_recovery_notification' => entity_check.last_notification_for_state(:recovery)[:timestamp],
|
31
|
+
'last_acknowledgement_notification' => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]}
|
32
|
+
end
|
33
|
+
|
21
34
|
def outages(start_time, end_time, options = {})
|
22
35
|
# hist_states is an array of hashes, with [state, timestamp, summary] keys
|
23
36
|
hist_states = @entity_check.historical_states(start_time, end_time)
|
@@ -26,32 +39,50 @@ module Flapjack
|
|
26
39
|
initial = @entity_check.historical_state_before(hist_states.first[:timestamp])
|
27
40
|
hist_states.unshift(initial) if initial
|
28
41
|
|
42
|
+
# TODO the following works, but isn't the neatest
|
29
43
|
num_states = hist_states.size
|
30
44
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
#
|
43
|
-
|
44
|
-
|
45
|
+
index = 0
|
46
|
+
result = []
|
47
|
+
obj = nil
|
48
|
+
|
49
|
+
while index < num_states do
|
50
|
+
last_obj = obj
|
51
|
+
obj = hist_states[index]
|
52
|
+
index += 1
|
53
|
+
|
54
|
+
if last_obj && (last_obj[:state] == obj[:state])
|
55
|
+
# TODO maybe build up arrays of these instead, and leave calling
|
56
|
+
# classes to join them together if needed?
|
57
|
+
result.last[:summary] << " / #{obj[:summary]}"
|
58
|
+
result.last[:details] << " / #{obj[:details]}"
|
59
|
+
next
|
45
60
|
end
|
46
|
-
obj[:duration] = obj[:end_time] ? (obj[:end_time] - obj[:start_time]) : nil
|
47
|
-
end
|
48
61
|
|
49
|
-
|
62
|
+
next if obj[:state] == 'ok'
|
63
|
+
|
64
|
+
ts = obj[:timestamp]
|
65
|
+
|
66
|
+
obj_st = (last_obj || !start_time) ? ts : [ts, start_time].max
|
67
|
+
|
68
|
+
next_ts_obj = hist_states[index..-1].detect {|hs| hs[:state] != obj[:state] }
|
69
|
+
obj_et = next_ts_obj ? next_ts_obj[:timestamp] : end_time
|
70
|
+
|
71
|
+
obj_dur = obj_et ? obj_et - obj_st : nil
|
72
|
+
|
73
|
+
result << {:state => obj[:state],
|
74
|
+
:start_time => obj_st,
|
75
|
+
:end_time => obj_et,
|
76
|
+
:duration => obj_dur,
|
77
|
+
:summary => obj[:summary] || '',
|
78
|
+
:details => obj[:details] || ''
|
79
|
+
}
|
80
|
+
end
|
50
81
|
|
51
|
-
|
82
|
+
result
|
52
83
|
end
|
53
84
|
|
54
|
-
def
|
85
|
+
def unscheduled_maintenances(start_time, end_time)
|
55
86
|
# unsched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
|
56
87
|
unsched_maintenance = @entity_check.maintenances(start_time, end_time,
|
57
88
|
:scheduled => false)
|
@@ -66,7 +97,7 @@ module Flapjack
|
|
66
97
|
start_in_unsched + unsched_maintenance
|
67
98
|
end
|
68
99
|
|
69
|
-
def
|
100
|
+
def scheduled_maintenances(start_time, end_time)
|
70
101
|
# sched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
|
71
102
|
sched_maintenance = @entity_check.maintenances(start_time, end_time,
|
72
103
|
:scheduled => true)
|
@@ -87,7 +118,7 @@ module Flapjack
|
|
87
118
|
#
|
88
119
|
# TODO test performance with larger data sets
|
89
120
|
def downtime(start_time, end_time)
|
90
|
-
sched_maintenances =
|
121
|
+
sched_maintenances = scheduled_maintenances(start_time, end_time)
|
91
122
|
|
92
123
|
outs = outages(start_time, end_time)
|
93
124
|
|
@@ -22,29 +22,34 @@ module Flapjack
|
|
22
22
|
@redis = options[:redis]
|
23
23
|
end
|
24
24
|
|
25
|
+
def status
|
26
|
+
checks.collect {|c| {:entity => @entity, :check => c,
|
27
|
+
:status => check_presenter(c).status } }
|
28
|
+
end
|
29
|
+
|
25
30
|
def outages(start_time, end_time)
|
26
31
|
checks.collect {|c|
|
27
|
-
{:check => c, :outages => check_presenter(c).outages(start_time, end_time)}
|
32
|
+
{:entity => @entity, :check => c, :outages => check_presenter(c).outages(start_time, end_time)}
|
28
33
|
}
|
29
34
|
end
|
30
35
|
|
31
|
-
def
|
36
|
+
def unscheduled_maintenances(start_time, end_time)
|
32
37
|
checks.collect {|c|
|
33
|
-
{:check => c, :
|
34
|
-
check_presenter(c).
|
38
|
+
{:entity => @entity, :check => c, :unscheduled_maintenances =>
|
39
|
+
check_presenter(c).unscheduled_maintenances(start_time, end_time)}
|
35
40
|
}
|
36
41
|
end
|
37
42
|
|
38
|
-
def
|
43
|
+
def scheduled_maintenances(start_time, end_time)
|
39
44
|
checks.collect {|c|
|
40
|
-
{:check => c, :
|
41
|
-
check_presenter(c).
|
45
|
+
{:entity => @entity, :check => c, :scheduled_maintenances =>
|
46
|
+
check_presenter(c).scheduled_maintenances(start_time, end_time)}
|
42
47
|
}
|
43
48
|
end
|
44
49
|
|
45
50
|
def downtime(start_time, end_time)
|
46
51
|
checks.collect {|c|
|
47
|
-
{:check => c, :downtime =>
|
52
|
+
{:entity => @entity, :check => c, :downtime =>
|
48
53
|
check_presenter(c).downtime(start_time, end_time)}
|
49
54
|
}
|
50
55
|
end
|
@@ -52,7 +57,7 @@ module Flapjack
|
|
52
57
|
private
|
53
58
|
|
54
59
|
def checks
|
55
|
-
@check_list ||= @entity.check_list
|
60
|
+
@check_list ||= @entity.check_list.sort
|
56
61
|
end
|
57
62
|
|
58
63
|
def check_presenter(check)
|
@@ -8,6 +8,8 @@ require 'socket'
|
|
8
8
|
require 'em-synchrony'
|
9
9
|
require 'em/protocols/smtpclient'
|
10
10
|
|
11
|
+
require 'flapjack/utility'
|
12
|
+
|
11
13
|
require 'flapjack/data/entity_check'
|
12
14
|
|
13
15
|
module Flapjack
|
@@ -17,6 +19,8 @@ module Flapjack
|
|
17
19
|
|
18
20
|
class << self
|
19
21
|
|
22
|
+
include Flapjack::Utility
|
23
|
+
|
20
24
|
def start
|
21
25
|
@logger.info("starting")
|
22
26
|
@logger.debug("new email gateway pikelet with the following options: #{@config.inspect}")
|
@@ -32,8 +36,11 @@ module Flapjack
|
|
32
36
|
@contact_last_name = notification['contact_last_name']
|
33
37
|
@state = notification['state']
|
34
38
|
@summary = notification['summary']
|
39
|
+
@last_state = notification['last_state']
|
40
|
+
@last_summary = notification['last_summary']
|
35
41
|
@details = notification['details']
|
36
42
|
@time = notification['time']
|
43
|
+
@relative = relative_time_ago(Time.at(@time))
|
37
44
|
@entity_name, @check = notification['event_id'].split(':', 2)
|
38
45
|
|
39
46
|
entity_check = Flapjack::Data::EntityCheck.for_event_id(notification['event_id'],
|
@@ -41,7 +41,19 @@
|
|
41
41
|
%tr
|
42
42
|
%td
|
43
43
|
%strong Time
|
44
|
-
%td
|
44
|
+
%td
|
45
|
+
= Time.at(@time.to_i).to_s
|
46
|
+
( #{@relative} ago)
|
47
|
+
|
48
|
+
- if @last_state
|
49
|
+
%tr
|
50
|
+
%td Previous state
|
51
|
+
%td= @last_state
|
52
|
+
|
53
|
+
- if @last_summary
|
54
|
+
%tr
|
55
|
+
%td Previous summary
|
56
|
+
%td= @last_summary
|
45
57
|
|
46
58
|
%p Cheers,
|
47
59
|
%p Flapjack
|
@@ -2,11 +2,12 @@ Hi <%= @contact_first_name %>,
|
|
2
2
|
|
3
3
|
Monitoring has detected the following:
|
4
4
|
|
5
|
-
Entity:
|
6
|
-
Check:
|
7
|
-
State:
|
5
|
+
Entity: <%= @entity_name %>
|
6
|
+
Check: <%= @check %>
|
7
|
+
State: <%= @state %>
|
8
8
|
Summary: <%= @summary %>
|
9
|
-
Time:
|
9
|
+
Time: <%= Time.at(@time.to_i).to_s %> (<%= @relative %> ago)
|
10
|
+
<%= @last_state ? "\nPrevious state: #{@last_state}" : '' %><%= @last_summary ? "\nPrevious summary: #{@last_summary}" : '' %>
|
10
11
|
|
11
12
|
Cheers,
|
12
13
|
Flapjack
|
@@ -36,6 +36,8 @@ module Flapjack
|
|
36
36
|
def initialize(opts = {})
|
37
37
|
@config = opts[:config]
|
38
38
|
@redis_config = opts[:redis_config]
|
39
|
+
@boot_time = opts[:boot_time]
|
40
|
+
|
39
41
|
@redis = Flapjack::RedisPool.new(:config => @redis_config, :size => 2) # first will block
|
40
42
|
|
41
43
|
@logger = opts[:logger]
|
@@ -146,56 +148,110 @@ module Flapjack
|
|
146
148
|
error = "unknown entity" if entity_check.nil?
|
147
149
|
end
|
148
150
|
|
149
|
-
if entity_check && entity_check.in_unscheduled_maintenance?
|
150
|
-
error = "#{event_id} is already acknowledged"
|
151
|
-
end
|
152
|
-
|
153
151
|
if error
|
154
152
|
msg = "ERROR - couldn't ACK #{ackid} - #{error}"
|
155
153
|
else
|
156
|
-
|
154
|
+
entity_name, check = event_id.split(':', 2)
|
155
|
+
|
156
|
+
if entity_check.in_unscheduled_maintenance?
|
157
|
+
# ack = entity_check.current_maintenance(:unscheduled => true)
|
158
|
+
# FIXME details from current?
|
159
|
+
msg = "Changing ACK for #{check} on #{entity_name} (#{ackid})"
|
160
|
+
else
|
161
|
+
msg = "ACKing #{check} on #{entity_name} (#{ackid})"
|
162
|
+
end
|
157
163
|
action = Proc.new {
|
158
|
-
|
159
|
-
|
164
|
+
Flapjack::Data::Event.create_acknowledgement(
|
165
|
+
entity_name, check,
|
166
|
+
:summary => (comment || ''),
|
167
|
+
:acknowledgement_id => ackid,
|
168
|
+
:duration => duration,
|
169
|
+
:redis => @redis
|
170
|
+
)
|
160
171
|
}
|
161
172
|
end
|
162
173
|
|
163
174
|
when command =~ /^help$/
|
164
|
-
msg
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
175
|
+
msg = "commands: \n" +
|
176
|
+
" ACKID <id> <comment> [duration: <time spec>] \n" +
|
177
|
+
" find entities matching /pattern/ \n" +
|
178
|
+
" test notifications for <entity>[:<check>] \n" +
|
179
|
+
" tell me about <entity>[:<check>]" +
|
180
|
+
" identify \n" +
|
181
|
+
" help \n"
|
170
182
|
|
171
183
|
when command =~ /^identify$/
|
172
|
-
t
|
173
|
-
fqdn
|
174
|
-
pid
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
+
t = Process.times
|
185
|
+
fqdn = `/bin/hostname -f`.chomp
|
186
|
+
pid = Process.pid
|
187
|
+
msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n" +
|
188
|
+
"Boot time: #{@boot_time}\n" +
|
189
|
+
"User CPU Time: #{t.utime}\n" +
|
190
|
+
"System CPU Time: #{t.stime}\n" +
|
191
|
+
`uname -a`.chomp + "\n"
|
192
|
+
|
193
|
+
when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(?::(.+))?$/i
|
194
|
+
entity_name = $1
|
195
|
+
check_name = $2 || 'test'
|
196
|
+
|
197
|
+
if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
|
198
|
+
msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
|
199
|
+
|
200
|
+
summary = "Testing notifications to all contacts interested in entity: #{entity_name}, check: #{check_name}"
|
201
|
+
Flapjack::Data::Event.test_notifications(entity_name, check_name, :summary => summary, :redis => @redis)
|
202
|
+
else
|
203
|
+
msg = "yeah, no I can't see #{entity_name} in my systems"
|
204
|
+
end
|
205
|
+
|
206
|
+
when command =~ /^tell me about\s+([a-z0-9\-\.]+)(?::(.+))?$+/
|
184
207
|
entity_name = $1
|
185
|
-
check_name = $
|
208
|
+
check_name = $2
|
186
209
|
|
187
|
-
|
210
|
+
if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
|
211
|
+
check_str = check_name.nil? ? '' : ", check: #{check_name}"
|
212
|
+
msg = "so you'd like details on entity: #{entity_name}#{check_str} hmm? ... OK!\n"
|
188
213
|
|
189
|
-
|
190
|
-
if entity
|
191
|
-
summary = "Testing notifications to all contacts interested in entity: #{entity.name}, check: #{check_name}"
|
214
|
+
current_time = Time.now
|
192
215
|
|
193
|
-
|
194
|
-
|
195
|
-
|
216
|
+
get_details = proc {|entity_check|
|
217
|
+
sched = entity_check.current_maintenance(:scheduled => true)
|
218
|
+
unsched = entity_check.current_maintenance(:unscheduled => true)
|
196
219
|
|
220
|
+
if (sched || unsched) && check_name.nil?
|
221
|
+
check = entity_check.check
|
222
|
+
msg += "---\n#{entity_name}:#{check}\n"
|
223
|
+
end
|
224
|
+
|
225
|
+
unless sched.nil?
|
226
|
+
start = Time.at(sched[:start_time])
|
227
|
+
finish = Time.at(sched[:start_time] + sched[:duration])
|
228
|
+
remain = time_period_in_words( (finish - current_time).ceil )
|
229
|
+
# TODO a simpler time format?
|
230
|
+
msg += "Currently in scheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
|
231
|
+
end
|
232
|
+
|
233
|
+
unless unsched.nil?
|
234
|
+
start = Time.at(unsched[:start_time])
|
235
|
+
finish = Time.at(unsched[:start_time] + unsched[:duration])
|
236
|
+
remain = time_period_in_words( (finish - current_time).ceil )
|
237
|
+
# TODO a simpler time format?
|
238
|
+
msg += "Currently in unscheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
|
239
|
+
end
|
240
|
+
}
|
241
|
+
|
242
|
+
check_names = check_name.nil? ? entity.check_list.sort : [check_name]
|
243
|
+
|
244
|
+
if check_names.empty?
|
245
|
+
msg += "I couldn't find any checks for entity: #{entity_name}"
|
246
|
+
else
|
247
|
+
check_names.each do |check|
|
248
|
+
entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check, :redis => @redis)
|
249
|
+
next if entity_check.nil?
|
250
|
+
get_details.call(entity_check)
|
251
|
+
end
|
252
|
+
end
|
197
253
|
else
|
198
|
-
msg = "
|
254
|
+
msg = "hmmm, I can't see #{entity_name} in my systems"
|
199
255
|
end
|
200
256
|
|
201
257
|
when command =~ /^(find )?entities matching\s+\/(.*)\/.*$/i
|
@@ -177,12 +177,16 @@ module Flapjack
|
|
177
177
|
end
|
178
178
|
|
179
179
|
pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
|
180
|
-
|
180
|
+
entity_name = entity_check.entity_name
|
181
|
+
@logger.info "#{entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
|
181
182
|
who_text = ""
|
182
183
|
if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
|
183
184
|
who_text = " by #{pg_acknowledged_by['name']}"
|
184
185
|
end
|
185
|
-
|
186
|
+
Flapjack::Data::Event.create_acknowledgement(
|
187
|
+
entity_name, check,
|
188
|
+
:summary => "Acknowledged on PagerDuty" + who_text,
|
189
|
+
:redis => @redis)
|
186
190
|
end
|
187
191
|
|
188
192
|
end
|
@@ -53,7 +53,6 @@ module Flapjack
|
|
53
53
|
use Flapjack::CommonLogger, access_logger
|
54
54
|
end
|
55
55
|
|
56
|
-
|
57
56
|
end
|
58
57
|
end
|
59
58
|
|
@@ -191,6 +190,9 @@ module Flapjack
|
|
191
190
|
|
192
191
|
@contacts = entity_check.contacts
|
193
192
|
|
193
|
+
@state_changes = entity_check.historical_states(nil, Time.now.to_i,
|
194
|
+
:order => 'desc', :limit => 20)
|
195
|
+
|
194
196
|
haml :check
|
195
197
|
end
|
196
198
|
|
@@ -203,11 +205,14 @@ module Flapjack
|
|
203
205
|
dur = ChronicDuration.parse(params[:duration] || '')
|
204
206
|
@duration = (dur.nil? || (dur <= 0)) ? (4 * 60 * 60) : dur
|
205
207
|
|
206
|
-
|
207
|
-
return 404 if entity_check.nil?
|
208
|
+
return 404 if get_entity_check(@entity, @check).nil?
|
208
209
|
|
209
|
-
ack =
|
210
|
-
|
210
|
+
ack = Flapjack::Data::Event.create_acknowledgement(
|
211
|
+
@entity, @check,
|
212
|
+
:summary => (@summary || ''),
|
213
|
+
:acknowledgement_id => @acknowledgement_id,
|
214
|
+
:duration => @duration,
|
215
|
+
:redis => redis)
|
211
216
|
|
212
217
|
redirect back
|
213
218
|
end
|
@@ -321,9 +326,9 @@ module Flapjack
|
|
321
326
|
entity_check = Flapjack::Data::EntityCheck.for_entity(entity,
|
322
327
|
check, :redis => redis)
|
323
328
|
latest_notif =
|
324
|
-
{:problem => entity_check.
|
325
|
-
:recovery => entity_check.
|
326
|
-
:acknowledgement => entity_check.
|
329
|
+
{:problem => entity_check.last_notification_for_state(:problem)[:timestamp],
|
330
|
+
:recovery => entity_check.last_notification_for_state(:recovery)[:timestamp],
|
331
|
+
:acknowledgement => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]
|
327
332
|
}.max_by {|n| n[1] || 0}
|
328
333
|
[(entity_check.state || '-'),
|
329
334
|
(entity_check.last_change || '-'),
|