flapjack 0.7.14 → 0.7.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +10 -0
- data/etc/flapjack_config.yaml.example +1 -0
- data/features/events.feature +5 -0
- data/features/notification_rules.feature +1 -1
- data/features/steps/events_steps.rb +28 -13
- data/features/steps/notifications_steps.rb +1 -1
- data/lib/flapjack/coordinator.rb +3 -1
- data/lib/flapjack/data/contact.rb +8 -6
- data/lib/flapjack/data/entity_check.rb +78 -113
- data/lib/flapjack/data/event.rb +54 -65
- data/lib/flapjack/data/notification.rb +5 -1
- data/lib/flapjack/executive.rb +42 -38
- data/lib/flapjack/filters/acknowledgement.rb +5 -5
- data/lib/flapjack/filters/base.rb +2 -2
- data/lib/flapjack/filters/delays.rb +11 -11
- data/lib/flapjack/filters/detect_mass_client_failures.rb +8 -8
- data/lib/flapjack/filters/ok.rb +6 -6
- data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
- data/lib/flapjack/filters/unscheduled_maintenance.rb +3 -2
- data/lib/flapjack/gateways/api.rb +374 -277
- data/lib/flapjack/gateways/api/entity_check_presenter.rb +52 -21
- data/lib/flapjack/gateways/api/entity_presenter.rb +14 -9
- data/lib/flapjack/gateways/email.rb +7 -0
- data/lib/flapjack/gateways/email/alert.html.haml +13 -1
- data/lib/flapjack/gateways/email/alert.text.erb +5 -4
- data/lib/flapjack/gateways/jabber.rb +90 -34
- data/lib/flapjack/gateways/pagerduty.rb +6 -2
- data/lib/flapjack/gateways/web.rb +13 -8
- data/lib/flapjack/gateways/web/views/check.haml +70 -45
- data/lib/flapjack/gateways/web/views/checks.haml +1 -1
- data/lib/flapjack/gateways/web/views/entity.haml +1 -1
- data/lib/flapjack/patches.rb +9 -2
- data/lib/flapjack/pikelet.rb +14 -10
- data/lib/flapjack/utility.rb +10 -4
- data/lib/flapjack/version.rb +1 -1
- data/spec/lib/flapjack/coordinator_spec.rb +19 -5
- data/spec/lib/flapjack/data/entity_check_spec.rb +3 -30
- data/spec/lib/flapjack/data/event_spec.rb +96 -1
- data/spec/lib/flapjack/executive_spec.rb +5 -11
- data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +22 -3
- data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +30 -15
- data/spec/lib/flapjack/gateways/api_spec.rb +552 -186
- data/spec/lib/flapjack/gateways/email_spec.rb +2 -0
- data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -4
- data/spec/lib/flapjack/gateways/pagerduty_spec.rb +3 -2
- data/spec/lib/flapjack/gateways/web_spec.rb +17 -12
- data/spec/lib/flapjack/pikelet_spec.rb +5 -2
- metadata +4 -5
- data/config.ru +0 -11
@@ -18,6 +18,19 @@ module Flapjack
|
|
18
18
|
@entity_check = entity_check
|
19
19
|
end
|
20
20
|
|
21
|
+
def status
|
22
|
+
{'name' => @entity_check.check,
|
23
|
+
'state' => @entity_check.state,
|
24
|
+
'summary' => @entity_check.summary,
|
25
|
+
'details' => @entity_check.details,
|
26
|
+
'in_unscheduled_maintenance' => @entity_check.in_unscheduled_maintenance?,
|
27
|
+
'in_scheduled_maintenance' => @entity_check.in_scheduled_maintenance?,
|
28
|
+
'last_update' => @entity_check.last_update,
|
29
|
+
'last_problem_notification' => entity_check.last_notification_for_state(:problem)[:timestamp],
|
30
|
+
'last_recovery_notification' => entity_check.last_notification_for_state(:recovery)[:timestamp],
|
31
|
+
'last_acknowledgement_notification' => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]}
|
32
|
+
end
|
33
|
+
|
21
34
|
def outages(start_time, end_time, options = {})
|
22
35
|
# hist_states is an array of hashes, with [state, timestamp, summary] keys
|
23
36
|
hist_states = @entity_check.historical_states(start_time, end_time)
|
@@ -26,32 +39,50 @@ module Flapjack
|
|
26
39
|
initial = @entity_check.historical_state_before(hist_states.first[:timestamp])
|
27
40
|
hist_states.unshift(initial) if initial
|
28
41
|
|
42
|
+
# TODO the following works, but isn't the neatest
|
29
43
|
num_states = hist_states.size
|
30
44
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
#
|
43
|
-
|
44
|
-
|
45
|
+
index = 0
|
46
|
+
result = []
|
47
|
+
obj = nil
|
48
|
+
|
49
|
+
while index < num_states do
|
50
|
+
last_obj = obj
|
51
|
+
obj = hist_states[index]
|
52
|
+
index += 1
|
53
|
+
|
54
|
+
if last_obj && (last_obj[:state] == obj[:state])
|
55
|
+
# TODO maybe build up arrays of these instead, and leave calling
|
56
|
+
# classes to join them together if needed?
|
57
|
+
result.last[:summary] << " / #{obj[:summary]}"
|
58
|
+
result.last[:details] << " / #{obj[:details]}"
|
59
|
+
next
|
45
60
|
end
|
46
|
-
obj[:duration] = obj[:end_time] ? (obj[:end_time] - obj[:start_time]) : nil
|
47
|
-
end
|
48
61
|
|
49
|
-
|
62
|
+
next if obj[:state] == 'ok'
|
63
|
+
|
64
|
+
ts = obj[:timestamp]
|
65
|
+
|
66
|
+
obj_st = (last_obj || !start_time) ? ts : [ts, start_time].max
|
67
|
+
|
68
|
+
next_ts_obj = hist_states[index..-1].detect {|hs| hs[:state] != obj[:state] }
|
69
|
+
obj_et = next_ts_obj ? next_ts_obj[:timestamp] : end_time
|
70
|
+
|
71
|
+
obj_dur = obj_et ? obj_et - obj_st : nil
|
72
|
+
|
73
|
+
result << {:state => obj[:state],
|
74
|
+
:start_time => obj_st,
|
75
|
+
:end_time => obj_et,
|
76
|
+
:duration => obj_dur,
|
77
|
+
:summary => obj[:summary] || '',
|
78
|
+
:details => obj[:details] || ''
|
79
|
+
}
|
80
|
+
end
|
50
81
|
|
51
|
-
|
82
|
+
result
|
52
83
|
end
|
53
84
|
|
54
|
-
def
|
85
|
+
def unscheduled_maintenances(start_time, end_time)
|
55
86
|
# unsched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
|
56
87
|
unsched_maintenance = @entity_check.maintenances(start_time, end_time,
|
57
88
|
:scheduled => false)
|
@@ -66,7 +97,7 @@ module Flapjack
|
|
66
97
|
start_in_unsched + unsched_maintenance
|
67
98
|
end
|
68
99
|
|
69
|
-
def
|
100
|
+
def scheduled_maintenances(start_time, end_time)
|
70
101
|
# sched_maintenance is an array of hashes, with [duration, timestamp, summary] keys
|
71
102
|
sched_maintenance = @entity_check.maintenances(start_time, end_time,
|
72
103
|
:scheduled => true)
|
@@ -87,7 +118,7 @@ module Flapjack
|
|
87
118
|
#
|
88
119
|
# TODO test performance with larger data sets
|
89
120
|
def downtime(start_time, end_time)
|
90
|
-
sched_maintenances =
|
121
|
+
sched_maintenances = scheduled_maintenances(start_time, end_time)
|
91
122
|
|
92
123
|
outs = outages(start_time, end_time)
|
93
124
|
|
@@ -22,29 +22,34 @@ module Flapjack
|
|
22
22
|
@redis = options[:redis]
|
23
23
|
end
|
24
24
|
|
25
|
+
def status
|
26
|
+
checks.collect {|c| {:entity => @entity, :check => c,
|
27
|
+
:status => check_presenter(c).status } }
|
28
|
+
end
|
29
|
+
|
25
30
|
def outages(start_time, end_time)
|
26
31
|
checks.collect {|c|
|
27
|
-
{:check => c, :outages => check_presenter(c).outages(start_time, end_time)}
|
32
|
+
{:entity => @entity, :check => c, :outages => check_presenter(c).outages(start_time, end_time)}
|
28
33
|
}
|
29
34
|
end
|
30
35
|
|
31
|
-
def
|
36
|
+
def unscheduled_maintenances(start_time, end_time)
|
32
37
|
checks.collect {|c|
|
33
|
-
{:check => c, :
|
34
|
-
check_presenter(c).
|
38
|
+
{:entity => @entity, :check => c, :unscheduled_maintenances =>
|
39
|
+
check_presenter(c).unscheduled_maintenances(start_time, end_time)}
|
35
40
|
}
|
36
41
|
end
|
37
42
|
|
38
|
-
def
|
43
|
+
def scheduled_maintenances(start_time, end_time)
|
39
44
|
checks.collect {|c|
|
40
|
-
{:check => c, :
|
41
|
-
check_presenter(c).
|
45
|
+
{:entity => @entity, :check => c, :scheduled_maintenances =>
|
46
|
+
check_presenter(c).scheduled_maintenances(start_time, end_time)}
|
42
47
|
}
|
43
48
|
end
|
44
49
|
|
45
50
|
def downtime(start_time, end_time)
|
46
51
|
checks.collect {|c|
|
47
|
-
{:check => c, :downtime =>
|
52
|
+
{:entity => @entity, :check => c, :downtime =>
|
48
53
|
check_presenter(c).downtime(start_time, end_time)}
|
49
54
|
}
|
50
55
|
end
|
@@ -52,7 +57,7 @@ module Flapjack
|
|
52
57
|
private
|
53
58
|
|
54
59
|
def checks
|
55
|
-
@check_list ||= @entity.check_list
|
60
|
+
@check_list ||= @entity.check_list.sort
|
56
61
|
end
|
57
62
|
|
58
63
|
def check_presenter(check)
|
@@ -8,6 +8,8 @@ require 'socket'
|
|
8
8
|
require 'em-synchrony'
|
9
9
|
require 'em/protocols/smtpclient'
|
10
10
|
|
11
|
+
require 'flapjack/utility'
|
12
|
+
|
11
13
|
require 'flapjack/data/entity_check'
|
12
14
|
|
13
15
|
module Flapjack
|
@@ -17,6 +19,8 @@ module Flapjack
|
|
17
19
|
|
18
20
|
class << self
|
19
21
|
|
22
|
+
include Flapjack::Utility
|
23
|
+
|
20
24
|
def start
|
21
25
|
@logger.info("starting")
|
22
26
|
@logger.debug("new email gateway pikelet with the following options: #{@config.inspect}")
|
@@ -32,8 +36,11 @@ module Flapjack
|
|
32
36
|
@contact_last_name = notification['contact_last_name']
|
33
37
|
@state = notification['state']
|
34
38
|
@summary = notification['summary']
|
39
|
+
@last_state = notification['last_state']
|
40
|
+
@last_summary = notification['last_summary']
|
35
41
|
@details = notification['details']
|
36
42
|
@time = notification['time']
|
43
|
+
@relative = relative_time_ago(Time.at(@time))
|
37
44
|
@entity_name, @check = notification['event_id'].split(':', 2)
|
38
45
|
|
39
46
|
entity_check = Flapjack::Data::EntityCheck.for_event_id(notification['event_id'],
|
@@ -41,7 +41,19 @@
|
|
41
41
|
%tr
|
42
42
|
%td
|
43
43
|
%strong Time
|
44
|
-
%td
|
44
|
+
%td
|
45
|
+
= Time.at(@time.to_i).to_s
|
46
|
+
( #{@relative} ago)
|
47
|
+
|
48
|
+
- if @last_state
|
49
|
+
%tr
|
50
|
+
%td Previous state
|
51
|
+
%td= @last_state
|
52
|
+
|
53
|
+
- if @last_summary
|
54
|
+
%tr
|
55
|
+
%td Previous summary
|
56
|
+
%td= @last_summary
|
45
57
|
|
46
58
|
%p Cheers,
|
47
59
|
%p Flapjack
|
@@ -2,11 +2,12 @@ Hi <%= @contact_first_name %>,
|
|
2
2
|
|
3
3
|
Monitoring has detected the following:
|
4
4
|
|
5
|
-
Entity:
|
6
|
-
Check:
|
7
|
-
State:
|
5
|
+
Entity: <%= @entity_name %>
|
6
|
+
Check: <%= @check %>
|
7
|
+
State: <%= @state %>
|
8
8
|
Summary: <%= @summary %>
|
9
|
-
Time:
|
9
|
+
Time: <%= Time.at(@time.to_i).to_s %> (<%= @relative %> ago)
|
10
|
+
<%= @last_state ? "\nPrevious state: #{@last_state}" : '' %><%= @last_summary ? "\nPrevious summary: #{@last_summary}" : '' %>
|
10
11
|
|
11
12
|
Cheers,
|
12
13
|
Flapjack
|
@@ -36,6 +36,8 @@ module Flapjack
|
|
36
36
|
def initialize(opts = {})
|
37
37
|
@config = opts[:config]
|
38
38
|
@redis_config = opts[:redis_config]
|
39
|
+
@boot_time = opts[:boot_time]
|
40
|
+
|
39
41
|
@redis = Flapjack::RedisPool.new(:config => @redis_config, :size => 2) # first will block
|
40
42
|
|
41
43
|
@logger = opts[:logger]
|
@@ -146,56 +148,110 @@ module Flapjack
|
|
146
148
|
error = "unknown entity" if entity_check.nil?
|
147
149
|
end
|
148
150
|
|
149
|
-
if entity_check && entity_check.in_unscheduled_maintenance?
|
150
|
-
error = "#{event_id} is already acknowledged"
|
151
|
-
end
|
152
|
-
|
153
151
|
if error
|
154
152
|
msg = "ERROR - couldn't ACK #{ackid} - #{error}"
|
155
153
|
else
|
156
|
-
|
154
|
+
entity_name, check = event_id.split(':', 2)
|
155
|
+
|
156
|
+
if entity_check.in_unscheduled_maintenance?
|
157
|
+
# ack = entity_check.current_maintenance(:unscheduled => true)
|
158
|
+
# FIXME details from current?
|
159
|
+
msg = "Changing ACK for #{check} on #{entity_name} (#{ackid})"
|
160
|
+
else
|
161
|
+
msg = "ACKing #{check} on #{entity_name} (#{ackid})"
|
162
|
+
end
|
157
163
|
action = Proc.new {
|
158
|
-
|
159
|
-
|
164
|
+
Flapjack::Data::Event.create_acknowledgement(
|
165
|
+
entity_name, check,
|
166
|
+
:summary => (comment || ''),
|
167
|
+
:acknowledgement_id => ackid,
|
168
|
+
:duration => duration,
|
169
|
+
:redis => @redis
|
170
|
+
)
|
160
171
|
}
|
161
172
|
end
|
162
173
|
|
163
174
|
when command =~ /^help$/
|
164
|
-
msg
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
175
|
+
msg = "commands: \n" +
|
176
|
+
" ACKID <id> <comment> [duration: <time spec>] \n" +
|
177
|
+
" find entities matching /pattern/ \n" +
|
178
|
+
" test notifications for <entity>[:<check>] \n" +
|
179
|
+
" tell me about <entity>[:<check>]" +
|
180
|
+
" identify \n" +
|
181
|
+
" help \n"
|
170
182
|
|
171
183
|
when command =~ /^identify$/
|
172
|
-
t
|
173
|
-
fqdn
|
174
|
-
pid
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
+
t = Process.times
|
185
|
+
fqdn = `/bin/hostname -f`.chomp
|
186
|
+
pid = Process.pid
|
187
|
+
msg = "Flapjack #{Flapjack::VERSION} process #{pid} on #{fqdn} \n" +
|
188
|
+
"Boot time: #{@boot_time}\n" +
|
189
|
+
"User CPU Time: #{t.utime}\n" +
|
190
|
+
"System CPU Time: #{t.stime}\n" +
|
191
|
+
`uname -a`.chomp + "\n"
|
192
|
+
|
193
|
+
when command =~ /^test notifications for\s+([a-z0-9\-\.]+)(?::(.+))?$/i
|
194
|
+
entity_name = $1
|
195
|
+
check_name = $2 || 'test'
|
196
|
+
|
197
|
+
if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
|
198
|
+
msg = "so you want me to test notifications for entity: #{entity_name}, check: #{check_name} eh? ... well OK!"
|
199
|
+
|
200
|
+
summary = "Testing notifications to all contacts interested in entity: #{entity_name}, check: #{check_name}"
|
201
|
+
Flapjack::Data::Event.test_notifications(entity_name, check_name, :summary => summary, :redis => @redis)
|
202
|
+
else
|
203
|
+
msg = "yeah, no I can't see #{entity_name} in my systems"
|
204
|
+
end
|
205
|
+
|
206
|
+
when command =~ /^tell me about\s+([a-z0-9\-\.]+)(?::(.+))?$+/
|
184
207
|
entity_name = $1
|
185
|
-
check_name = $
|
208
|
+
check_name = $2
|
186
209
|
|
187
|
-
|
210
|
+
if entity = Flapjack::Data::Entity.find_by_name(entity_name, :redis => @redis)
|
211
|
+
check_str = check_name.nil? ? '' : ", check: #{check_name}"
|
212
|
+
msg = "so you'd like details on entity: #{entity_name}#{check_str} hmm? ... OK!\n"
|
188
213
|
|
189
|
-
|
190
|
-
if entity
|
191
|
-
summary = "Testing notifications to all contacts interested in entity: #{entity.name}, check: #{check_name}"
|
214
|
+
current_time = Time.now
|
192
215
|
|
193
|
-
|
194
|
-
|
195
|
-
|
216
|
+
get_details = proc {|entity_check|
|
217
|
+
sched = entity_check.current_maintenance(:scheduled => true)
|
218
|
+
unsched = entity_check.current_maintenance(:unscheduled => true)
|
196
219
|
|
220
|
+
if (sched || unsched) && check_name.nil?
|
221
|
+
check = entity_check.check
|
222
|
+
msg += "---\n#{entity_name}:#{check}\n"
|
223
|
+
end
|
224
|
+
|
225
|
+
unless sched.nil?
|
226
|
+
start = Time.at(sched[:start_time])
|
227
|
+
finish = Time.at(sched[:start_time] + sched[:duration])
|
228
|
+
remain = time_period_in_words( (finish - current_time).ceil )
|
229
|
+
# TODO a simpler time format?
|
230
|
+
msg += "Currently in scheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
|
231
|
+
end
|
232
|
+
|
233
|
+
unless unsched.nil?
|
234
|
+
start = Time.at(unsched[:start_time])
|
235
|
+
finish = Time.at(unsched[:start_time] + unsched[:duration])
|
236
|
+
remain = time_period_in_words( (finish - current_time).ceil )
|
237
|
+
# TODO a simpler time format?
|
238
|
+
msg += "Currently in unscheduled maintenance: #{start} -> #{finish} (#{remain} remaining)\n"
|
239
|
+
end
|
240
|
+
}
|
241
|
+
|
242
|
+
check_names = check_name.nil? ? entity.check_list.sort : [check_name]
|
243
|
+
|
244
|
+
if check_names.empty?
|
245
|
+
msg += "I couldn't find any checks for entity: #{entity_name}"
|
246
|
+
else
|
247
|
+
check_names.each do |check|
|
248
|
+
entity_check = Flapjack::Data::EntityCheck.for_entity(entity, check, :redis => @redis)
|
249
|
+
next if entity_check.nil?
|
250
|
+
get_details.call(entity_check)
|
251
|
+
end
|
252
|
+
end
|
197
253
|
else
|
198
|
-
msg = "
|
254
|
+
msg = "hmmm, I can't see #{entity_name} in my systems"
|
199
255
|
end
|
200
256
|
|
201
257
|
when command =~ /^(find )?entities matching\s+\/(.*)\/.*$/i
|
@@ -177,12 +177,16 @@ module Flapjack
|
|
177
177
|
end
|
178
178
|
|
179
179
|
pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
|
180
|
-
|
180
|
+
entity_name = entity_check.entity_name
|
181
|
+
@logger.info "#{entity_name}:#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
|
181
182
|
who_text = ""
|
182
183
|
if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
|
183
184
|
who_text = " by #{pg_acknowledged_by['name']}"
|
184
185
|
end
|
185
|
-
|
186
|
+
Flapjack::Data::Event.create_acknowledgement(
|
187
|
+
entity_name, check,
|
188
|
+
:summary => "Acknowledged on PagerDuty" + who_text,
|
189
|
+
:redis => @redis)
|
186
190
|
end
|
187
191
|
|
188
192
|
end
|
@@ -53,7 +53,6 @@ module Flapjack
|
|
53
53
|
use Flapjack::CommonLogger, access_logger
|
54
54
|
end
|
55
55
|
|
56
|
-
|
57
56
|
end
|
58
57
|
end
|
59
58
|
|
@@ -191,6 +190,9 @@ module Flapjack
|
|
191
190
|
|
192
191
|
@contacts = entity_check.contacts
|
193
192
|
|
193
|
+
@state_changes = entity_check.historical_states(nil, Time.now.to_i,
|
194
|
+
:order => 'desc', :limit => 20)
|
195
|
+
|
194
196
|
haml :check
|
195
197
|
end
|
196
198
|
|
@@ -203,11 +205,14 @@ module Flapjack
|
|
203
205
|
dur = ChronicDuration.parse(params[:duration] || '')
|
204
206
|
@duration = (dur.nil? || (dur <= 0)) ? (4 * 60 * 60) : dur
|
205
207
|
|
206
|
-
|
207
|
-
return 404 if entity_check.nil?
|
208
|
+
return 404 if get_entity_check(@entity, @check).nil?
|
208
209
|
|
209
|
-
ack =
|
210
|
-
|
210
|
+
ack = Flapjack::Data::Event.create_acknowledgement(
|
211
|
+
@entity, @check,
|
212
|
+
:summary => (@summary || ''),
|
213
|
+
:acknowledgement_id => @acknowledgement_id,
|
214
|
+
:duration => @duration,
|
215
|
+
:redis => redis)
|
211
216
|
|
212
217
|
redirect back
|
213
218
|
end
|
@@ -321,9 +326,9 @@ module Flapjack
|
|
321
326
|
entity_check = Flapjack::Data::EntityCheck.for_entity(entity,
|
322
327
|
check, :redis => redis)
|
323
328
|
latest_notif =
|
324
|
-
{:problem => entity_check.
|
325
|
-
:recovery => entity_check.
|
326
|
-
:acknowledgement => entity_check.
|
329
|
+
{:problem => entity_check.last_notification_for_state(:problem)[:timestamp],
|
330
|
+
:recovery => entity_check.last_notification_for_state(:recovery)[:timestamp],
|
331
|
+
:acknowledgement => entity_check.last_notification_for_state(:acknowledgement)[:timestamp]
|
327
332
|
}.max_by {|n| n[1] || 0}
|
328
333
|
[(entity_check.state || '-'),
|
329
334
|
(entity_check.last_change || '-'),
|