god 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +67 -1
- data/Manifest.txt +3 -4
- data/Rakefile +1 -1
- data/bin/god +19 -1
- data/lib/god.rb +86 -49
- data/lib/god/cli/command.rb +7 -1
- data/lib/god/cli/run.rb +58 -0
- data/lib/god/condition.rb +6 -2
- data/lib/god/conditions/cpu_usage.rb +7 -6
- data/lib/god/conditions/http_response_code.rb +5 -1
- data/lib/god/conditions/memory_usage.rb +7 -6
- data/lib/god/conditions/process_exits.rb +15 -10
- data/lib/god/conditions/process_running.rb +17 -13
- data/lib/god/diagnostics.rb +37 -0
- data/lib/god/driver.rb +108 -0
- data/lib/god/event_handler.rb +41 -1
- data/lib/god/logger.rb +69 -19
- data/lib/god/metric.rb +2 -2
- data/lib/god/process.rb +84 -27
- data/lib/god/task.rb +286 -29
- data/lib/god/timeline.rb +20 -31
- data/lib/god/watch.rb +26 -15
- data/test/configs/child_events/child_events.god +0 -5
- data/test/configs/child_polls/simple_server.rb +1 -1
- data/test/configs/daemon_events/simple_server_stop.rb +2 -0
- data/test/configs/stress/stress.god +1 -1
- data/test/configs/test.rb +12 -28
- data/test/test_condition.rb +8 -0
- data/test/test_conditions_http_response_code.rb +5 -5
- data/test/test_conditions_process_running.rb +6 -4
- data/test/test_driver.rb +11 -0
- data/test/test_event_handler.rb +7 -0
- data/test/test_god.rb +63 -62
- data/test/test_metric.rb +0 -16
- data/test/test_process.rb +29 -1
- data/test/test_task.rb +177 -1
- data/test/test_timeline.rb +2 -1
- data/test/test_watch.rb +24 -6
- metadata +6 -8
- data/lib/god/hub.rb +0 -222
- data/lib/god/timer.rb +0 -87
- data/test/test_hub.rb +0 -240
- data/test/test_timer.rb +0 -69
data/lib/god/metric.rb
CHANGED
data/lib/god/process.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
|
3
1
|
module God
|
4
2
|
class Process
|
5
3
|
WRITES_PID = [:start, :restart]
|
@@ -12,13 +10,13 @@ module God
|
|
12
10
|
@pid_file = nil
|
13
11
|
@tracking_pid = true
|
14
12
|
@user_log = false
|
13
|
+
@pid = nil
|
15
14
|
end
|
16
15
|
|
17
16
|
def alive?
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
rescue Errno::ENOENT
|
17
|
+
if self.pid
|
18
|
+
System::Process.new(self.pid).exists?
|
19
|
+
else
|
22
20
|
false
|
23
21
|
end
|
24
22
|
end
|
@@ -124,6 +122,23 @@ module God
|
|
124
122
|
@pid_file ||= default_pid_file
|
125
123
|
end
|
126
124
|
|
125
|
+
# Fetch the PID from pid_file. If the pid_file does not
|
126
|
+
# exist, then use the PID from the last time it was read.
|
127
|
+
# If it has never been read, then return nil.
|
128
|
+
#
|
129
|
+
# Returns Integer(pid) or nil
|
130
|
+
def pid
|
131
|
+
contents = File.read(self.pid_file).strip rescue ''
|
132
|
+
real_pid = contents =~ /^\d+$/ ? contents.to_i : nil
|
133
|
+
|
134
|
+
if real_pid
|
135
|
+
@pid = real_pid
|
136
|
+
real_pid
|
137
|
+
else
|
138
|
+
@pid
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
127
142
|
def start!
|
128
143
|
call_action(:start)
|
129
144
|
end
|
@@ -136,34 +151,21 @@ module God
|
|
136
151
|
call_action(:restart)
|
137
152
|
end
|
138
153
|
|
139
|
-
def
|
140
|
-
|
141
|
-
::Process.setsid
|
142
|
-
::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
|
143
|
-
::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
|
144
|
-
Dir.chdir "/"
|
145
|
-
$0 = command
|
146
|
-
STDIN.reopen "/dev/null"
|
147
|
-
STDOUT.reopen self.log, "a"
|
148
|
-
STDERR.reopen STDOUT
|
149
|
-
|
150
|
-
# close any other file descriptors
|
151
|
-
3.upto(256){|fd| IO::new(fd).close rescue nil}
|
152
|
-
|
153
|
-
exec command unless command.empty?
|
154
|
-
end
|
154
|
+
def default_pid_file
|
155
|
+
File.join(God.pid_file_directory, "#{self.name}.pid")
|
155
156
|
end
|
156
157
|
|
157
158
|
def call_action(action)
|
158
159
|
command = send(action)
|
159
160
|
|
160
161
|
if action == :stop && command.nil?
|
161
|
-
pid =
|
162
|
+
pid = self.pid
|
162
163
|
name = self.name
|
163
164
|
command = lambda do
|
164
165
|
applog(self, :info, "#{self.name} stop: default lambda killer")
|
165
166
|
|
166
|
-
::Process.kill('
|
167
|
+
::Process.kill('TERM', pid) rescue nil
|
168
|
+
applog(self, :info, "#{self.name} sent SIGTERM")
|
167
169
|
|
168
170
|
# Poll to see if it's dead
|
169
171
|
5.times do
|
@@ -171,6 +173,7 @@ module God
|
|
171
173
|
::Process.kill(0, pid)
|
172
174
|
rescue Errno::ESRCH
|
173
175
|
# It died. Good.
|
176
|
+
applog(self, :info, "#{self.name} process stopped")
|
174
177
|
return
|
175
178
|
end
|
176
179
|
|
@@ -178,6 +181,7 @@ module God
|
|
178
181
|
end
|
179
182
|
|
180
183
|
::Process.kill('KILL', pid) rescue nil
|
184
|
+
applog(self, :info, "#{self.name} still alive; sent SIGKILL")
|
181
185
|
end
|
182
186
|
end
|
183
187
|
|
@@ -208,7 +212,14 @@ module God
|
|
208
212
|
# single fork self-daemonizing processes
|
209
213
|
# we want to wait for them to finish
|
210
214
|
pid = self.spawn(command)
|
211
|
-
::Process.
|
215
|
+
status = ::Process.waitpid2(pid, 0)
|
216
|
+
exit_code = status[1] >> 8
|
217
|
+
|
218
|
+
if exit_code != 0
|
219
|
+
applog(self, :warn, "#{self.name} #{action} command exited with non-zero code = #{exit_code}")
|
220
|
+
end
|
221
|
+
|
222
|
+
ensure_stop if action == :stop
|
212
223
|
end
|
213
224
|
|
214
225
|
if @tracking_pid or (@pid_file.nil? and WRITES_PID.include?(action))
|
@@ -227,8 +238,54 @@ module God
|
|
227
238
|
end
|
228
239
|
end
|
229
240
|
|
230
|
-
|
231
|
-
|
241
|
+
# Fork/exec the given command, returns immediately
|
242
|
+
# +command+ is the String containing the shell command
|
243
|
+
#
|
244
|
+
# Returns the Integer PID of the forked child process
|
245
|
+
def spawn(command)
|
246
|
+
fork do
|
247
|
+
::Process.setsid
|
248
|
+
::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
|
249
|
+
::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
|
250
|
+
Dir.chdir "/"
|
251
|
+
$0 = command
|
252
|
+
STDIN.reopen "/dev/null"
|
253
|
+
STDOUT.reopen self.log, "a"
|
254
|
+
STDERR.reopen STDOUT
|
255
|
+
|
256
|
+
# close any other file descriptors
|
257
|
+
3.upto(256){|fd| IO::new(fd).close rescue nil}
|
258
|
+
|
259
|
+
exec command unless command.empty?
|
260
|
+
end
|
232
261
|
end
|
262
|
+
|
263
|
+
# Ensure that a stop command actually stops the process. Force kill
|
264
|
+
# if necessary.
|
265
|
+
#
|
266
|
+
# Returns nothing
|
267
|
+
def ensure_stop
|
268
|
+
unless self.pid
|
269
|
+
applog(self, :warn, "#{self.name} stop called but pid is uknown")
|
270
|
+
return
|
271
|
+
end
|
272
|
+
|
273
|
+
# Poll to see if it's dead
|
274
|
+
10.times do
|
275
|
+
begin
|
276
|
+
::Process.kill(0, self.pid)
|
277
|
+
rescue Errno::ESRCH
|
278
|
+
# It died. Good.
|
279
|
+
return
|
280
|
+
end
|
281
|
+
|
282
|
+
sleep 1
|
283
|
+
end
|
284
|
+
|
285
|
+
# last resort
|
286
|
+
::Process.kill('KILL', self.pid) rescue nil
|
287
|
+
applog(self, :warn, "#{self.name} process still running 10 seconds after stop command returned. Force killing.")
|
288
|
+
end
|
289
|
+
|
233
290
|
end
|
234
291
|
end
|
data/lib/god/task.rb
CHANGED
@@ -1,16 +1,13 @@
|
|
1
1
|
module God
|
2
2
|
|
3
3
|
class Task
|
4
|
-
attr_accessor :name, :interval, :group, :valid_states, :initial_state
|
4
|
+
attr_accessor :name, :interval, :group, :valid_states, :initial_state, :driver
|
5
5
|
|
6
6
|
attr_writer :autostart
|
7
7
|
def autostart?; @autostart; end
|
8
8
|
|
9
9
|
# api
|
10
|
-
attr_accessor :state, :behaviors, :metrics
|
11
|
-
|
12
|
-
# internal
|
13
|
-
attr_accessor :mutex
|
10
|
+
attr_accessor :state, :behaviors, :metrics, :directory
|
14
11
|
|
15
12
|
def initialize
|
16
13
|
@autostart ||= true
|
@@ -24,8 +21,11 @@ module God
|
|
24
21
|
# the list of conditions for each action
|
25
22
|
self.metrics = {nil => [], :unmonitored => []}
|
26
23
|
|
27
|
-
#
|
28
|
-
self.
|
24
|
+
# the condition -> metric lookup
|
25
|
+
self.directory = {}
|
26
|
+
|
27
|
+
# driver
|
28
|
+
self.driver = Driver.new(self)
|
29
29
|
end
|
30
30
|
|
31
31
|
def prepare
|
@@ -92,6 +92,11 @@ module God
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
+
# populate the condition -> metric directory
|
96
|
+
m.conditions.each do |c|
|
97
|
+
self.directory[c] = m
|
98
|
+
end
|
99
|
+
|
95
100
|
# record the metric
|
96
101
|
self.metrics[start_state] ||= []
|
97
102
|
self.metrics[start_state] << m
|
@@ -105,6 +110,11 @@ module God
|
|
105
110
|
# let the config file define some conditions on the metric
|
106
111
|
yield(m)
|
107
112
|
|
113
|
+
# populate the condition -> metric directory
|
114
|
+
m.conditions.each do |c|
|
115
|
+
self.directory[c] = m
|
116
|
+
end
|
117
|
+
|
108
118
|
# record the metric
|
109
119
|
self.metrics[nil] << m
|
110
120
|
end
|
@@ -116,27 +126,43 @@ module God
|
|
116
126
|
###########################################################################
|
117
127
|
|
118
128
|
# Enable monitoring
|
129
|
+
#
|
130
|
+
# Returns nothing
|
119
131
|
def monitor
|
120
132
|
self.move(self.initial_state)
|
121
133
|
end
|
122
134
|
|
123
135
|
# Disable monitoring
|
136
|
+
#
|
137
|
+
# Returns nothing
|
124
138
|
def unmonitor
|
125
139
|
self.move(:unmonitored)
|
126
140
|
end
|
127
141
|
|
128
|
-
# Move
|
142
|
+
# Move to the given state
|
143
|
+
# +to_state+ is the Symbol representing the state to move to
|
144
|
+
#
|
145
|
+
# Returns Task (self)
|
129
146
|
def move(to_state)
|
130
|
-
self.
|
147
|
+
if Thread.current != self.driver.thread
|
148
|
+
# called from outside Driver
|
149
|
+
|
150
|
+
# send an async message to Driver
|
151
|
+
self.driver.message(:move, [to_state])
|
152
|
+
else
|
153
|
+
# called from within Driver
|
154
|
+
|
155
|
+
# record original info
|
131
156
|
orig_to_state = to_state
|
132
157
|
from_state = self.state
|
133
158
|
|
159
|
+
# log
|
134
160
|
msg = "#{self.name} move '#{from_state}' to '#{to_state}'"
|
135
161
|
applog(self, :info, msg)
|
136
162
|
|
137
163
|
# cleanup from current state
|
164
|
+
self.driver.clear_events
|
138
165
|
self.metrics[from_state].each { |m| m.disable }
|
139
|
-
|
140
166
|
if to_state == :unmonitored
|
141
167
|
self.metrics[nil].each { |m| m.disable }
|
142
168
|
end
|
@@ -160,12 +186,22 @@ module God
|
|
160
186
|
# set state
|
161
187
|
self.state = to_state
|
162
188
|
|
163
|
-
#
|
189
|
+
# broadcast to interested TriggerConditions
|
164
190
|
Trigger.broadcast(self, :state_change, [from_state, orig_to_state])
|
165
191
|
|
166
|
-
#
|
167
|
-
self
|
192
|
+
# log
|
193
|
+
msg = "#{self.name} moved '#{from_state}' to '#{to_state}'"
|
194
|
+
applog(self, :info, msg)
|
168
195
|
end
|
196
|
+
|
197
|
+
self
|
198
|
+
end
|
199
|
+
|
200
|
+
# Notify the Driver that an EventCondition has triggered
|
201
|
+
#
|
202
|
+
# Returns nothing
|
203
|
+
def trigger(condition)
|
204
|
+
self.driver.message(:handle_event, [condition])
|
169
205
|
end
|
170
206
|
|
171
207
|
###########################################################################
|
@@ -189,29 +225,65 @@ module God
|
|
189
225
|
self.send(sym, *args)
|
190
226
|
end
|
191
227
|
|
228
|
+
# Perform the given action
|
192
229
|
# +a+ is the action Symbol
|
193
230
|
# +c+ is the Condition
|
231
|
+
#
|
232
|
+
# Returns Task (self)
|
194
233
|
def action(a, c = nil)
|
195
|
-
if self.
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
234
|
+
if Thread.current != self.driver.thread
|
235
|
+
# called from outside Driver
|
236
|
+
|
237
|
+
# send an async message to Driver
|
238
|
+
self.driver.message(:action, [a, c])
|
239
|
+
else
|
240
|
+
# called from within Driver
|
241
|
+
|
242
|
+
if self.respond_to?(a)
|
243
|
+
command = self.send(a)
|
244
|
+
|
245
|
+
case command
|
246
|
+
when String
|
247
|
+
msg = "#{self.name} #{a}: #{command}"
|
248
|
+
applog(self, :info, msg)
|
249
|
+
|
250
|
+
system(command)
|
251
|
+
when Proc
|
252
|
+
msg = "#{self.name} #{a}: lambda"
|
253
|
+
applog(self, :info, msg)
|
254
|
+
|
255
|
+
command.call
|
256
|
+
else
|
257
|
+
raise NotImplementedError
|
258
|
+
end
|
211
259
|
end
|
212
260
|
end
|
213
261
|
end
|
214
262
|
|
263
|
+
###########################################################################
|
264
|
+
#
|
265
|
+
# Events
|
266
|
+
#
|
267
|
+
###########################################################################
|
268
|
+
|
269
|
+
def attach(condition)
|
270
|
+
case condition
|
271
|
+
when PollCondition
|
272
|
+
self.driver.schedule(condition, 0)
|
273
|
+
when EventCondition, TriggerCondition
|
274
|
+
condition.register
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
278
|
+
def detach(condition)
|
279
|
+
case condition
|
280
|
+
when PollCondition
|
281
|
+
condition.reset
|
282
|
+
when EventCondition, TriggerCondition
|
283
|
+
condition.deregister
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
215
287
|
###########################################################################
|
216
288
|
#
|
217
289
|
# Registration
|
@@ -225,6 +297,191 @@ module God
|
|
225
297
|
def unregister!
|
226
298
|
# override if necessary
|
227
299
|
end
|
300
|
+
|
301
|
+
###########################################################################
|
302
|
+
#
|
303
|
+
# Handlers
|
304
|
+
#
|
305
|
+
###########################################################################
|
306
|
+
|
307
|
+
# Evaluate and handle the given poll condition. Handles logging
|
308
|
+
# notifications, and moving to the new state if necessary
|
309
|
+
# +condition+ is the Condition to handle
|
310
|
+
#
|
311
|
+
# Returns nothing
|
312
|
+
def handle_poll(condition)
|
313
|
+
# lookup metric
|
314
|
+
metric = self.directory[condition]
|
315
|
+
|
316
|
+
# run the test
|
317
|
+
result = condition.test
|
318
|
+
|
319
|
+
# log
|
320
|
+
messages = self.log_line(self, metric, condition, result)
|
321
|
+
|
322
|
+
# notify
|
323
|
+
if condition.notify && self.trigger?(metric, result)
|
324
|
+
self.notify(condition, messages.last)
|
325
|
+
end
|
326
|
+
|
327
|
+
# after-condition
|
328
|
+
condition.after
|
329
|
+
|
330
|
+
# get the destination
|
331
|
+
dest =
|
332
|
+
if result && condition.transition
|
333
|
+
# condition override
|
334
|
+
condition.transition
|
335
|
+
else
|
336
|
+
# regular
|
337
|
+
metric.destination && metric.destination[result]
|
338
|
+
end
|
339
|
+
|
340
|
+
# transition or reschedule
|
341
|
+
if dest
|
342
|
+
# transition
|
343
|
+
begin
|
344
|
+
self.move(dest)
|
345
|
+
rescue EventRegistrationFailedError
|
346
|
+
msg = watch.name + ' Event registration failed, moving back to previous state'
|
347
|
+
applog(watch, :info, msg)
|
348
|
+
|
349
|
+
dest = watch.state
|
350
|
+
retry
|
351
|
+
end
|
352
|
+
else
|
353
|
+
# reschedule
|
354
|
+
self.driver.schedule(condition)
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
358
|
+
# Asynchronously evaluate and handle the given event condition. Handles logging
|
359
|
+
# notifications, and moving to the new state if necessary
|
360
|
+
# +condition+ is the Condition to handle
|
361
|
+
#
|
362
|
+
# Returns nothing
|
363
|
+
def handle_event(condition)
|
364
|
+
# lookup metric
|
365
|
+
metric = self.directory[condition]
|
366
|
+
|
367
|
+
# log
|
368
|
+
messages = self.log_line(self, metric, condition, true)
|
369
|
+
|
370
|
+
# notify
|
371
|
+
if condition.notify && self.trigger?(metric, true)
|
372
|
+
self.notify(condition, messages.last)
|
373
|
+
end
|
374
|
+
|
375
|
+
# get the destination
|
376
|
+
dest =
|
377
|
+
if condition.transition
|
378
|
+
# condition override
|
379
|
+
condition.transition
|
380
|
+
else
|
381
|
+
# regular
|
382
|
+
metric.destination && metric.destination[true]
|
383
|
+
end
|
384
|
+
|
385
|
+
if dest
|
386
|
+
self.move(dest)
|
387
|
+
end
|
388
|
+
end
|
389
|
+
|
390
|
+
# Determine whether a trigger happened
|
391
|
+
# +metric+ is the Metric
|
392
|
+
# +result+ is the result from the condition's test
|
393
|
+
#
|
394
|
+
# Returns Boolean
|
395
|
+
def trigger?(metric, result)
|
396
|
+
metric.destination && metric.destination[result]
|
397
|
+
end
|
398
|
+
|
399
|
+
# Log info about the condition and return the list of messages logged
|
400
|
+
# +watch+ is the Watch
|
401
|
+
# +metric+ is the Metric
|
402
|
+
# +condition+ is the Condition
|
403
|
+
# +result+ is the Boolean result of the condition test evaluation
|
404
|
+
#
|
405
|
+
# Returns String[]
|
406
|
+
def log_line(watch, metric, condition, result)
|
407
|
+
status =
|
408
|
+
if self.trigger?(metric, result)
|
409
|
+
"[trigger]"
|
410
|
+
else
|
411
|
+
"[ok]"
|
412
|
+
end
|
413
|
+
|
414
|
+
messages = []
|
415
|
+
|
416
|
+
# log info if available
|
417
|
+
if condition.info
|
418
|
+
Array(condition.info).each do |condition_info|
|
419
|
+
messages << "#{watch.name} #{status} #{condition_info} (#{condition.base_name})"
|
420
|
+
applog(watch, :info, messages.last)
|
421
|
+
end
|
422
|
+
else
|
423
|
+
messages << "#{watch.name} #{status} (#{condition.base_name})"
|
424
|
+
applog(watch, :info, messages.last)
|
425
|
+
end
|
426
|
+
|
427
|
+
# log
|
428
|
+
debug_message = watch.name + ' ' + condition.base_name + " [#{result}] " + self.dest_desc(metric, condition)
|
429
|
+
applog(watch, :debug, debug_message)
|
430
|
+
|
431
|
+
messages
|
432
|
+
end
|
433
|
+
|
434
|
+
# Format the destination specification for use in debug logging
|
435
|
+
# +metric+ is the Metric
|
436
|
+
# +condition+ is the Condition
|
437
|
+
#
|
438
|
+
# Returns String
|
439
|
+
def dest_desc(metric, condition)
|
440
|
+
if condition.transition
|
441
|
+
{true => condition.transition}.inspect
|
442
|
+
else
|
443
|
+
if metric.destination
|
444
|
+
metric.destination.inspect
|
445
|
+
else
|
446
|
+
'none'
|
447
|
+
end
|
448
|
+
end
|
449
|
+
end
|
450
|
+
|
451
|
+
# Notify all recipients of the given condition with the specified message
|
452
|
+
# +condition+ is the Condition
|
453
|
+
# +message+ is the String message to send
|
454
|
+
#
|
455
|
+
# Returns nothing
|
456
|
+
def notify(condition, message)
|
457
|
+
spec = Contact.normalize(condition.notify)
|
458
|
+
unmatched = []
|
459
|
+
|
460
|
+
# resolve contacts
|
461
|
+
resolved_contacts =
|
462
|
+
spec[:contacts].inject([]) do |acc, contact_name_or_group|
|
463
|
+
cons = Array(God.contacts[contact_name_or_group] || God.contact_groups[contact_name_or_group])
|
464
|
+
unmatched << contact_name_or_group if cons.empty?
|
465
|
+
acc += cons
|
466
|
+
acc
|
467
|
+
end
|
468
|
+
|
469
|
+
# warn about unmatched contacts
|
470
|
+
unless unmatched.empty?
|
471
|
+
msg = "#{condition.watch.name} no matching contacts for '#{unmatched.join(", ")}'"
|
472
|
+
applog(condition.watch, :warn, msg)
|
473
|
+
end
|
474
|
+
|
475
|
+
# notify each contact
|
476
|
+
resolved_contacts.each do |c|
|
477
|
+
host = `hostname`.chomp rescue 'none'
|
478
|
+
c.notify(message, Time.now, spec[:priority], spec[:category], host)
|
479
|
+
|
480
|
+
msg = "#{condition.watch.name} #{c.info ? c.info : "notification sent for contact: #{c.name}"} (#{c.base_name})"
|
481
|
+
|
482
|
+
applog(condition.watch, :info, msg % [])
|
483
|
+
end
|
484
|
+
end
|
228
485
|
end
|
229
486
|
|
230
487
|
end
|