god 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/History.txt +67 -1
  2. data/Manifest.txt +3 -4
  3. data/Rakefile +1 -1
  4. data/bin/god +19 -1
  5. data/lib/god.rb +86 -49
  6. data/lib/god/cli/command.rb +7 -1
  7. data/lib/god/cli/run.rb +58 -0
  8. data/lib/god/condition.rb +6 -2
  9. data/lib/god/conditions/cpu_usage.rb +7 -6
  10. data/lib/god/conditions/http_response_code.rb +5 -1
  11. data/lib/god/conditions/memory_usage.rb +7 -6
  12. data/lib/god/conditions/process_exits.rb +15 -10
  13. data/lib/god/conditions/process_running.rb +17 -13
  14. data/lib/god/diagnostics.rb +37 -0
  15. data/lib/god/driver.rb +108 -0
  16. data/lib/god/event_handler.rb +41 -1
  17. data/lib/god/logger.rb +69 -19
  18. data/lib/god/metric.rb +2 -2
  19. data/lib/god/process.rb +84 -27
  20. data/lib/god/task.rb +286 -29
  21. data/lib/god/timeline.rb +20 -31
  22. data/lib/god/watch.rb +26 -15
  23. data/test/configs/child_events/child_events.god +0 -5
  24. data/test/configs/child_polls/simple_server.rb +1 -1
  25. data/test/configs/daemon_events/simple_server_stop.rb +2 -0
  26. data/test/configs/stress/stress.god +1 -1
  27. data/test/configs/test.rb +12 -28
  28. data/test/test_condition.rb +8 -0
  29. data/test/test_conditions_http_response_code.rb +5 -5
  30. data/test/test_conditions_process_running.rb +6 -4
  31. data/test/test_driver.rb +11 -0
  32. data/test/test_event_handler.rb +7 -0
  33. data/test/test_god.rb +63 -62
  34. data/test/test_metric.rb +0 -16
  35. data/test/test_process.rb +29 -1
  36. data/test/test_task.rb +177 -1
  37. data/test/test_timeline.rb +2 -1
  38. data/test/test_watch.rb +24 -6
  39. metadata +6 -8
  40. data/lib/god/hub.rb +0 -222
  41. data/lib/god/timer.rb +0 -87
  42. data/test/test_hub.rb +0 -240
  43. data/test/test_timer.rb +0 -69
@@ -45,13 +45,13 @@ module God
45
45
 
46
46
  def enable
47
47
  self.conditions.each do |c|
48
- Hub.attach(c, self)
48
+ self.watch.attach(c)
49
49
  end
50
50
  end
51
51
 
52
52
  def disable
53
53
  self.conditions.each do |c|
54
- Hub.detach(c)
54
+ self.watch.detach(c)
55
55
  end
56
56
  end
57
57
  end
@@ -1,5 +1,3 @@
1
- require 'fileutils'
2
-
3
1
  module God
4
2
  class Process
5
3
  WRITES_PID = [:start, :restart]
@@ -12,13 +10,13 @@ module God
12
10
  @pid_file = nil
13
11
  @tracking_pid = true
14
12
  @user_log = false
13
+ @pid = nil
15
14
  end
16
15
 
17
16
  def alive?
18
- begin
19
- pid = File.read(self.pid_file).strip.to_i
20
- System::Process.new(pid).exists?
21
- rescue Errno::ENOENT
17
+ if self.pid
18
+ System::Process.new(self.pid).exists?
19
+ else
22
20
  false
23
21
  end
24
22
  end
@@ -124,6 +122,23 @@ module God
124
122
  @pid_file ||= default_pid_file
125
123
  end
126
124
 
125
+ # Fetch the PID from pid_file. If the pid_file does not
126
+ # exist, then use the PID from the last time it was read.
127
+ # If it has never been read, then return nil.
128
+ #
129
+ # Returns Integer(pid) or nil
130
+ def pid
131
+ contents = File.read(self.pid_file).strip rescue ''
132
+ real_pid = contents =~ /^\d+$/ ? contents.to_i : nil
133
+
134
+ if real_pid
135
+ @pid = real_pid
136
+ real_pid
137
+ else
138
+ @pid
139
+ end
140
+ end
141
+
127
142
  def start!
128
143
  call_action(:start)
129
144
  end
@@ -136,34 +151,21 @@ module God
136
151
  call_action(:restart)
137
152
  end
138
153
 
139
- def spawn(command)
140
- fork do
141
- ::Process.setsid
142
- ::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
143
- ::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
144
- Dir.chdir "/"
145
- $0 = command
146
- STDIN.reopen "/dev/null"
147
- STDOUT.reopen self.log, "a"
148
- STDERR.reopen STDOUT
149
-
150
- # close any other file descriptors
151
- 3.upto(256){|fd| IO::new(fd).close rescue nil}
152
-
153
- exec command unless command.empty?
154
- end
154
+ def default_pid_file
155
+ File.join(God.pid_file_directory, "#{self.name}.pid")
155
156
  end
156
157
 
157
158
  def call_action(action)
158
159
  command = send(action)
159
160
 
160
161
  if action == :stop && command.nil?
161
- pid = File.read(self.pid_file).strip.to_i
162
+ pid = self.pid
162
163
  name = self.name
163
164
  command = lambda do
164
165
  applog(self, :info, "#{self.name} stop: default lambda killer")
165
166
 
166
- ::Process.kill('HUP', pid) rescue nil
167
+ ::Process.kill('TERM', pid) rescue nil
168
+ applog(self, :info, "#{self.name} sent SIGTERM")
167
169
 
168
170
  # Poll to see if it's dead
169
171
  5.times do
@@ -171,6 +173,7 @@ module God
171
173
  ::Process.kill(0, pid)
172
174
  rescue Errno::ESRCH
173
175
  # It died. Good.
176
+ applog(self, :info, "#{self.name} process stopped")
174
177
  return
175
178
  end
176
179
 
@@ -178,6 +181,7 @@ module God
178
181
  end
179
182
 
180
183
  ::Process.kill('KILL', pid) rescue nil
184
+ applog(self, :info, "#{self.name} still alive; sent SIGKILL")
181
185
  end
182
186
  end
183
187
 
@@ -208,7 +212,14 @@ module God
208
212
  # single fork self-daemonizing processes
209
213
  # we want to wait for them to finish
210
214
  pid = self.spawn(command)
211
- ::Process.waitpid(pid, 0)
215
+ status = ::Process.waitpid2(pid, 0)
216
+ exit_code = status[1] >> 8
217
+
218
+ if exit_code != 0
219
+ applog(self, :warn, "#{self.name} #{action} command exited with non-zero code = #{exit_code}")
220
+ end
221
+
222
+ ensure_stop if action == :stop
212
223
  end
213
224
 
214
225
  if @tracking_pid or (@pid_file.nil? and WRITES_PID.include?(action))
@@ -227,8 +238,54 @@ module God
227
238
  end
228
239
  end
229
240
 
230
- def default_pid_file
231
- File.join(God.pid_file_directory, "#{self.name}.pid")
241
+ # Fork/exec the given command, returns immediately
242
+ # +command+ is the String containing the shell command
243
+ #
244
+ # Returns nothing
245
+ def spawn(command)
246
+ fork do
247
+ ::Process.setsid
248
+ ::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
249
+ ::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
250
+ Dir.chdir "/"
251
+ $0 = command
252
+ STDIN.reopen "/dev/null"
253
+ STDOUT.reopen self.log, "a"
254
+ STDERR.reopen STDOUT
255
+
256
+ # close any other file descriptors
257
+ 3.upto(256){|fd| IO::new(fd).close rescue nil}
258
+
259
+ exec command unless command.empty?
260
+ end
232
261
  end
262
+
263
+ # Ensure that a stop command actually stops the process. Force kill
264
+ # if necessary.
265
+ #
266
+ # Returns nothing
267
+ def ensure_stop
268
+ unless self.pid
269
+ applog(self, :warn, "#{self.name} stop called but pid is uknown")
270
+ return
271
+ end
272
+
273
+ # Poll to see if it's dead
274
+ 10.times do
275
+ begin
276
+ ::Process.kill(0, self.pid)
277
+ rescue Errno::ESRCH
278
+ # It died. Good.
279
+ return
280
+ end
281
+
282
+ sleep 1
283
+ end
284
+
285
+ # last resort
286
+ ::Process.kill('KILL', self.pid) rescue nil
287
+ applog(self, :warn, "#{self.name} process still running 10 seconds after stop command returned. Force killing.")
288
+ end
289
+
233
290
  end
234
291
  end
@@ -1,16 +1,13 @@
1
1
  module God
2
2
 
3
3
  class Task
4
- attr_accessor :name, :interval, :group, :valid_states, :initial_state
4
+ attr_accessor :name, :interval, :group, :valid_states, :initial_state, :driver
5
5
 
6
6
  attr_writer :autostart
7
7
  def autostart?; @autostart; end
8
8
 
9
9
  # api
10
- attr_accessor :state, :behaviors, :metrics
11
-
12
- # internal
13
- attr_accessor :mutex
10
+ attr_accessor :state, :behaviors, :metrics, :directory
14
11
 
15
12
  def initialize
16
13
  @autostart ||= true
@@ -24,8 +21,11 @@ module God
24
21
  # the list of conditions for each action
25
22
  self.metrics = {nil => [], :unmonitored => []}
26
23
 
27
- # mutex
28
- self.mutex = Monitor.new
24
+ # the condition -> metric lookup
25
+ self.directory = {}
26
+
27
+ # driver
28
+ self.driver = Driver.new(self)
29
29
  end
30
30
 
31
31
  def prepare
@@ -92,6 +92,11 @@ module God
92
92
  end
93
93
  end
94
94
 
95
+ # populate the condition -> metric directory
96
+ m.conditions.each do |c|
97
+ self.directory[c] = m
98
+ end
99
+
95
100
  # record the metric
96
101
  self.metrics[start_state] ||= []
97
102
  self.metrics[start_state] << m
@@ -105,6 +110,11 @@ module God
105
110
  # let the config file define some conditions on the metric
106
111
  yield(m)
107
112
 
113
+ # populate the condition -> metric directory
114
+ m.conditions.each do |c|
115
+ self.directory[c] = m
116
+ end
117
+
108
118
  # record the metric
109
119
  self.metrics[nil] << m
110
120
  end
@@ -116,27 +126,43 @@ module God
116
126
  ###########################################################################
117
127
 
118
128
  # Enable monitoring
129
+ #
130
+ # Returns nothing
119
131
  def monitor
120
132
  self.move(self.initial_state)
121
133
  end
122
134
 
123
135
  # Disable monitoring
136
+ #
137
+ # Returns nothing
124
138
  def unmonitor
125
139
  self.move(:unmonitored)
126
140
  end
127
141
 
128
- # Move from one state to another
142
+ # Move to the given state
143
+ # +to_state+ is the Symbol representing the state to move to
144
+ #
145
+ # Returns Task (self)
129
146
  def move(to_state)
130
- self.mutex.synchronize do
147
+ if Thread.current != self.driver.thread
148
+ # called from outside Driver
149
+
150
+ # send an async message to Driver
151
+ self.driver.message(:move, [to_state])
152
+ else
153
+ # called from within Driver
154
+
155
+ # record original info
131
156
  orig_to_state = to_state
132
157
  from_state = self.state
133
158
 
159
+ # log
134
160
  msg = "#{self.name} move '#{from_state}' to '#{to_state}'"
135
161
  applog(self, :info, msg)
136
162
 
137
163
  # cleanup from current state
164
+ self.driver.clear_events
138
165
  self.metrics[from_state].each { |m| m.disable }
139
-
140
166
  if to_state == :unmonitored
141
167
  self.metrics[nil].each { |m| m.disable }
142
168
  end
@@ -160,12 +186,22 @@ module God
160
186
  # set state
161
187
  self.state = to_state
162
188
 
163
- # trigger
189
+ # broadcast to interested TriggerConditions
164
190
  Trigger.broadcast(self, :state_change, [from_state, orig_to_state])
165
191
 
166
- # return self
167
- self
192
+ # log
193
+ msg = "#{self.name} moved '#{from_state}' to '#{to_state}'"
194
+ applog(self, :info, msg)
168
195
  end
196
+
197
+ self
198
+ end
199
+
200
+ # Notify the Driver that an EventCondition has triggered
201
+ #
202
+ # Returns nothing
203
+ def trigger(condition)
204
+ self.driver.message(:handle_event, [condition])
169
205
  end
170
206
 
171
207
  ###########################################################################
@@ -189,29 +225,65 @@ module God
189
225
  self.send(sym, *args)
190
226
  end
191
227
 
228
+ # Perform the given action
192
229
  # +a+ is the action Symbol
193
230
  # +c+ is the Condition
231
+ #
232
+ # Returns Task (self)
194
233
  def action(a, c = nil)
195
- if self.respond_to?(a)
196
- command = self.send(a)
197
-
198
- case command
199
- when String
200
- msg = "#{self.name} #{a}: #{command}"
201
- applog(self, :info, msg)
202
-
203
- system(command)
204
- when Proc
205
- msg = "#{self.name} #{a}: lambda"
206
- applog(self, :info, msg)
207
-
208
- command.call
209
- else
210
- raise NotImplementedError
234
+ if Thread.current != self.driver.thread
235
+ # called from outside Driver
236
+
237
+ # send an async message to Driver
238
+ self.driver.message(:action, [a, c])
239
+ else
240
+ # called from within Driver
241
+
242
+ if self.respond_to?(a)
243
+ command = self.send(a)
244
+
245
+ case command
246
+ when String
247
+ msg = "#{self.name} #{a}: #{command}"
248
+ applog(self, :info, msg)
249
+
250
+ system(command)
251
+ when Proc
252
+ msg = "#{self.name} #{a}: lambda"
253
+ applog(self, :info, msg)
254
+
255
+ command.call
256
+ else
257
+ raise NotImplementedError
258
+ end
211
259
  end
212
260
  end
213
261
  end
214
262
 
263
+ ###########################################################################
264
+ #
265
+ # Events
266
+ #
267
+ ###########################################################################
268
+
269
+ def attach(condition)
270
+ case condition
271
+ when PollCondition
272
+ self.driver.schedule(condition, 0)
273
+ when EventCondition, TriggerCondition
274
+ condition.register
275
+ end
276
+ end
277
+
278
+ def detach(condition)
279
+ case condition
280
+ when PollCondition
281
+ condition.reset
282
+ when EventCondition, TriggerCondition
283
+ condition.deregister
284
+ end
285
+ end
286
+
215
287
  ###########################################################################
216
288
  #
217
289
  # Registration
@@ -225,6 +297,191 @@ module God
225
297
  def unregister!
226
298
  # override if necessary
227
299
  end
300
+
301
+ ###########################################################################
302
+ #
303
+ # Handlers
304
+ #
305
+ ###########################################################################
306
+
307
+ # Evaluate and handle the given poll condition. Handles logging
308
+ # notifications, and moving to the new state if necessary
309
+ # +condition+ is the Condition to handle
310
+ #
311
+ # Returns nothing
312
+ def handle_poll(condition)
313
+ # lookup metric
314
+ metric = self.directory[condition]
315
+
316
+ # run the test
317
+ result = condition.test
318
+
319
+ # log
320
+ messages = self.log_line(self, metric, condition, result)
321
+
322
+ # notify
323
+ if condition.notify && self.trigger?(metric, result)
324
+ self.notify(condition, messages.last)
325
+ end
326
+
327
+ # after-condition
328
+ condition.after
329
+
330
+ # get the destination
331
+ dest =
332
+ if result && condition.transition
333
+ # condition override
334
+ condition.transition
335
+ else
336
+ # regular
337
+ metric.destination && metric.destination[result]
338
+ end
339
+
340
+ # transition or reschedule
341
+ if dest
342
+ # transition
343
+ begin
344
+ self.move(dest)
345
+ rescue EventRegistrationFailedError
346
+ msg = watch.name + ' Event registration failed, moving back to previous state'
347
+ applog(watch, :info, msg)
348
+
349
+ dest = watch.state
350
+ retry
351
+ end
352
+ else
353
+ # reschedule
354
+ self.driver.schedule(condition)
355
+ end
356
+ end
357
+
358
+ # Asynchronously evaluate and handle the given event condition. Handles logging
359
+ # notifications, and moving to the new state if necessary
360
+ # +condition+ is the Condition to handle
361
+ #
362
+ # Returns nothing
363
+ def handle_event(condition)
364
+ # lookup metric
365
+ metric = self.directory[condition]
366
+
367
+ # log
368
+ messages = self.log_line(self, metric, condition, true)
369
+
370
+ # notify
371
+ if condition.notify && self.trigger?(metric, true)
372
+ self.notify(condition, messages.last)
373
+ end
374
+
375
+ # get the destination
376
+ dest =
377
+ if condition.transition
378
+ # condition override
379
+ condition.transition
380
+ else
381
+ # regular
382
+ metric.destination && metric.destination[true]
383
+ end
384
+
385
+ if dest
386
+ self.move(dest)
387
+ end
388
+ end
389
+
390
+ # Determine whether a trigger happened
391
+ # +metric+ is the Metric
392
+ # +result+ is the result from the condition's test
393
+ #
394
+ # Returns Boolean
395
+ def trigger?(metric, result)
396
+ metric.destination && metric.destination[result]
397
+ end
398
+
399
+ # Log info about the condition and return the list of messages logged
400
+ # +watch+ is the Watch
401
+ # +metric+ is the Metric
402
+ # +condition+ is the Condition
403
+ # +result+ is the Boolean result of the condition test evaluation
404
+ #
405
+ # Returns String[]
406
+ def log_line(watch, metric, condition, result)
407
+ status =
408
+ if self.trigger?(metric, result)
409
+ "[trigger]"
410
+ else
411
+ "[ok]"
412
+ end
413
+
414
+ messages = []
415
+
416
+ # log info if available
417
+ if condition.info
418
+ Array(condition.info).each do |condition_info|
419
+ messages << "#{watch.name} #{status} #{condition_info} (#{condition.base_name})"
420
+ applog(watch, :info, messages.last)
421
+ end
422
+ else
423
+ messages << "#{watch.name} #{status} (#{condition.base_name})"
424
+ applog(watch, :info, messages.last)
425
+ end
426
+
427
+ # log
428
+ debug_message = watch.name + ' ' + condition.base_name + " [#{result}] " + self.dest_desc(metric, condition)
429
+ applog(watch, :debug, debug_message)
430
+
431
+ messages
432
+ end
433
+
434
+ # Format the destination specification for use in debug logging
435
+ # +metric+ is the Metric
436
+ # +condition+ is the Condition
437
+ #
438
+ # Returns String
439
+ def dest_desc(metric, condition)
440
+ if condition.transition
441
+ {true => condition.transition}.inspect
442
+ else
443
+ if metric.destination
444
+ metric.destination.inspect
445
+ else
446
+ 'none'
447
+ end
448
+ end
449
+ end
450
+
451
+ # Notify all recipients of the given condition with the specified message
452
+ # +condition+ is the Condition
453
+ # +message+ is the String message to send
454
+ #
455
+ # Returns nothing
456
+ def notify(condition, message)
457
+ spec = Contact.normalize(condition.notify)
458
+ unmatched = []
459
+
460
+ # resolve contacts
461
+ resolved_contacts =
462
+ spec[:contacts].inject([]) do |acc, contact_name_or_group|
463
+ cons = Array(God.contacts[contact_name_or_group] || God.contact_groups[contact_name_or_group])
464
+ unmatched << contact_name_or_group if cons.empty?
465
+ acc += cons
466
+ acc
467
+ end
468
+
469
+ # warn about unmatched contacts
470
+ unless unmatched.empty?
471
+ msg = "#{condition.watch.name} no matching contacts for '#{unmatched.join(", ")}'"
472
+ applog(condition.watch, :warn, msg)
473
+ end
474
+
475
+ # notify each contact
476
+ resolved_contacts.each do |c|
477
+ host = `hostname`.chomp rescue 'none'
478
+ c.notify(message, Time.now, spec[:priority], spec[:category], host)
479
+
480
+ msg = "#{condition.watch.name} #{c.info ? c.info : "notification sent for contact: #{c.name}"} (#{c.base_name})"
481
+
482
+ applog(condition.watch, :info, msg % [])
483
+ end
484
+ end
228
485
  end
229
486
 
230
487
  end