god 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/History.txt +67 -1
  2. data/Manifest.txt +3 -4
  3. data/Rakefile +1 -1
  4. data/bin/god +19 -1
  5. data/lib/god.rb +86 -49
  6. data/lib/god/cli/command.rb +7 -1
  7. data/lib/god/cli/run.rb +58 -0
  8. data/lib/god/condition.rb +6 -2
  9. data/lib/god/conditions/cpu_usage.rb +7 -6
  10. data/lib/god/conditions/http_response_code.rb +5 -1
  11. data/lib/god/conditions/memory_usage.rb +7 -6
  12. data/lib/god/conditions/process_exits.rb +15 -10
  13. data/lib/god/conditions/process_running.rb +17 -13
  14. data/lib/god/diagnostics.rb +37 -0
  15. data/lib/god/driver.rb +108 -0
  16. data/lib/god/event_handler.rb +41 -1
  17. data/lib/god/logger.rb +69 -19
  18. data/lib/god/metric.rb +2 -2
  19. data/lib/god/process.rb +84 -27
  20. data/lib/god/task.rb +286 -29
  21. data/lib/god/timeline.rb +20 -31
  22. data/lib/god/watch.rb +26 -15
  23. data/test/configs/child_events/child_events.god +0 -5
  24. data/test/configs/child_polls/simple_server.rb +1 -1
  25. data/test/configs/daemon_events/simple_server_stop.rb +2 -0
  26. data/test/configs/stress/stress.god +1 -1
  27. data/test/configs/test.rb +12 -28
  28. data/test/test_condition.rb +8 -0
  29. data/test/test_conditions_http_response_code.rb +5 -5
  30. data/test/test_conditions_process_running.rb +6 -4
  31. data/test/test_driver.rb +11 -0
  32. data/test/test_event_handler.rb +7 -0
  33. data/test/test_god.rb +63 -62
  34. data/test/test_metric.rb +0 -16
  35. data/test/test_process.rb +29 -1
  36. data/test/test_task.rb +177 -1
  37. data/test/test_timeline.rb +2 -1
  38. data/test/test_watch.rb +24 -6
  39. metadata +6 -8
  40. data/lib/god/hub.rb +0 -222
  41. data/lib/god/timer.rb +0 -87
  42. data/test/test_hub.rb +0 -240
  43. data/test/test_timer.rb +0 -69
@@ -45,13 +45,13 @@ module God
45
45
 
46
46
  def enable
47
47
  self.conditions.each do |c|
48
- Hub.attach(c, self)
48
+ self.watch.attach(c)
49
49
  end
50
50
  end
51
51
 
52
52
  def disable
53
53
  self.conditions.each do |c|
54
- Hub.detach(c)
54
+ self.watch.detach(c)
55
55
  end
56
56
  end
57
57
  end
@@ -1,5 +1,3 @@
1
- require 'fileutils'
2
-
3
1
  module God
4
2
  class Process
5
3
  WRITES_PID = [:start, :restart]
@@ -12,13 +10,13 @@ module God
12
10
  @pid_file = nil
13
11
  @tracking_pid = true
14
12
  @user_log = false
13
+ @pid = nil
15
14
  end
16
15
 
17
16
  def alive?
18
- begin
19
- pid = File.read(self.pid_file).strip.to_i
20
- System::Process.new(pid).exists?
21
- rescue Errno::ENOENT
17
+ if self.pid
18
+ System::Process.new(self.pid).exists?
19
+ else
22
20
  false
23
21
  end
24
22
  end
@@ -124,6 +122,23 @@ module God
124
122
  @pid_file ||= default_pid_file
125
123
  end
126
124
 
125
+ # Fetch the PID from pid_file. If the pid_file does not
126
+ # exist, then use the PID from the last time it was read.
127
+ # If it has never been read, then return nil.
128
+ #
129
+ # Returns Integer(pid) or nil
130
+ def pid
131
+ contents = File.read(self.pid_file).strip rescue ''
132
+ real_pid = contents =~ /^\d+$/ ? contents.to_i : nil
133
+
134
+ if real_pid
135
+ @pid = real_pid
136
+ real_pid
137
+ else
138
+ @pid
139
+ end
140
+ end
141
+
127
142
  def start!
128
143
  call_action(:start)
129
144
  end
@@ -136,34 +151,21 @@ module God
136
151
  call_action(:restart)
137
152
  end
138
153
 
139
- def spawn(command)
140
- fork do
141
- ::Process.setsid
142
- ::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
143
- ::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
144
- Dir.chdir "/"
145
- $0 = command
146
- STDIN.reopen "/dev/null"
147
- STDOUT.reopen self.log, "a"
148
- STDERR.reopen STDOUT
149
-
150
- # close any other file descriptors
151
- 3.upto(256){|fd| IO::new(fd).close rescue nil}
152
-
153
- exec command unless command.empty?
154
- end
154
+ def default_pid_file
155
+ File.join(God.pid_file_directory, "#{self.name}.pid")
155
156
  end
156
157
 
157
158
  def call_action(action)
158
159
  command = send(action)
159
160
 
160
161
  if action == :stop && command.nil?
161
- pid = File.read(self.pid_file).strip.to_i
162
+ pid = self.pid
162
163
  name = self.name
163
164
  command = lambda do
164
165
  applog(self, :info, "#{self.name} stop: default lambda killer")
165
166
 
166
- ::Process.kill('HUP', pid) rescue nil
167
+ ::Process.kill('TERM', pid) rescue nil
168
+ applog(self, :info, "#{self.name} sent SIGTERM")
167
169
 
168
170
  # Poll to see if it's dead
169
171
  5.times do
@@ -171,6 +173,7 @@ module God
171
173
  ::Process.kill(0, pid)
172
174
  rescue Errno::ESRCH
173
175
  # It died. Good.
176
+ applog(self, :info, "#{self.name} process stopped")
174
177
  return
175
178
  end
176
179
 
@@ -178,6 +181,7 @@ module God
178
181
  end
179
182
 
180
183
  ::Process.kill('KILL', pid) rescue nil
184
+ applog(self, :info, "#{self.name} still alive; sent SIGKILL")
181
185
  end
182
186
  end
183
187
 
@@ -208,7 +212,14 @@ module God
208
212
  # single fork self-daemonizing processes
209
213
  # we want to wait for them to finish
210
214
  pid = self.spawn(command)
211
- ::Process.waitpid(pid, 0)
215
+ status = ::Process.waitpid2(pid, 0)
216
+ exit_code = status[1] >> 8
217
+
218
+ if exit_code != 0
219
+ applog(self, :warn, "#{self.name} #{action} command exited with non-zero code = #{exit_code}")
220
+ end
221
+
222
+ ensure_stop if action == :stop
212
223
  end
213
224
 
214
225
  if @tracking_pid or (@pid_file.nil? and WRITES_PID.include?(action))
@@ -227,8 +238,54 @@ module God
227
238
  end
228
239
  end
229
240
 
230
- def default_pid_file
231
- File.join(God.pid_file_directory, "#{self.name}.pid")
241
+ # Fork/exec the given command, returns immediately
242
+ # +command+ is the String containing the shell command
243
+ #
244
+ # Returns the Integer PID of the forked child process
245
+ def spawn(command)
246
+ fork do
247
+ ::Process.setsid
248
+ ::Process::Sys.setgid(Etc.getgrnam(self.gid).gid) if self.gid
249
+ ::Process::Sys.setuid(Etc.getpwnam(self.uid).uid) if self.uid
250
+ Dir.chdir "/"
251
+ $0 = command
252
+ STDIN.reopen "/dev/null"
253
+ STDOUT.reopen self.log, "a"
254
+ STDERR.reopen STDOUT
255
+
256
+ # close any other file descriptors
257
+ 3.upto(256){|fd| IO::new(fd).close rescue nil}
258
+
259
+ exec command unless command.empty?
260
+ end
232
261
  end
262
+
263
+ # Ensure that a stop command actually stops the process. Force kill
264
+ # if necessary.
265
+ #
266
+ # Returns nothing
267
+ def ensure_stop
268
+ unless self.pid
269
+ applog(self, :warn, "#{self.name} stop called but pid is uknown")
270
+ return
271
+ end
272
+
273
+ # Poll to see if it's dead
274
+ 10.times do
275
+ begin
276
+ ::Process.kill(0, self.pid)
277
+ rescue Errno::ESRCH
278
+ # It died. Good.
279
+ return
280
+ end
281
+
282
+ sleep 1
283
+ end
284
+
285
+ # last resort
286
+ ::Process.kill('KILL', self.pid) rescue nil
287
+ applog(self, :warn, "#{self.name} process still running 10 seconds after stop command returned. Force killing.")
288
+ end
289
+
233
290
  end
234
291
  end
@@ -1,16 +1,13 @@
1
1
  module God
2
2
 
3
3
  class Task
4
- attr_accessor :name, :interval, :group, :valid_states, :initial_state
4
+ attr_accessor :name, :interval, :group, :valid_states, :initial_state, :driver
5
5
 
6
6
  attr_writer :autostart
7
7
  def autostart?; @autostart; end
8
8
 
9
9
  # api
10
- attr_accessor :state, :behaviors, :metrics
11
-
12
- # internal
13
- attr_accessor :mutex
10
+ attr_accessor :state, :behaviors, :metrics, :directory
14
11
 
15
12
  def initialize
16
13
  @autostart ||= true
@@ -24,8 +21,11 @@ module God
24
21
  # the list of conditions for each action
25
22
  self.metrics = {nil => [], :unmonitored => []}
26
23
 
27
- # mutex
28
- self.mutex = Monitor.new
24
+ # the condition -> metric lookup
25
+ self.directory = {}
26
+
27
+ # driver
28
+ self.driver = Driver.new(self)
29
29
  end
30
30
 
31
31
  def prepare
@@ -92,6 +92,11 @@ module God
92
92
  end
93
93
  end
94
94
 
95
+ # populate the condition -> metric directory
96
+ m.conditions.each do |c|
97
+ self.directory[c] = m
98
+ end
99
+
95
100
  # record the metric
96
101
  self.metrics[start_state] ||= []
97
102
  self.metrics[start_state] << m
@@ -105,6 +110,11 @@ module God
105
110
  # let the config file define some conditions on the metric
106
111
  yield(m)
107
112
 
113
+ # populate the condition -> metric directory
114
+ m.conditions.each do |c|
115
+ self.directory[c] = m
116
+ end
117
+
108
118
  # record the metric
109
119
  self.metrics[nil] << m
110
120
  end
@@ -116,27 +126,43 @@ module God
116
126
  ###########################################################################
117
127
 
118
128
  # Enable monitoring
129
+ #
130
+ # Returns nothing
119
131
  def monitor
120
132
  self.move(self.initial_state)
121
133
  end
122
134
 
123
135
  # Disable monitoring
136
+ #
137
+ # Returns nothing
124
138
  def unmonitor
125
139
  self.move(:unmonitored)
126
140
  end
127
141
 
128
- # Move from one state to another
142
+ # Move to the given state
143
+ # +to_state+ is the Symbol representing the state to move to
144
+ #
145
+ # Returns Task (self)
129
146
  def move(to_state)
130
- self.mutex.synchronize do
147
+ if Thread.current != self.driver.thread
148
+ # called from outside Driver
149
+
150
+ # send an async message to Driver
151
+ self.driver.message(:move, [to_state])
152
+ else
153
+ # called from within Driver
154
+
155
+ # record original info
131
156
  orig_to_state = to_state
132
157
  from_state = self.state
133
158
 
159
+ # log
134
160
  msg = "#{self.name} move '#{from_state}' to '#{to_state}'"
135
161
  applog(self, :info, msg)
136
162
 
137
163
  # cleanup from current state
164
+ self.driver.clear_events
138
165
  self.metrics[from_state].each { |m| m.disable }
139
-
140
166
  if to_state == :unmonitored
141
167
  self.metrics[nil].each { |m| m.disable }
142
168
  end
@@ -160,12 +186,22 @@ module God
160
186
  # set state
161
187
  self.state = to_state
162
188
 
163
- # trigger
189
+ # broadcast to interested TriggerConditions
164
190
  Trigger.broadcast(self, :state_change, [from_state, orig_to_state])
165
191
 
166
- # return self
167
- self
192
+ # log
193
+ msg = "#{self.name} moved '#{from_state}' to '#{to_state}'"
194
+ applog(self, :info, msg)
168
195
  end
196
+
197
+ self
198
+ end
199
+
200
+ # Notify the Driver that an EventCondition has triggered
201
+ #
202
+ # Returns nothing
203
+ def trigger(condition)
204
+ self.driver.message(:handle_event, [condition])
169
205
  end
170
206
 
171
207
  ###########################################################################
@@ -189,29 +225,65 @@ module God
189
225
  self.send(sym, *args)
190
226
  end
191
227
 
228
+ # Perform the given action
192
229
  # +a+ is the action Symbol
193
230
  # +c+ is the Condition
231
+ #
232
+ # Returns Task (self)
194
233
  def action(a, c = nil)
195
- if self.respond_to?(a)
196
- command = self.send(a)
197
-
198
- case command
199
- when String
200
- msg = "#{self.name} #{a}: #{command}"
201
- applog(self, :info, msg)
202
-
203
- system(command)
204
- when Proc
205
- msg = "#{self.name} #{a}: lambda"
206
- applog(self, :info, msg)
207
-
208
- command.call
209
- else
210
- raise NotImplementedError
234
+ if Thread.current != self.driver.thread
235
+ # called from outside Driver
236
+
237
+ # send an async message to Driver
238
+ self.driver.message(:action, [a, c])
239
+ else
240
+ # called from within Driver
241
+
242
+ if self.respond_to?(a)
243
+ command = self.send(a)
244
+
245
+ case command
246
+ when String
247
+ msg = "#{self.name} #{a}: #{command}"
248
+ applog(self, :info, msg)
249
+
250
+ system(command)
251
+ when Proc
252
+ msg = "#{self.name} #{a}: lambda"
253
+ applog(self, :info, msg)
254
+
255
+ command.call
256
+ else
257
+ raise NotImplementedError
258
+ end
211
259
  end
212
260
  end
213
261
  end
214
262
 
263
+ ###########################################################################
264
+ #
265
+ # Events
266
+ #
267
+ ###########################################################################
268
+
269
+ def attach(condition)
270
+ case condition
271
+ when PollCondition
272
+ self.driver.schedule(condition, 0)
273
+ when EventCondition, TriggerCondition
274
+ condition.register
275
+ end
276
+ end
277
+
278
+ def detach(condition)
279
+ case condition
280
+ when PollCondition
281
+ condition.reset
282
+ when EventCondition, TriggerCondition
283
+ condition.deregister
284
+ end
285
+ end
286
+
215
287
  ###########################################################################
216
288
  #
217
289
  # Registration
@@ -225,6 +297,191 @@ module God
225
297
  def unregister!
226
298
  # override if necessary
227
299
  end
300
+
301
+ ###########################################################################
302
+ #
303
+ # Handlers
304
+ #
305
+ ###########################################################################
306
+
307
+ # Evaluate and handle the given poll condition. Handles logging,
308
+ # notifications, and moving to the new state if necessary
309
+ # +condition+ is the Condition to handle
310
+ #
311
+ # Returns nothing
312
+ def handle_poll(condition)
313
+ # lookup metric
314
+ metric = self.directory[condition]
315
+
316
+ # run the test
317
+ result = condition.test
318
+
319
+ # log
320
+ messages = self.log_line(self, metric, condition, result)
321
+
322
+ # notify
323
+ if condition.notify && self.trigger?(metric, result)
324
+ self.notify(condition, messages.last)
325
+ end
326
+
327
+ # after-condition
328
+ condition.after
329
+
330
+ # get the destination
331
+ dest =
332
+ if result && condition.transition
333
+ # condition override
334
+ condition.transition
335
+ else
336
+ # regular
337
+ metric.destination && metric.destination[result]
338
+ end
339
+
340
+ # transition or reschedule
341
+ if dest
342
+ # transition
343
+ begin
344
+ self.move(dest)
345
+ rescue EventRegistrationFailedError
346
+ msg = watch.name + ' Event registration failed, moving back to previous state'
347
+ applog(watch, :info, msg)
348
+
349
+ dest = watch.state
350
+ retry
351
+ end
352
+ else
353
+ # reschedule
354
+ self.driver.schedule(condition)
355
+ end
356
+ end
357
+
358
+ # Asynchronously evaluate and handle the given event condition. Handles logging,
359
+ # notifications, and moving to the new state if necessary
360
+ # +condition+ is the Condition to handle
361
+ #
362
+ # Returns nothing
363
+ def handle_event(condition)
364
+ # lookup metric
365
+ metric = self.directory[condition]
366
+
367
+ # log
368
+ messages = self.log_line(self, metric, condition, true)
369
+
370
+ # notify
371
+ if condition.notify && self.trigger?(metric, true)
372
+ self.notify(condition, messages.last)
373
+ end
374
+
375
+ # get the destination
376
+ dest =
377
+ if condition.transition
378
+ # condition override
379
+ condition.transition
380
+ else
381
+ # regular
382
+ metric.destination && metric.destination[true]
383
+ end
384
+
385
+ if dest
386
+ self.move(dest)
387
+ end
388
+ end
389
+
390
+ # Determine whether a trigger happened
391
+ # +metric+ is the Metric
392
+ # +result+ is the result from the condition's test
393
+ #
394
+ # Returns Boolean
395
+ def trigger?(metric, result)
396
+ metric.destination && metric.destination[result]
397
+ end
398
+
399
+ # Log info about the condition and return the list of messages logged
400
+ # +watch+ is the Watch
401
+ # +metric+ is the Metric
402
+ # +condition+ is the Condition
403
+ # +result+ is the Boolean result of the condition test evaluation
404
+ #
405
+ # Returns String[]
406
+ def log_line(watch, metric, condition, result)
407
+ status =
408
+ if self.trigger?(metric, result)
409
+ "[trigger]"
410
+ else
411
+ "[ok]"
412
+ end
413
+
414
+ messages = []
415
+
416
+ # log info if available
417
+ if condition.info
418
+ Array(condition.info).each do |condition_info|
419
+ messages << "#{watch.name} #{status} #{condition_info} (#{condition.base_name})"
420
+ applog(watch, :info, messages.last)
421
+ end
422
+ else
423
+ messages << "#{watch.name} #{status} (#{condition.base_name})"
424
+ applog(watch, :info, messages.last)
425
+ end
426
+
427
+ # log
428
+ debug_message = watch.name + ' ' + condition.base_name + " [#{result}] " + self.dest_desc(metric, condition)
429
+ applog(watch, :debug, debug_message)
430
+
431
+ messages
432
+ end
433
+
434
+ # Format the destination specification for use in debug logging
435
+ # +metric+ is the Metric
436
+ # +condition+ is the Condition
437
+ #
438
+ # Returns String
439
+ def dest_desc(metric, condition)
440
+ if condition.transition
441
+ {true => condition.transition}.inspect
442
+ else
443
+ if metric.destination
444
+ metric.destination.inspect
445
+ else
446
+ 'none'
447
+ end
448
+ end
449
+ end
450
+
451
+ # Notify all recipients of the given condition with the specified message
452
+ # +condition+ is the Condition
453
+ # +message+ is the String message to send
454
+ #
455
+ # Returns nothing
456
+ def notify(condition, message)
457
+ spec = Contact.normalize(condition.notify)
458
+ unmatched = []
459
+
460
+ # resolve contacts
461
+ resolved_contacts =
462
+ spec[:contacts].inject([]) do |acc, contact_name_or_group|
463
+ cons = Array(God.contacts[contact_name_or_group] || God.contact_groups[contact_name_or_group])
464
+ unmatched << contact_name_or_group if cons.empty?
465
+ acc += cons
466
+ acc
467
+ end
468
+
469
+ # warn about unmatched contacts
470
+ unless unmatched.empty?
471
+ msg = "#{condition.watch.name} no matching contacts for '#{unmatched.join(", ")}'"
472
+ applog(condition.watch, :warn, msg)
473
+ end
474
+
475
+ # notify each contact
476
+ resolved_contacts.each do |c|
477
+ host = `hostname`.chomp rescue 'none'
478
+ c.notify(message, Time.now, spec[:priority], spec[:category], host)
479
+
480
+ msg = "#{condition.watch.name} #{c.info ? c.info : "notification sent for contact: #{c.name}"} (#{c.base_name})"
481
+
482
+ applog(condition.watch, :info, msg % [])
483
+ end
484
+ end
228
485
  end
229
486
 
230
487
  end