god 0.7.13 → 0.7.14
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/Announce.txt +135 -0
- data/History.txt +15 -0
- data/Rakefile +44 -11
- data/VERSION.yml +4 -0
- data/ext/god/.gitignore +5 -0
- data/ext/god/netlink_handler.c +1 -0
- data/god.gemspec +223 -0
- data/ideas/execve/execve.c +29 -0
- data/ideas/execve/extconf.rb +11 -0
- data/ideas/execve/go.rb +8 -0
- data/ideas/future.god +82 -0
- data/init/lsb_compliant_god +109 -0
- data/lib/god.rb +6 -3
- data/lib/god/cli/command.rb +2 -1
- data/lib/god/cli/version.rb +2 -2
- data/lib/god/contacts/campfire.rb +3 -2
- data/lib/god/contacts/jabber.rb +82 -20
- data/lib/god/logger.rb +5 -1
- data/lib/god/process.rb +0 -6
- data/site/images/banner.jpg +0 -0
- data/site/images/bg.gif +0 -0
- data/site/images/bg_grey.gif +0 -0
- data/site/images/bullet.jpg +0 -0
- data/site/images/corner_green.gif +0 -0
- data/site/images/corner_green.psd +0 -0
- data/site/images/corner_pink.gif +0 -0
- data/site/images/god_logo1.gif +0 -0
- data/site/images/header_bg.gif +0 -0
- data/site/images/header_bg.jpg +0 -0
- data/site/images/red_dot.gif +0 -0
- data/site/images/top_bg.gif +0 -0
- data/site/index.html +563 -0
- data/site/install.html +2 -0
- data/site/javascripts/code_highlighter.js +188 -0
- data/site/javascripts/ruby.js +18 -0
- data/site/stylesheets/layout.css +174 -0
- data/test/configs/lifecycle/lifecycle.god +25 -0
- data/test/test_god.rb +2 -2
- data/test/test_jabber.rb +36 -0
- data/test/test_logger.rb +4 -1
- metadata +56 -22
- data/Manifest.txt +0 -114
data/lib/god/logger.rb
CHANGED
@@ -21,6 +21,7 @@ module God
|
|
21
21
|
self.logs = {}
|
22
22
|
@mutex = Mutex.new
|
23
23
|
@capture = nil
|
24
|
+
@spool = Time.now - 10
|
24
25
|
@templogio = StringIO.new
|
25
26
|
@templog = SimpleLogger.new(@templogio)
|
26
27
|
@templog.level = Logger::INFO
|
@@ -64,7 +65,9 @@ module God
|
|
64
65
|
@templog.send(level, text % [])
|
65
66
|
@mutex.synchronize do
|
66
67
|
@capture.puts(@templogio.string.dup) if @capture
|
67
|
-
|
68
|
+
if watch && (Time.now - @spool < 2)
|
69
|
+
self.logs[watch.name] << [Time.now, @templogio.string.dup]
|
70
|
+
end
|
68
71
|
end
|
69
72
|
|
70
73
|
# send to regular logger
|
@@ -85,6 +88,7 @@ module God
|
|
85
88
|
|
86
89
|
# get and join lines since given time
|
87
90
|
@mutex.synchronize do
|
91
|
+
@spool = Time.now
|
88
92
|
self.logs[watch_name].select do |x|
|
89
93
|
x.first > since
|
90
94
|
end.map do |x|
|
data/lib/god/process.rb
CHANGED
@@ -52,12 +52,6 @@ module God
|
|
52
52
|
applog(self, :error, "No start command was specified")
|
53
53
|
end
|
54
54
|
|
55
|
-
# self-daemonizing processes must specify a stop command
|
56
|
-
if !@tracking_pid && self.stop.nil?
|
57
|
-
valid = false
|
58
|
-
applog(self, :error, "No stop command was specified")
|
59
|
-
end
|
60
|
-
|
61
55
|
# uid must exist if specified
|
62
56
|
if self.uid
|
63
57
|
begin
|
Binary file
|
data/site/images/bg.gif
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/site/index.html
ADDED
@@ -0,0 +1,563 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
5
|
+
<title>god - process and task monitoring done right</title>
|
6
|
+
<link href="stylesheets/layout.css" rel="stylesheet" type="text/css" />
|
7
|
+
<script type="text/javascript" src="javascripts/code_highlighter.js"></script>
|
8
|
+
<script type="text/javascript" src="javascripts/ruby.js"></script>
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body id="page_home">
|
12
|
+
|
13
|
+
<div id="mothership">
|
14
|
+
|
15
|
+
</div>
|
16
|
+
<div id="content">
|
17
|
+
<div class="banner">
|
18
|
+
|
19
|
+
</div>
|
20
|
+
|
21
|
+
<!-- <div id="menu">
|
22
|
+
<div class="dots"></div>
|
23
|
+
<ul>
|
24
|
+
<li class="menu_home"><a href="/">Home</a></li>
|
25
|
+
<li class="menu_contact"><a href="mailto:tom@projectmothership.com">Contact</a></li>
|
26
|
+
</ul>
|
27
|
+
<div class="dots"></div>
|
28
|
+
</div> -->
|
29
|
+
|
30
|
+
<div class="columnleft">
|
31
|
+
<h1>A Better Way to Monitor</h1>
|
32
|
+
<p>God is an easy to configure, easy to extend monitoring framework written in Ruby.</p>
|
33
|
+
<p>Keeping your server processes and tasks running should be a simple part of your deployment process. God aims to be the simplest, most powerful monitoring application available.</p>
|
34
|
+
<p style="text-align: right">Tom Preston-Werner<br />tom at rubyisawesome dot com</p>
|
35
|
+
<p style="text-align: right">Google Group: <a href="http://groups.google.com/group/god-rb">http://groups.google.com/group/god-rb</a></p>
|
36
|
+
</div>
|
37
|
+
|
38
|
+
<div class="columnright">
|
39
|
+
<h1>Features</h1>
|
40
|
+
<ul class="features">
|
41
|
+
<li>Config file is written in Ruby</li>
|
42
|
+
<li>Easily write your own custom conditions in Ruby</li>
|
43
|
+
<li>Supports both poll and event based conditions</li>
|
44
|
+
<li>Different poll conditions can have different intervals</li>
|
45
|
+
<li>Integrated notification system (write your own too!)</li>
|
46
|
+
<li>Easily control non-daemonizing scripts</li>
|
47
|
+
</ul>
|
48
|
+
</div>
|
49
|
+
|
50
|
+
<h1>Installation (v 0.7.13)</h1>
|
51
|
+
<p>The best way to get god is via rubygems:</p>
|
52
|
+
<pre>$ sudo gem install god</pre>
|
53
|
+
|
54
|
+
<h1>Contribute</h1>
|
55
|
+
<p>God is open source and accepting pull requests via GitHub!</p>
|
56
|
+
<p><a href="http://github.com/mojombo/god">http://github.com/mojombo/god</a></p>
|
57
|
+
|
58
|
+
<h1>Requirements</h1>
|
59
|
+
|
60
|
+
<p>God currently only works on <b>Linux (kernel 2.6.15+), BSD,</b> and <b>Darwin</b> systems. No support for Windows is planned. Event based conditions on Linux systems require the <code>cn</code> (connector) kernel module loaded or compiled into the kernel and god must be run as root.</p>
|
61
|
+
|
62
|
+
<p>The following systems have been tested. Help us test it on others!</p>
|
63
|
+
|
64
|
+
<ul>
|
65
|
+
<li>Darwin 10.4.10</li>
|
66
|
+
<li>RedHat Fedora Core 6</li>
|
67
|
+
<li>Ubuntu Dapper (no events)</li>
|
68
|
+
<li>Ubuntu Feisty</li>
|
69
|
+
<li>CentOS 4.5 (no events)</li>
|
70
|
+
</ul>
|
71
|
+
|
72
|
+
<h1>Finally, a Config File that Makes Sense</h1>
|
73
|
+
<p>The easiest way to understand how god will make your life better is by looking at a sample config file. The following configuration file is what I use at <a href="http://site.gravatar.com/">gravatar.com</a> to keep the mongrels running:</p>
|
74
|
+
|
75
|
+
<pre><code class="ruby"># run with: god -c /path/to/gravatar.god
|
76
|
+
#
|
77
|
+
# This is the actual config file used to keep the mongrels of
|
78
|
+
# gravatar.com running.
|
79
|
+
|
80
|
+
RAILS_ROOT = "/Users/tom/dev/gravatar2"
|
81
|
+
|
82
|
+
%w{8200 8201 8202}.each do |port|
|
83
|
+
God.watch do |w|
|
84
|
+
w.name = "gravatar2-mongrel-#{port}"
|
85
|
+
w.interval = 30.seconds # default
|
86
|
+
w.start = "mongrel_rails start -c #{RAILS_ROOT} -p #{port} \
|
87
|
+
-P #{RAILS_ROOT}/log/mongrel.#{port}.pid -d"
|
88
|
+
w.stop = "mongrel_rails stop -P #{RAILS_ROOT}/log/mongrel.#{port}.pid"
|
89
|
+
w.restart = "mongrel_rails restart -P #{RAILS_ROOT}/log/mongrel.#{port}.pid"
|
90
|
+
w.start_grace = 10.seconds
|
91
|
+
w.restart_grace = 10.seconds
|
92
|
+
w.pid_file = File.join(RAILS_ROOT, "log/mongrel.#{port}.pid")
|
93
|
+
|
94
|
+
w.behavior(:clean_pid_file)
|
95
|
+
|
96
|
+
w.start_if do |start|
|
97
|
+
start.condition(:process_running) do |c|
|
98
|
+
c.interval = 5.seconds
|
99
|
+
c.running = false
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
w.restart_if do |restart|
|
104
|
+
restart.condition(:memory_usage) do |c|
|
105
|
+
c.above = 150.megabytes
|
106
|
+
c.times = [3, 5] # 3 out of 5 intervals
|
107
|
+
end
|
108
|
+
|
109
|
+
restart.condition(:cpu_usage) do |c|
|
110
|
+
c.above = 50.percent
|
111
|
+
c.times = 5
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# lifecycle
|
116
|
+
w.lifecycle do |on|
|
117
|
+
on.condition(:flapping) do |c|
|
118
|
+
c.to_state = [:start, :restart]
|
119
|
+
c.times = 5
|
120
|
+
c.within = 5.minute
|
121
|
+
c.transition = :unmonitored
|
122
|
+
c.retry_in = 10.minutes
|
123
|
+
c.retry_times = 5
|
124
|
+
c.retry_within = 2.hours
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end</code></pre>
|
129
|
+
|
130
|
+
<p>That's a lot to take in at once, so I'll break it down by section and explain what's going on in each.</p>
|
131
|
+
|
132
|
+
<pre><code class="ruby">RAILS_ROOT = "/var/www/gravatar2/current"</code></pre>
|
133
|
+
|
134
|
+
<p>Here I've set a constant that is used throughout the file. Keeping the <code>RAILS_ROOT</code> value in a constant makes it easy to adapt this script to other applications. Because the config file is Ruby code, I can set whatever variables or constants I want that make the configuration more concise and easier to work with.</p>
|
135
|
+
|
136
|
+
<pre><code class="ruby">%w{8200 8201 8202}.each do |port|
|
137
|
+
...
|
138
|
+
end</code></pre>
|
139
|
+
|
140
|
+
<p>Because the config file is written in actual Ruby code, we can construct loops and do other intelligent things that are impossible in your every day, run of the mill config file. I need to watch three mongrels, so I simply loop over their port numbers, eliminating duplication and making my life a whole lot easier.</p>
|
141
|
+
|
142
|
+
<pre><code class="ruby"> God.watch do |w|
|
143
|
+
w.name = "gravatar2-mongrel-#{port}"
|
144
|
+
w.interval = 30.seconds # default
|
145
|
+
w.start = "mongrel_rails start -c #{RAILS_ROOT} -p #{port} \
|
146
|
+
-P #{RAILS_ROOT}/log/mongrel.#{port}.pid -d"
|
147
|
+
w.stop = "mongrel_rails stop -P #{RAILS_ROOT}/log/mongrel.#{port}.pid"
|
148
|
+
w.restart = "mongrel_rails restart -P #{RAILS_ROOT}/log/mongrel.#{port}.pid"
|
149
|
+
w.start_grace = 10.seconds
|
150
|
+
w.restart_grace = 10.seconds
|
151
|
+
w.pid_file = File.join(RAILS_ROOT, "log/mongrel.#{port}.pid")
|
152
|
+
|
153
|
+
...
|
154
|
+
end</code></pre>
|
155
|
+
|
156
|
+
<p>A <code>watch</code> represents a single process that has concrete start, stop, and/or restart operations. You can define as many watches as you like. In the example above, I've got some Rails instances running in Mongrels that I need to keep alive. Every watch must have a unique <code>name</code> so that it can be identified later on. The <code>interval</code> option sets the default poll interval (this can be overridden in each condition). The <code>start</code> and <code>stop</code> attributes specify the commands to start and stop the process. If no <code>restart</code> attribute is set, restart will be represented by a call to stop followed by a call to start. The optional <code>grace</code> attribute sets the amount of time following a start/stop/restart command to wait before resuming normal monitoring operations. To be more specific, I can set just <code>start_grace</code>, <code>stop_grace</code>, and/or <code>restart_grace</code>. If the process you're watching runs as a daemon (as mine does), you'll need to set the <code>pid_file</code> attribute.</p>
|
157
|
+
|
158
|
+
<pre><code class="ruby"> w.behavior(:clean_pid_file)</code></pre>
|
159
|
+
|
160
|
+
<p>Behaviors allow you to execute additional commands around start/stop/restart commands. In our case, if the process dies it will leave a PID file behind. The next time a start command is issued, it will fail, complaining about the leftover PID file. We'd like the PID file cleaned up before a start command is issued. The built-in behavior <code>clean_pid_file</code> will do just that.</p>
|
161
|
+
|
162
|
+
<pre><code class="ruby"> w.start_if do |start|
|
163
|
+
start.condition(:process_running) do |c|
|
164
|
+
c.interval = 5.seconds
|
165
|
+
c.running = false
|
166
|
+
end
|
167
|
+
end</code></pre>
|
168
|
+
|
169
|
+
<p>Watches contain conditions grouped by the action to execute should they return <code>true</code>. I start with a <code>start_if</code> block that contains a single condition. Conditions are specified by calling <code>condition</code> with an identifier, in this case
|
170
|
+
<code>:process_running</code>. Each condition can specify a poll interval that will override the default watch interval. In this case, I want to check that the process is still running every 5 seconds instead of the 30 second interval that other conditions will inherit. The ability to set condition specific poll intervals makes it possible to run critical tests (such as :process_running) more often than less critical tests (such as :memory_usage and :cpu_usage).</p>
|
171
|
+
|
172
|
+
<pre><code class="ruby"> w.restart_if do |restart|
|
173
|
+
restart.condition(:memory_usage) do |c|
|
174
|
+
c.above = 150.megabytes
|
175
|
+
c.times = [3, 5] # 3 out of 5 intervals
|
176
|
+
end
|
177
|
+
|
178
|
+
...
|
179
|
+
end</code></pre>
|
180
|
+
|
181
|
+
<p>Similar to <code>start_if</code> there is a <code>restart_if</code> command that groups conditions that should trigger a restart. The <code>memory_usage</code> condition will fail if the specified process is using too much memory. The maximum allowable amount of memory is specified with the <code>above</code> attribute (you can use the kilobytes, megabytes, or gigabytes helpers). The number of times the test needs to fail in order to trigger a restart is set with <code>times</code>. This can be either an integer or an array. An integer means it must fail that many times in a row while an array [x, y] means it must fail x times out of the last y tests.</p>
|
182
|
+
|
183
|
+
<pre><code class="ruby"> w.restart_if do |restart|
|
184
|
+
...
|
185
|
+
|
186
|
+
restart.condition(:cpu_usage) do |c|
|
187
|
+
c.above = 50.percent
|
188
|
+
c.times = 5
|
189
|
+
end
|
190
|
+
end</code></pre>
|
191
|
+
|
192
|
+
<p>To keep an eye on CPU usage, I've employed the <code>cpu_usage</code> condition. When CPU usage for a Mongrel process is over 50% for 5 consecutive intervals, it will be restarted.</p>
|
193
|
+
|
194
|
+
<pre><code class="ruby"> w.lifecycle do |on|
|
195
|
+
on.condition(:flapping) do |c|
|
196
|
+
c.to_state = [:start, :restart]
|
197
|
+
c.times = 5
|
198
|
+
c.within = 5.minute
|
199
|
+
c.transition = :unmonitored
|
200
|
+
c.retry_in = 10.minutes
|
201
|
+
c.retry_times = 5
|
202
|
+
c.retry_within = 2.hours
|
203
|
+
end
|
204
|
+
end</code></pre>
|
205
|
+
|
206
|
+
<p>Conditions inside a <code>lifecycle</code> section are active as long as the process is being monitored (they live across state changes).</p>
|
207
|
+
|
208
|
+
<p>The <code>:flapping</code> condition guards against the edge case wherein god rapidly starts or restarts your application. Things like server configuration changes or the unavailability of external services could make it impossible for my process to start. In that case, god will try to start my process over and over to no avail. The <code>:flapping</code> condition provides two levels of giving up on flapping processes. If I were to translate the options of the code above, it would be something like: If this watch is started or restarted five times within 5 minutes, then unmonitor it...then after ten minutes, monitor it again to see if it was just a temporary problem; if the process is seen to be flapping five times within two hours, then give up completely.</p>
|
209
|
+
|
210
|
+
<p>That's it! Simple, huh?</p>
|
211
|
+
|
212
|
+
<!-- ------------------------------------------------------------------------- -->
|
213
|
+
|
214
|
+
<h1>Changing UID/GID for processes</h1>
|
215
|
+
|
216
|
+
<p>It is possible to have god run your start/stop/restart commands as a specific user/group. This can be done by setting the <code>uid</code> and/or <code>gid</code> attributes of a watch.</p>
|
217
|
+
|
218
|
+
<pre><code class="ruby"> God.watch do |w|
|
219
|
+
...
|
220
|
+
|
221
|
+
w.uid = 'tom'
|
222
|
+
w.gid = 'devs'
|
223
|
+
|
224
|
+
...
|
225
|
+
end</code></pre>
|
226
|
+
|
227
|
+
<p>This only works for commands specified as a string. Lambda commands are unaffected.</p>
|
228
|
+
|
229
|
+
|
230
|
+
<!-- ------------------------------------------------------------------------- -->
|
231
|
+
|
232
|
+
<h1>Lambda commands</h1>
|
233
|
+
|
234
|
+
<p>In addition to specifying start/stop/restart commands as strings (to be executed via the shell), you can specify a lambda that will be called.</p>
|
235
|
+
|
236
|
+
<pre><code class="ruby"> God.watch do |w|
|
237
|
+
...
|
238
|
+
|
239
|
+
w.start = lambda { ENV['APACHE'] ? `apachectl -k graceful` : `lighttpd restart` }
|
240
|
+
|
241
|
+
...
|
242
|
+
end</code></pre>
|
243
|
+
|
244
|
+
<!-- ------------------------------------------------------------------------- -->
|
245
|
+
|
246
|
+
<h1>Starting and Controlling God</h1>
|
247
|
+
|
248
|
+
<p>To start the god monitoring process as a daemon simply run the <code>god</code> executable passing in the path to the config file (you need to sudo if you're using events on Linux or want to use the setuid/setgid functionality):</p>
|
249
|
+
|
250
|
+
<pre>$ sudo god -c /path/to/config.god</pre>
|
251
|
+
|
252
|
+
<p>While you're writing your config file, it can be helpful to run god in the foreground so you can see the log messages. You can do that with:</p>
|
253
|
+
|
254
|
+
<pre>$ sudo god -c /path/to/config.god -D</pre>
|
255
|
+
|
256
|
+
<p>You can start/restart/stop/monitor/unmonitor your Watches with the same utility like so:</p>
|
257
|
+
|
258
|
+
<pre>$ sudo god stop gravatar2-mongrel-8200</pre>
|
259
|
+
|
260
|
+
<!-- ------------------------------------------------------------------------- -->
|
261
|
+
|
262
|
+
<h1>Grouping Watches</h1>
|
263
|
+
|
264
|
+
<p>Watches can be assigned to groups. These groups can then be controlled together from the command line.</p>
|
265
|
+
|
266
|
+
<pre><code class="ruby"> God.watch do |w|
|
267
|
+
...
|
268
|
+
|
269
|
+
w.group = 'mongrels'
|
270
|
+
|
271
|
+
...
|
272
|
+
end</code></pre>
|
273
|
+
|
274
|
+
<p>The above configuration now allows you to control the watch (and any others that are in the group) with a single command:</p>
|
275
|
+
|
276
|
+
<pre>$ sudo god stop mongrels</pre>
|
277
|
+
|
278
|
+
<!-- ------------------------------------------------------------------------- -->
|
279
|
+
|
280
|
+
<h1>Advanced Configuration with Transitions and Events</h1>
|
281
|
+
|
282
|
+
<p>So far you've been introduced to a simple poll-based config file and seen how to run it. Poll-based monitoring works great for simple things, but falls short for highly critical tasks. God has native support for kqueue/netlink events on BSD/Darwin/Linux systems. For instance, instead of using the <code>process_running</code> condition to poll for the status of your process, you can use the <code>process_exits</code> condition that will be notified <b>immediately</b> upon the exit of your process. This means less load on your system and shorter downtime after a crash.</p>
|
283
|
+
|
284
|
+
<p>While the configuration syntax you saw in the previous example is very simple, it lacks the power that we need to deal with event based monitoring. In fact, the <code>start_if</code> and <code>restart_if</code> methods are really just calling out to a lower-level API. If we use the low-level API directly, we can harness the full power of god's event based lifecycle system. Let's look at another example config file.</p>
|
285
|
+
|
286
|
+
<pre><code class="ruby">RAILS_ROOT = "/Users/tom/dev/gravatar2"
|
287
|
+
|
288
|
+
God.watch do |w|
|
289
|
+
w.name = "local-3000"
|
290
|
+
w.interval = 5.seconds # default
|
291
|
+
w.start = "mongrel_rails start -c #{RAILS_ROOT} -P #{RAILS_ROOT}/log/mongrel.pid -p 3000 -d"
|
292
|
+
w.stop = "mongrel_rails stop -P #{RAILS_ROOT}/log/mongrel.pid"
|
293
|
+
w.restart = "mongrel_rails restart -P #{RAILS_ROOT}/log/mongrel.pid"
|
294
|
+
w.pid_file = File.join(RAILS_ROOT, "log/mongrel.pid")
|
295
|
+
|
296
|
+
# clean pid files before start if necessary
|
297
|
+
w.behavior(:clean_pid_file)
|
298
|
+
|
299
|
+
# determine the state on startup
|
300
|
+
w.transition(:init, { true => :up, false => :start }) do |on|
|
301
|
+
on.condition(:process_running) do |c|
|
302
|
+
c.running = true
|
303
|
+
end
|
304
|
+
end
|
305
|
+
|
306
|
+
# determine when process has finished starting
|
307
|
+
w.transition([:start, :restart], :up) do |on|
|
308
|
+
on.condition(:process_running) do |c|
|
309
|
+
c.running = true
|
310
|
+
end
|
311
|
+
|
312
|
+
# failsafe
|
313
|
+
on.condition(:tries) do |c|
|
314
|
+
c.times = 5
|
315
|
+
c.transition = :start
|
316
|
+
end
|
317
|
+
end
|
318
|
+
|
319
|
+
# start if process is not running
|
320
|
+
w.transition(:up, :start) do |on|
|
321
|
+
on.condition(:process_exits)
|
322
|
+
end
|
323
|
+
|
324
|
+
# restart if memory or cpu is too high
|
325
|
+
w.transition(:up, :restart) do |on|
|
326
|
+
on.condition(:memory_usage) do |c|
|
327
|
+
c.interval = 20
|
328
|
+
c.above = 50.megabytes
|
329
|
+
c.times = [3, 5]
|
330
|
+
end
|
331
|
+
|
332
|
+
on.condition(:cpu_usage) do |c|
|
333
|
+
c.interval = 10
|
334
|
+
c.above = 10.percent
|
335
|
+
c.times = [3, 5]
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
# lifecycle
|
340
|
+
w.lifecycle do |on|
|
341
|
+
on.condition(:flapping) do |c|
|
342
|
+
c.to_state = [:start, :restart]
|
343
|
+
c.times = 5
|
344
|
+
c.within = 5.minute
|
345
|
+
c.transition = :unmonitored
|
346
|
+
c.retry_in = 10.minutes
|
347
|
+
c.retry_times = 5
|
348
|
+
c.retry_within = 2.hours
|
349
|
+
end
|
350
|
+
end
|
351
|
+
end
|
352
|
+
</code></pre>
|
353
|
+
|
354
|
+
<p>A bit longer, I know, but very straightforward once you understand how the <code>transition</code> calls work. The <code>name</code>, <code>interval</code>, <code>start</code>, <code>stop</code>, and <code>pid_file</code> attributes should be familiar. We also specify the <code>clean_pid_file</code> behavior.</p>
|
355
|
+
|
356
|
+
<p>Before jumping into the code, it's important to understand the different states that a Watch can have, and how that state changes over time. At any given time, a Watch will be in one of the <code>init</code>, <code>up</code>, <code>start</code>, or <code>restart</code> states. As different conditions are satisfied, the Watch will progress from state to state, enabling and disabling conditions along the way.</p>
|
357
|
+
|
358
|
+
<p>When god first starts, each Watch is placed in the <code>init</code> state.</p>
|
359
|
+
|
360
|
+
<p>You'll use the <code>transition</code> method to tell god how to transition between states. It takes two arguments. The first argument may be either a symbol or an array of symbols representing the state or states during which the specified conditions should be enabled. The second argument may be either a symbol or a hash. If it is a symbol, then that is the state that will be transitioned to if any of the conditions return <code>true</code>. If it is a hash, then that hash must have both <code>true</code> and <code>false</code> keys, each of which point to a symbol that represents the state to transition to given the corresponding return from the single condition that must be specified.</p>
|
361
|
+
|
362
|
+
<pre><code class="ruby"> # determine the state on startup
|
363
|
+
w.transition(:init, { true => :up, false => :start }) do |on|
|
364
|
+
on.condition(:process_running) do |c|
|
365
|
+
c.running = true
|
366
|
+
end
|
367
|
+
end</code></pre>
|
368
|
+
|
369
|
+
<p>The first transition block tells god what to do when the Watch is in the <code>init</code> state (first argument). This is where I tell god how to determine if my task is already running. Since I'm monitoring a process, I can use the <code>process_running</code> condition to determine whether the process is running. If the process is running, it will return true, otherwise it will return false. Since I sent a hash as the second argument to <code>transition</code>, the return from <code>process_running</code> will determine which of the two states will be transitioned to. If the process is running, the return is true and god will put the Watch into the <code>up</code> state. If the process is not running, the return is false and god will put the Watch into the <code>start</code> state.</p>
|
370
|
+
|
371
|
+
<pre><code class="ruby"> # determine when process has finished starting
|
372
|
+
w.transition([:start, :restart], :up) do |on|
|
373
|
+
on.condition(:process_running) do |c|
|
374
|
+
c.running = true
|
375
|
+
end
|
376
|
+
|
377
|
+
...
|
378
|
+
end</code></pre>
|
379
|
+
|
380
|
+
<p>If god has determined that my process isn't running, the Watch will be put into the <code>start</code> state. Upon entering this state, the <code>start</code> command that I specified on the Watch will be called. In addition, the above transition specifies a condition that should be enabled when in either the <code>start</code> or <code>restart</code> states. The condition is another <code>process_running</code>, however this time I'm only interested in moving to another state once it returns <code>true</code>. A <code>true</code> return from this condition means that the process is running and it's ok to transition to the <code>up</code> state (second argument to <code>transition</code>).</p>
|
381
|
+
|
382
|
+
<pre><code class="ruby"> # determine when process has finished starting
|
383
|
+
w.transition([:start, :restart], :up) do |on|
|
384
|
+
...
|
385
|
+
|
386
|
+
# failsafe
|
387
|
+
on.condition(:tries) do |c|
|
388
|
+
c.times = 5
|
389
|
+
c.transition = :start
|
390
|
+
end
|
391
|
+
end</code></pre>
|
392
|
+
|
393
|
+
<p>The other half of this transition uses the <code>tries</code> condition to ensure that god doesn't get stuck in this state. It's possible that the process could go down while the transition is being made, in which case god would end up polling forever to see if the process is up. Here I've specified that if this condition is called five times, god should override the normal transition destination and move to the <code>start</code> state instead. If you specify a <code>transition</code> attribute on any condition, that state will be transferred to instead of the normal transfer destination.</p>
|
394
|
+
|
395
|
+
<pre><code class="ruby"> # start if process is not running
|
396
|
+
w.transition(:up, :start) do |on|
|
397
|
+
on.condition(:process_exits)
|
398
|
+
end</code></pre>
|
399
|
+
|
400
|
+
<p>This is where the event based system comes into play. Once in the <code>up</code> state, I want to be notified when my process exits. The <code>process_exits</code> condition registers a callback that will trigger a transition change when it is fired off. Event conditions (like this one) cannot be used in transitions that have a hash for the second argument (as they do not return true or false).</p>
|
401
|
+
|
402
|
+
<pre><code class="ruby"> # restart if memory or cpu is too high
|
403
|
+
w.transition(:up, :restart) do |on|
|
404
|
+
on.condition(:memory_usage) do |c|
|
405
|
+
c.interval = 20
|
406
|
+
c.above = 50.megabytes
|
407
|
+
c.times = [3, 5]
|
408
|
+
end
|
409
|
+
|
410
|
+
on.condition(:cpu_usage) do |c|
|
411
|
+
c.interval = 10
|
412
|
+
c.above = 10.percent
|
413
|
+
c.times = [3, 5]
|
414
|
+
end
|
415
|
+
end</code></pre>
|
416
|
+
|
417
|
+
<p>Notice that I can have multiple transitions with the same start state. In this case, I want to have the <code>memory_usage</code> and <code>cpu_usage</code> poll conditions going at the same time that I listen for the process exit event. In the case of runaway CPU or memory usage, however, I want to transition to the <code>restart</code> state. When a Watch enters the <code>restart</code> state it will either call the <code>restart</code> command that you specified, or if none has been set, call the <code>stop</code> and then <code>start</code> commands.</p>
|
418
|
+
|
419
|
+
<!-- ------------------------------------------------------------------------- -->
|
420
|
+
|
421
|
+
<h1>Watching Non-Daemon Processes</h1>
|
422
|
+
|
423
|
+
<p>Need to watch a script that doesn't have built in daemonization? No problem! God will daemonize and keep track of your process for you. If you don't specify a <code>pid_file</code> attribute for a watch, it will be auto-daemonized and a PID file will be stored for it in <code>/var/run/god</code>. If you'd rather have the PID file stored in a different location, you can set it at the top of your config:</p>
|
424
|
+
|
425
|
+
<pre><code class="ruby">God.pid_file_directory = '/home/tom/pids'
|
426
|
+
|
427
|
+
God.watch do |w|
|
428
|
+
# watch with no pid_file attribute set
|
429
|
+
end</code></pre>
|
430
|
+
|
431
|
+
<p>The directory you specify must be writable by god.</p>
|
432
|
+
|
433
|
+
<!-- ------------------------------------------------------------------------- -->
|
434
|
+
|
435
|
+
<h1>Loading Other Config Files</h1>
|
436
|
+
|
437
|
+
<p>You should feel free to separate your god configs into separate files for easier organization. You can load in other configs using Ruby's normal <code>load</code> method, or use the convenience method <code>God.load</code> which allows for glob-style paths:</p>
|
438
|
+
|
439
|
+
<pre><code class="ruby"># load in all god configs
|
440
|
+
God.load "/usr/local/conf/*.god"</code></pre>
|
441
|
+
|
442
|
+
<p>God won't start its monitoring operations until all configurations have been loaded.</p>
|
443
|
+
|
444
|
+
<!-- ------------------------------------------------------------------------- -->
|
445
|
+
|
446
|
+
<h1>Dynamically Loading Config Files Into an Already Running God</h1>
|
447
|
+
|
448
|
+
<p>God allows you to load or reload configurations into an already running instance. There are a few things to consider when doing this:</p>
|
449
|
+
|
450
|
+
<ul>
|
451
|
+
<li>Existing Watches with the same <code>name</code> as the incoming Watches will be overridden by the new config.</li>
|
452
|
+
<li>All paths must be either absolute or relative to the path from which god was started.</li>
|
453
|
+
</ul>
|
454
|
+
|
455
|
+
<p>To load a config into a running god, issue the following command:</p>
|
456
|
+
|
457
|
+
<pre>$ sudo god load path/to/config.god</pre>
|
458
|
+
|
459
|
+
<p>Config files that are loaded dynamically can contain anything that a normal config file contains, however, global options such as <code>God.pid_file_directory</code> blocks will be ignored (and produce a warning in the logs).</p>
|
460
|
+
|
461
|
+
<!-- ------------------------------------------------------------------------- -->
|
462
|
+
|
463
|
+
<h1>Getting Logs for a Single Watch</h1>
|
464
|
+
|
465
|
+
<p>Sifting through the god logs for statements specific to a single Watch can be frustrating when you have many of them. You can get the realtime logs for a single Watch via the command line:</p>
|
466
|
+
|
467
|
+
<pre>$ sudo god log local-3000</pre>
|
468
|
+
|
469
|
+
<p>This will display the last 1000 lines of log for the 'local-3000' Watch and update every second with new log messages.</p>
|
470
|
+
|
471
|
+
<p>You can also supply a shorthand to the log command that will match one of your watches. If it happens to match several, the first match will be used:</p>
|
472
|
+
|
473
|
+
<pre>$ sudo god log l3</pre>
|
474
|
+
|
475
|
+
<!-- ------------------------------------------------------------------------- -->
|
476
|
+
|
477
|
+
<h1>Notifications</h1>
|
478
|
+
|
479
|
+
<p>God has an extensible notification framework built in that makes it easy to have notifications sent when conditions are triggered. There are three steps to enabling notifications.</p>
|
480
|
+
|
481
|
+
<p>Step 1: Set the options for the notification subsystem(s) that you'll be using. Let's look at how to setup email notifications.</p>
|
482
|
+
|
483
|
+
<pre><code class="ruby">God::Contacts::Email.message_settings = {
|
484
|
+
:from => 'god@example.com'
|
485
|
+
}
|
486
|
+
|
487
|
+
God::Contacts::Email.server_settings = {
|
488
|
+
:address => "smtp.example.com",
|
489
|
+
:port => 25,
|
490
|
+
:domain => "example.com",
|
491
|
+
:authentication => :plain,
|
492
|
+
:user_name => "john",
|
493
|
+
:password => "s3kr3ts"
|
494
|
+
}</code></pre>
|
495
|
+
|
496
|
+
<p>Step 2: Configure some contacts.</p>
|
497
|
+
|
498
|
+
<pre><code class="ruby">God.contact(:email) do |c|
|
499
|
+
c.name = 'tom'
|
500
|
+
c.email = 'tom@example.com'
|
501
|
+
end
|
502
|
+
|
503
|
+
God.contact(:email) do |c|
|
504
|
+
c.name = 'vanpelt'
|
505
|
+
c.email = 'vanpelt@example.com'
|
506
|
+
c.group = 'developers'
|
507
|
+
end
|
508
|
+
|
509
|
+
God.contact(:email) do |c|
|
510
|
+
c.name = 'kevin'
|
511
|
+
c.email = 'kevin@example.com'
|
512
|
+
c.group = 'developers'
|
513
|
+
end</code></pre>
|
514
|
+
|
515
|
+
<p>Step 3: Attach to a condition:</p>
|
516
|
+
|
517
|
+
<pre><code class="ruby"> w.transition(:up, :start) do |on|
|
518
|
+
on.condition(:process_exits) do |c|
|
519
|
+
c.notify = 'tom'
|
520
|
+
end
|
521
|
+
end</code></pre>
|
522
|
+
|
523
|
+
<p>There are two ways to specify that a notification should be sent. The first, easier way is shown above. Every condition can take an optional <code>notify</code> attribute that specifies which contacts should be notified when the condition is triggered. The value can be a contact name or contact group *or* an array of contact names and/or contact groups.</p>
|
524
|
+
|
525
|
+
<pre><code class="ruby"> w.transition(:up, :start) do |on|
|
526
|
+
on.condition(:process_exits) do |c|
|
527
|
+
c.notify = {:contacts => ['tom', 'developers'], :priority => 1, :category => 'product'}
|
528
|
+
end
|
529
|
+
end</code></pre>
|
530
|
+
|
531
|
+
<p>The second way allows you to specify the <code>priority</code> and <code>category</code> in addition to the contacts. The extra attributes can be arbitrary integers or strings and will be passed as-is to the notification subsystem.</p>
|
532
|
+
|
533
|
+
<p>The above notification will arrive as an email similar to the following.</p>
|
534
|
+
|
535
|
+
<pre><code>From: god <god@example.com>
|
536
|
+
To: tom <tom@example.com>
|
537
|
+
Subject: [god] mongrel-8600 [trigger] process exited (ProcessExits)
|
538
|
+
|
539
|
+
Message: mongrel-8600 [trigger] process exited (ProcessExits)
|
540
|
+
Host: candymountain.example.com
|
541
|
+
Priority: 1
|
542
|
+
Category: product</code></pre>
|
543
|
+
|
544
|
+
<!-- ------------------------------------------------------------------------- -->
|
545
|
+
|
546
|
+
<h1>Extend God with your own Conditions</h1>
|
547
|
+
|
548
|
+
<p>God was designed from the start to allow you to easily write your own custom conditions, making it simple to add tests that are application specific.</p>
|
549
|
+
|
550
|
+
</div>
|
551
|
+
<div id="footer">
|
552
|
+
<p>Brought to you by <a href="http://rubyisawesome.com/">Ruby is Awesome</a></p>
|
553
|
+
</div>
|
554
|
+
|
555
|
+
<script src="http://www.google-analytics.com/urchin.js" type="text/javascript">
|
556
|
+
</script>
|
557
|
+
<script type="text/javascript">
|
558
|
+
_uacct = "UA-2196727-1";
|
559
|
+
urchinTracker();
|
560
|
+
</script>
|
561
|
+
|
562
|
+
</body>
|
563
|
+
</html>
|