secobarbital-daemon_controller 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,572 @@
1
+ # daemon_controller, library for robust daemon management
2
+ # Copyright (c) 2008 Phusion
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ # of this software and associated documentation files (the "Software"), to deal
6
+ # in the Software without restriction, including without limitation the rights
7
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included in
12
+ # all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ # THE SOFTWARE.
21
+
22
+ require 'tempfile'
23
+ require 'fcntl'
24
+ require 'timeout'
25
+ require File.expand_path(File.dirname(__FILE__) << '/daemon_controller/lock_file')
26
+
27
+ # Main daemon controller object. See the README for an introduction and tutorial.
28
+ class DaemonController
29
# Exceptions that, when raised by a connect block or a ping Proc, are
# interpreted as "the daemon cannot be connected to" instead of being
# propagated to the caller.
ALLOWED_CONNECT_EXCEPTIONS = [
  Errno::ECONNREFUSED,
  Errno::ENETUNREACH,
  Errno::ETIMEDOUT,
  Errno::ECONNRESET,
  Errno::EINVAL
]

# Root of the DaemonController error hierarchy.
class Error < StandardError; end

# Common ancestor of StartTimeout and StopTimeout.
class TimeoutError < Error; end

# Raised when asked to start a daemon that is already running.
class AlreadyStarted < Error; end

# Raised when the start command fails or the daemon does not come up.
class StartError < Error; end

# Raised when the daemon does not start within the start timeout.
class StartTimeout < TimeoutError; end

# Raised when the stop command fails.
class StopError < Error; end

# Raised when the daemon does not exit within the stop timeout.
class StopTimeout < TimeoutError; end

# Raised by #connect when the daemon cannot be connected to even after
# an attempt to start it.
class ConnectError < Error; end
48
+
49
+ # Create a new DaemonController object.
50
+ #
51
+ # === Mandatory options
52
+ #
53
+ # [:identifier]
54
+ # A human-readable, unique name for this daemon, e.g. "Sphinx search server".
55
+ # This identifier will be used in some error messages. On some platforms, it will
56
+ # be used for concurrency control: on such platforms, no two DaemonController
57
+ # objects will operate on the same identifier at the same time.
58
+ #
59
+ # [:start_command]
60
+ # The command to start the daemon. This must be a String, e.g.
61
+ # "mongrel_rails start -e production".
62
+ #
63
+ # [:ping_command]
64
+ # The ping command is used to check whether the daemon can be connected to.
65
+ # It is also used to ensure that #start only returns when the daemon can be
66
+ # connected to.
67
+ #
68
+ # The value may be a command string. This command must exit with an exit code of
69
+ # 0 if the daemon can be successfully connected to, or exit with a non-0 exit
70
+ # code on failure.
71
+ #
72
+ # The value may also be a Proc, which returns an expression that evaluates to
73
+ # true (indicating that the daemon can be connected to) or false (failure).
74
+ # If the Proc raises Errno::ECONNREFUSED, Errno::ENETUNREACH, Errno::ETIMEDOUT,
75
+ # Errno::ECONNRESET or Errno::EINVAL, then that also means that the daemon
76
+ # cannot be connected to.
77
+ # <b>NOTE:</b> if the ping command returns an object which responds to
78
+ # <tt>#close</tt>, then that method will be called on the return value.
79
+ # This makes it possible to specify a ping command such as
80
+ # <tt>lambda { TCPSocket.new('localhost', 1234) }</tt>, without having to worry
81
+ # about closing it afterwards.
82
+ # Any exceptions raised by #close are ignored.
83
+ #
84
+ # [:pid_file]
85
+ # The PID file that the daemon will write to. Used to check whether the daemon
86
+ # is running.
87
+ #
88
+ # [:log_file]
89
+ # The log file that the daemon will write to. It will be consulted to see
90
+ # whether the daemon has printed any error messages during startup.
91
+ #
92
+ # === Optional options
93
+ # [:stop_command]
94
+ # A command to stop the daemon with, e.g. "/etc/rc.d/nginx stop". If no stop
95
+ # command is given (i.e. +nil+), then DaemonController will stop the daemon
96
+ # by killing the PID written in the PID file.
97
+ #
98
+ # The default value is +nil+.
99
+ #
100
+ # [:before_start]
101
+ # This may be a Proc. It will be called just before running the start command.
102
+ # The before_start proc is not subject to the start timeout.
103
+ #
104
+ # [:start_timeout]
105
+ # The maximum amount of time, in seconds, that #start may take to start
106
+ # the daemon. Since #start also waits until the daemon can be connected to,
107
+ # that wait time is counted as well. If the daemon does not start in time,
108
+ # then #start will raise an exception.
109
+ #
110
+ # The default value is 15.
111
+ #
112
+ # [:stop_timeout]
113
+ # The maximum amount of time, in seconds, that #stop may take to stop
114
+ # the daemon. Since #stop also waits until the daemon is no longer running,
115
+ # that wait time is counted as well. If the daemon does not stop in time,
116
+ # then #stop will raise an exception.
117
+ #
118
+ # The default value is 15.
119
+ #
120
+ # [:log_file_activity_timeout]
121
+ # Once a daemon has gone into the background, it will become difficult to
122
+ # know for certain whether it is still initializing or whether it has
123
+ # failed and exited, until it has written its PID file. It's 99.9% probable
124
+ # that the daemon has terminated with an error if its start timeout has expired,
125
+ # but not many system administrators want to wait 15 seconds (the default start
126
+ # timeout) to be notified of whether the daemon has terminated with an error.
127
+ #
128
+ # An alternative way to check whether the daemon has terminated with an error,
129
+ # is by checking whether its log file has been recently updated. If, after the
130
+ # daemon has started, the log file hasn't been updated for the amount of seconds
131
+ # given by the :log_file_activity_timeout option, then the daemon is assumed to
132
+ # have terminated with an error.
133
+ #
134
+ # The default value is 7.
135
def initialize(options)
  # Fail fast on the first mandatory key that is absent from +options+.
  mandatory_options = [:identifier, :start_command, :ping_command, :pid_file, :log_file]
  mandatory_options.each do |option|
    unless options.has_key?(option)
      raise ArgumentError, "The ':#{option}' option is mandatory."
    end
  end

  # Mandatory settings.
  @identifier    = options[:identifier]
  @start_command = options[:start_command]
  @ping_command  = options[:ping_command]
  @pid_file      = options[:pid_file]
  @log_file      = options[:log_file]

  # Optional settings; a missing (or nil/false) value falls back to the default.
  @stop_command  = options[:stop_command]
  @before_start  = options[:before_start]
  # :ping_interval is the sleep, in seconds, between pings while waiting
  # for the daemon to come up.
  @ping_interval = options[:ping_interval] || 0.1
  @start_timeout = options[:start_timeout] || 15
  @stop_timeout  = options[:stop_timeout] || 15
  @log_file_activity_timeout = options[:log_file_activity_timeout] || 7

  # The lock file is derived from the PID file path.
  @lock_file = determine_lock_file(@identifier, @pid_file)
end
154
+
155
+ # Start the daemon and wait until it can be pinged.
156
+ #
157
+ # Raises:
158
+ # - AlreadyStarted - the daemon is already running.
159
+ # - StartError - the start command failed.
160
+ # - StartTimeout - the daemon did not start in time. This could also
161
+ # mean that the daemon failed after it has gone into the background.
162
def start
  # Hold the exclusive lock for the whole start sequence.
  @lock_file.exclusive_lock { start_without_locking }
end
167
+
168
+ # Connect to the daemon by running the given block, which contains the
169
+ # connection logic. If the daemon isn't already running, then it will be
170
+ # started.
171
+ #
172
+ # The block must return nil or raise Errno::ECONNREFUSED, Errno::ENETUNREACH,
173
+ # Errno::ETIMEDOUT, Errno::ECONNRESET or Errno::EINVAL to indicate that the daemon cannot be
174
+ # connected to. It must return non-nil if the daemon can be connected to.
175
+ # Upon successful connection, the return value of the block will
176
+ # be returned by #connect.
177
+ #
178
+ # Note that the block may be called multiple times.
179
+ #
180
+ # Raises:
181
+ # - StartError - an attempt to start the daemon was made, but the start
182
+ # command failed with an error.
183
+ # - StartTimeout - an attempt to start the daemon was made, but the daemon
184
+ # did not start in time, or it failed after it has gone into the background.
185
+ # - ConnectError - the daemon wasn't already running, but we couldn't connect
186
+ # to the daemon even after starting it.
187
def connect
  # A single connection attempt: run the caller's block and translate the
  # "cannot connect" exceptions into a nil result.
  attempt = lambda do
    begin
      yield
    rescue *ALLOWED_CONNECT_EXCEPTIONS
      nil
    end
  end

  # First try under a shared lock.
  connection = nil
  @lock_file.shared_lock { connection = attempt.call }
  return connection unless connection.nil?

  # The first attempt failed: take the exclusive lock, start the daemon if
  # it is not running, and try once more.
  @lock_file.exclusive_lock do
    start_without_locking unless daemon_is_running?
    connection = attempt.call
    if connection.nil?
      # Daemon is running but we couldn't connect to it. Possible
      # reasons:
      # - The daemon froze.
      # - Bizarre security restrictions.
      # - There's a bug in the yielded code.
      raise ConnectError, "Cannot connect to the daemon"
    end
    return connection
  end
end
221
+
222
+ # Stop the daemon and wait until it has exited.
223
+ #
224
+ # Raises:
225
+ # - StopError - the stop command failed.
226
+ # - StopTimeout - the daemon didn't stop in time.
227
def stop
  @lock_file.exclusive_lock do
    begin
      # Both issuing the stop and waiting for the process to disappear
      # count against the stop timeout.
      Timeout.timeout(@stop_timeout) do
        kill_daemon
        wait_until { !daemon_is_running? }
      end
    rescue Timeout::Error
      raise StopTimeout, "Daemon '#{@identifier}' did not exit in time"
    end
  end
end
241
+
242
+ # Returns the daemon's PID, as reported by its PID file. Returns the PID
243
+ # as an integer, or nil if there is no valid PID in the PID file.
244
+ #
245
+ # This method doesn't check whether the daemon's actually running.
246
+ # Use #running? if you want to check whether it's actually running.
247
+ #
248
+ # Raises SystemCallError or IOError if something went wrong during
249
+ # reading of the PID file.
250
def pid
  # Read the PID file while holding the shared lock; the explicit return
  # leaves the method straight from inside the lock block.
  @lock_file.shared_lock { return read_pid_file }
end
255
+
256
+ # Checks whether the daemon is still running. This is done by reading
257
+ # the PID file and then checking whether there is a process with that
258
+ # PID.
259
+ #
260
+ # Raises SystemCallError or IOError if something went wrong during
261
+ # reading of the PID file.
262
def running?
  # Perform the liveness check while holding the shared lock; the explicit
  # return leaves the method straight from inside the lock block.
  @lock_file.shared_lock { return daemon_is_running? }
end
267
+
268
+ private
269
# Starts the daemon without taking the lock file; callers are expected to
# hold the exclusive lock already.
#
# Returns true on success. Raises AlreadyStarted if the daemon is running,
# StartError if it did not come up, and StartTimeout if it did not come up
# in time (or appears to have died while starting).
def start_without_locking
  if daemon_is_running?
    raise AlreadyStarted, "Daemon '#{@identifier}' is already started"
  end

  save_log_file_information
  delete_pid_file

  # Log-file based liveness check shared by both wait loops below: any
  # change to the log file counts as activity; too long a silence is
  # reported as a timeout.
  check_activity = lambda do
    if log_file_has_changed?
      record_activity
    elsif no_activity?(@log_file_activity_timeout)
      raise Timeout::Error, "Daemon seems to have exited"
    end
  end

  outcome =
    begin
      responded = false
      # before_start runs outside the timeout block on purpose.
      before_start
      Timeout.timeout(@start_timeout) do
        spawn_daemon
        record_activity

        # We wait until the PID file is available and until
        # the daemon responds to pings, but we wait no longer
        # than @start_timeout seconds in total (including daemon
        # spawn time).
        # Furthermore, if the log file hasn't changed for
        # @log_file_activity_timeout seconds, and the PID file
        # still isn't available or the daemon still doesn't
        # respond to pings, then assume that the daemon has
        # terminated with an error.
        wait_until do
          check_activity.call
          pid_file_available?
        end
        wait_until(@ping_interval) do
          check_activity.call
          run_ping_command || !daemon_is_running?
        end
        # The loop above also ends when the daemon has exited, so ping
        # once more to find out which of the two happened.
        responded = run_ping_command
      end
      responded
    rescue Timeout::Error
      start_timed_out
      kill_daemon_with_signal if pid_file_available?
      :timeout
    end

  unless outcome
    raise(StartError, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start.")
  end
  if outcome == :timeout
    raise(StartTimeout, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start in time.")
  end
  true
end
328
+
329
# Invokes the :before_start callback, if one was configured, and returns
# its result; returns nil when there is no callback.
def before_start
  @before_start.call if @before_start
end
334
+
335
# Runs the configured start command.
def spawn_daemon
  run_command(@start_command)
end
338
+
339
# Asks the daemon to stop: via the stop command when one is configured,
# otherwise by signalling the PID from the PID file.
def kill_daemon
  return kill_daemon_with_signal unless @stop_command

  begin
    run_command(@stop_command)
  rescue StartError => e
    # run_command reports failures as StartError; re-label for a stop.
    raise StopError, e.message
  end
end
350
+
351
# Sends SIGTERM to the PID recorded in the PID file, if there is one.
def kill_daemon_with_signal
  begin
    daemon_pid = read_pid_file
    Process.kill('SIGTERM', daemon_pid) if daemon_pid
  rescue Errno::ESRCH, Errno::ENOENT
    # No such process, or no PID file: nothing to kill.
  end
end
358
+
359
# Returns whether the process named in the PID file exists. When the PID
# file names a process that no longer exists, the PID file is deleted.
def daemon_is_running?
  pid =
    begin
      read_pid_file
    rescue Errno::ENOENT
      # The PID file may not exist, or another thread/process
      # executing #running? may have just deleted the PID file.
      # So we catch this error.
      nil
    end

  return false if pid.nil?
  return true if check_pid(pid)

  delete_pid_file
  false
end
377
+
378
# Reads the PID file and returns its content as an Integer, or nil when the
# stripped content is not purely decimal digits. Errors from File.read
# (e.g. Errno::ENOENT) are not rescued here.
def read_pid_file
  contents = File.read(@pid_file).strip
  contents =~ /\A\d+\Z/ ? contents.to_i : nil
end
386
+
387
# Removes the PID file on a best-effort basis.
def delete_pid_file
  begin
    File.unlink(@pid_file)
  rescue Errno::EPERM, Errno::EACCES, Errno::ENOENT
    # Not allowed to remove it, or already gone: ignore.
  end
end
391
+
392
# Probes +pid+ with signal 0 and returns whether such a process exists.
def check_pid(pid)
  begin
    Process.kill(0, pid)
    alive = true
  rescue Errno::ESRCH
    alive = false
  rescue Errno::EPERM
    # We didn't have permission to kill the process. Either the process
    # is owned by someone else, or the system has draconian security
    # settings and we aren't allowed to kill *any* process. Assume that
    # the process is running.
    alive = true
  end
  alive
end
404
+
405
# Calls the block repeatedly, sleeping +sleep_interval+ seconds between
# calls, until the block returns a true value.
def wait_until(sleep_interval = 0.1)
  sleep(sleep_interval) until yield
end
410
+
411
# Polls every 0.1 seconds until either the PID file is available or the log
# file has changed.
#
# Returns whether the PID file is available at that point.
def wait_until_pid_file_is_available_or_log_file_has_changed
  until pid_file_available? || log_file_has_changed?
    sleep 0.1
  end
  # The original called the non-existent #pid_file_is_available? here,
  # which raised NoMethodError as soon as the loop finished.
  pid_file_available?
end
417
+
418
# Polls every @ping_interval seconds until the daemon answers a ping, is no
# longer running, or the log file has changed.
#
# Returns the result of one more ping performed after the loop ends.
def wait_until_daemon_responds_to_ping_or_has_exited_or_log_file_has_changed
  until run_ping_command || !daemon_is_running? || log_file_has_changed?
    sleep(@ping_interval)
  end
  run_ping_command
end
424
+
425
# Remembers the current time as the moment of the most recent activity;
# #no_activity? measures from this timestamp.
def record_activity
  @last_activity_time = Time.now
end
428
+
429
+ # Check whether there has been no recorded activity in the past +seconds+ seconds.
430
def no_activity?(seconds)
  # Time elapsed since the last #record_activity call.
  idle_time = Time.now - @last_activity_time
  idle_time > seconds
end
433
+
434
# Returns true when the PID file exists and is not empty, false otherwise.
#
# File.size? answers both questions in one call (nil for a missing or
# zero-length file). The previous exist?-then-stat sequence could raise
# Errno::ENOENT if the PID file was deleted between the two calls.
def pid_file_available?
  !File.size?(@pid_file).nil?
end
437
+
438
+ # This method does nothing and only serves as a hook for the unit test.
439
def start_timed_out
  # Intentionally empty: called from the timeout path of
  # #start_without_locking so that tests can observe it.
end
441
+
442
# Snapshots the log file's stat so later changes can be detected. Both the
# "original" and the "current" snapshot become nil when the stat fails.
def save_log_file_information
  snapshot =
    begin
      File.stat(@log_file)
    rescue StandardError
      nil
    end
  @original_log_file_stat = snapshot
  @current_log_file_stat = snapshot
end
446
+
447
# Reports whether the log file's mtime or size differs from the most recent
# snapshot, and advances the snapshot to the latest stat.
#
# Returns false when no snapshot exists, and true when the log file can no
# longer be stat'ed.
def log_file_has_changed?
  return false unless @current_log_file_stat

  latest =
    begin
      File.stat(@log_file)
    rescue StandardError
      nil
    end
  return true unless latest

  previous = @current_log_file_stat
  @current_log_file_stat = latest
  previous.mtime != latest.mtime || previous.size != latest.size
end
462
+
463
# Returns what was appended to the log file since the original snapshot,
# stripped of surrounding whitespace.
#
# Returns nil when there is no snapshot, nothing (but whitespace) was
# appended, or the log file no longer exists.
def differences_in_log_file
  return nil unless @original_log_file_stat

  appended = File.open(@log_file, 'r') do |log|
    # Skip everything that was already there when the snapshot was taken.
    log.seek(@original_log_file_stat.size, IO::SEEK_SET)
    log.read.strip
  end
  appended.empty? ? nil : appended
rescue Errno::ENOENT
  nil
end
480
+
481
# Builds the LockFile for this daemon; its path is the absolute PID file
# path with ".lock" appended. +identifier+ is currently not used.
def determine_lock_file(identifier, pid_file)
  lock_path = File.expand_path(pid_file + ".lock")
  LockFile.new(lock_path)
end
484
+
485
# Returns whether commands can be run through fork on this platform.
#
# The platform-name checks alone let mingw builds of Ruby on Windows
# through (their RUBY_PLATFORM contains "mingw", not "win32"), so the
# interpreter is additionally asked whether it implements fork at all.
def self.fork_supported?
  return RUBY_PLATFORM != "java" &&
    RUBY_PLATFORM !~ /win32|mswin|mingw/ &&
    Process.respond_to?(:fork)
end
488
+
489
# Runs +command+ and waits for it to exit. The command's stdout and stderr
# are captured in a temporary file, which is removed before returning.
#
# Raises StartError, with the captured output as its message, when the
# command does not exit with status 0.
def run_command(command)
  # Create tempfile for storing the command's output.
  tempfile = Tempfile.new('daemon-output')
  tempfile_path = tempfile.path
  File.chmod(0666, tempfile_path)
  tempfile.close

  if self.class.fork_supported?
    pid = safe_fork do
      STDIN.reopen("/dev/null", "r")
      STDOUT.reopen(tempfile_path, "w")
      # Point STDERR at the handle STDOUT already has open. Opening the
      # file a second time gives each stream its own file offset, so
      # they overwrite each other's output.
      STDERR.reopen(STDOUT)
      ObjectSpace.each_object(IO) do |obj|
        if STDIN != obj && STDOUT != obj && STDERR != obj
          obj.close rescue nil
        end
      end
      exec(command)
    end

    reaped = false
    begin
      Process.waitpid(pid) rescue nil
      reaped = true
    ensure
      # We only get here without having reaped the child when the wait
      # was interrupted, e.g. by the start timeout. A `rescue
      # Timeout::Error` is not reliable for this: current versions of
      # Timeout interrupt the block with an exception that only becomes
      # Timeout::Error at the Timeout.timeout call itself. `ensure`
      # runs either way, and the interrupting exception keeps
      # propagating afterwards.
      unless reaped
        # If the daemon doesn't fork into the background
        # in time, then kill it.
        Process.kill('SIGTERM', pid) rescue nil
        begin
          Timeout.timeout(5) do
            Process.waitpid(pid) rescue nil
          end
        rescue Timeout::Error
          Process.kill('SIGKILL', pid) rescue nil
          Process.waitpid(pid) rescue nil
        end
      end
    end

    if $?.exitstatus != 0
      raise StartError, File.read(tempfile_path).strip
    end
  else
    cmd = "#{command} >\"#{tempfile_path}\""
    # RUBY_PLATFORM replaces the PLATFORM constant, which no longer exists
    # in Ruby 1.9 and later. "2>&1" sends stderr to the handle stdout is
    # using, instead of opening (and truncating) the file a second time.
    cmd += " 2>&1" unless RUBY_PLATFORM =~ /mswin/
    if !system(cmd)
      raise StartError, File.read(tempfile_path).strip
    end
  end
ensure
  File.unlink(tempfile_path) rescue nil
end
537
+
538
# Pings the daemon once.
#
# A callable ping command is invoked and its return value is returned; if
# that value responds to #close it is closed first, ignoring errors from
# #close. One of the ALLOWED_CONNECT_EXCEPTIONS raised by the callable
# yields false. Any other ping command is run through Kernel#system.
def run_ping_command
  return system(@ping_command) unless @ping_command.respond_to?(:call)

  begin
    outcome = @ping_command.call
    if outcome.respond_to?(:close)
      begin
        outcome.close
      rescue StandardError
        nil
      end
    end
    outcome
  rescue *ALLOWED_CONNECT_EXCEPTIONS
    false
  end
end
553
+
554
# Forks and runs the block in the child process. The child never returns
# from this method: it always terminates through exit!, after reporting any
# exception raised by the block on STDERR.
#
# Returns the child's PID in the parent.
def safe_fork
  pid = fork
  if pid.nil?
    begin
      yield
    rescue Exception => e
      # Deliberately broad: nothing may escape from the child into the
      # caller's code path.
      backtrace = (e.backtrace || []).join("\n\tfrom ")
      message = "*** Exception #{e.class} " \
        "(#{e}) (process #{$$}):\n" \
        "\tfrom #{backtrace}"
      # Write the full report; the original built it but then wrote only
      # the bare exception, losing the class and the backtrace.
      STDERR.write(message)
      STDERR.flush
    ensure
      exit!
    end
  else
    return pid
  end
end
572
+ end