daemon_controller 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
# Gem specification for daemon_controller 0.2.2.
Gem::Specification.new do |s|
  s.name        = "daemon_controller"
  s.version     = "0.2.2"
  s.date        = "2009-11-06"
  s.summary     = "A library for implementing daemon management capabilities"
  s.email       = "hongli@phusion.nl"
  s.homepage    = "http://github.com/FooBarWidget/daemon_controller/tree/master"
  s.description = "A library for robust daemon management."
  s.authors     = ["Hongli Lai"]

  # NOTE(review): has_rdoc is believed to be deprecated in RubyGems -
  # confirm against the RubyGems version in use. Only call the setter
  # when it exists, so that the spec keeps loading on RubyGems releases
  # that no longer provide it.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)

  s.files = [
    "README.markdown", "LICENSE.txt", "daemon_controller.gemspec",
    "lib/daemon_controller.rb",
    "lib/daemon_controller/lock_file.rb",
    "spec/test_helper.rb",
    "spec/daemon_controller_spec.rb",
    "spec/echo_server.rb"
  ]
end
@@ -0,0 +1,626 @@
1
+ # daemon_controller, library for robust daemon management
2
+ # Copyright (c) 2008 Phusion
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ # of this software and associated documentation files (the "Software"), to deal
6
+ # in the Software without restriction, including without limitation the rights
7
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included in
12
+ # all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ # THE SOFTWARE.
21
+
22
+ require 'tempfile'
23
+ require 'fcntl'
24
+ require 'timeout'
25
+ require File.expand_path(File.dirname(__FILE__) << '/daemon_controller/lock_file')
26
+
27
+ # Main daemon controller object. See the README for an introduction and tutorial.
28
+ class DaemonController
29
# Exceptions which, when raised by a ping Proc or a #connect block, are
# rescued and treated as "the daemon cannot be connected to".
ALLOWED_CONNECT_EXCEPTIONS = [
  Errno::ECONNREFUSED,
  Errno::ENETUNREACH,
  Errno::ETIMEDOUT,
  Errno::ECONNRESET,
  Errno::EINVAL,
  Errno::EADDRNOTAVAIL
]

# Base class of the errors defined by DaemonController.
class Error < StandardError; end

# Common ancestor of the start/stop timeout errors.
class TimeoutError < Error; end

# Raised by #start when the daemon is already running.
class AlreadyStarted < Error; end

# Raised when the start command fails.
class StartError < Error; end

# Raised when the daemon does not start in time.
class StartTimeout < TimeoutError; end

# Raised when the stop command fails.
class StopError < Error; end

# Raised when the daemon does not exit in time.
class StopTimeout < TimeoutError; end

# Raised by #connect when the daemon cannot be connected to.
class ConnectError < Error; end
49
+
50
+ # Create a new DaemonController object.
51
+ #
52
+ # === Mandatory options
53
+ #
54
+ # [:identifier]
55
+ # A human-readable, unique name for this daemon, e.g. "Sphinx search server".
56
+ # This identifier will be used in some error messages. On some platforms, it will
57
+ # be used for concurrency control: on such platforms, no two DaemonController
58
+ # objects will operate on the same identifier at the same time.
59
+ #
60
+ # [:start_command]
61
+ # The command to start the daemon. This must be a String, e.g.
62
+ # "mongrel_rails start -e production".
63
+ #
64
+ # [:ping_command]
65
+ # The ping command is used to check whether the daemon can be connected to.
66
+ # It is also used to ensure that #start only returns when the daemon can be
67
+ # connected to.
68
+ #
69
+ # The value may be a command string. This command must exit with an exit code of
70
+ # 0 if the daemon can be successfully connected to, or exit with a non-0 exit
71
+ # code on failure.
72
+ #
73
+ # The value may also be a Proc, which returns an expression that evaluates to
74
+ # true (indicating that the daemon can be connected to) or false (failure).
75
+ # If the Proc raises Errno::ECONNREFUSED, Errno::ENETUNREACH, Errno::ETIMEDOUT
76
+ # or Errno::ECONNRESET, Errno::EINVAL and Errno::EADDRNOTAVAIL then that also
77
+ # means that the daemon cannot be connected to.
78
+ # <b>NOTE:</b> if the ping command returns an object which responds to
79
+ # <tt>#close</tt>, then that method will be called on the return value.
80
+ # This makes it possible to specify a ping command such as
81
+ # <tt>lambda { TCPSocket.new('localhost', 1234) }</tt>, without having to worry
82
+ # about closing it afterwards.
83
+ # Any exceptions raised by #close are ignored.
84
+ #
85
+ # [:pid_file]
86
+ # The PID file that the daemon will write to. Used to check whether the daemon
87
+ # is running.
88
+ #
89
+ # [:log_file]
90
+ # The log file that the daemon will write to. It will be consulted to see
91
+ # whether the daemon has printed any error messages during startup.
92
+ #
93
+ # === Optional options
94
+ # [:stop_command]
95
+ # A command to stop the daemon with, e.g. "/etc/rc.d/nginx stop". If no stop
96
+ # command is given (i.e. +nil+), then DaemonController will stop the daemon
97
+ # by killing the PID written in the PID file.
98
+ #
99
+ # The default value is +nil+.
100
+ #
101
+ # [:before_start]
102
+ # This may be a Proc. It will be called just before running the start command.
103
+ # The before_start proc is not subject to the start timeout.
104
+ #
105
+ # [:start_timeout]
106
+ # The maximum amount of time, in seconds, that #start may take to start
107
+ # the daemon. Since #start also waits until the daemon can be connected to,
108
+ # that wait time is counted as well. If the daemon does not start in time,
109
+ # then #start will raise an exception.
110
+ #
111
+ # The default value is 15.
112
+ #
113
+ # [:stop_timeout]
114
+ # The maximum amount of time, in seconds, that #stop may take to stop
115
+ # the daemon. Since #stop also waits until the daemon is no longer running,
116
+ # that wait time is counted as well. If the daemon does not stop in time,
117
+ # then #stop will raise an exception.
118
+ #
119
+ # The default value is 15.
120
+ #
121
+ # [:log_file_activity_timeout]
122
+ # Once a daemon has gone into the background, it will become difficult to
123
+ # know for certain whether it is still initializing or whether it has
124
+ # failed and exited, until it has written its PID file. It's 99.9% probable
125
+ # that the daemon has terminated with an error if its start timeout has expired, but
126
+ # not many system administrators want to wait 15 seconds (the default start
127
+ # timeout) to be notified of whether the daemon has terminated with an error.
128
+ #
129
+ # An alternative way to check whether the daemon has terminated with an error,
130
+ # is by checking whether its log file has been recently updated. If, after the
131
+ # daemon has started, the log file hasn't been updated for the amount of seconds
132
+ # given by the :log_file_activity_timeout option, then the daemon is assumed to
133
+ # have terminated with an error.
134
+ #
135
+ # The default value is 7.
136
def initialize(options)
  # Every mandatory key has to be present in the hash; the values
  # themselves are not inspected here.
  mandatory = [:identifier, :start_command, :ping_command, :pid_file, :log_file]
  mandatory.each do |option|
    next if options.has_key?(option)
    raise ArgumentError, "The ':#{option}' option is mandatory."
  end

  # Mandatory settings.
  @identifier    = options[:identifier]
  @start_command = options[:start_command]
  @ping_command  = options[:ping_command]
  @pid_file      = options[:pid_file]
  @log_file      = options[:log_file]

  # Optional settings without a default (nil when not given).
  @stop_command = options[:stop_command]
  @before_start = options[:before_start]

  # Optional settings with defaults. :ping_interval is the sleep interval
  # (in seconds) between two pings while waiting for the daemon.
  @ping_interval             = options[:ping_interval] || 0.1
  @start_timeout             = options[:start_timeout] || 15
  @stop_timeout              = options[:stop_timeout] || 15
  @log_file_activity_timeout = options[:log_file_activity_timeout] || 7

  # The lock file is derived from the PID file, so it is set up last.
  @lock_file = determine_lock_file(@identifier, @pid_file)
end
155
+
156
+ # Start the daemon and wait until it can be pinged.
157
+ #
158
+ # Raises:
159
+ # - AlreadyStarted - the daemon is already running.
160
+ # - StartError - the start command failed.
161
+ # - StartTimeout - the daemon did not start in time. This could also
162
+ # mean that the daemon failed after it has gone into the background.
163
def start
  # Hold the exclusive lock for the whole start sequence.
  @lock_file.exclusive_lock { start_without_locking }
end
168
+
169
+ # Connect to the daemon by running the given block, which contains the
170
+ # connection logic. If the daemon isn't already running, then it will be
171
+ # started.
172
+ #
173
+ # The block must return nil or raise Errno::ECONNREFUSED, Errno::ENETUNREACH,
174
+ # Errno::ETIMEDOUT, Errno::ECONNRESET, Errno::EINVAL or Errno::EADDRNOTAVAIL
175
+ # to indicate that the daemon cannot be
176
+ # connected to. It must return non-nil if the daemon can be connected to.
177
+ # Upon successful connection, the return value of the block will
178
+ # be returned by #connect.
179
+ #
180
+ # Note that the block may be called multiple times.
181
+ #
182
+ # Raises:
183
+ # - StartError - an attempt to start the daemon was made, but the start
184
+ # command failed with an error.
185
+ # - StartTimeout - an attempt to start the daemon was made, but the daemon
186
+ # did not start in time, or it failed after it has gone into the background.
187
+ # - ConnectError - the daemon wasn't already running, but we couldn't connect
188
+ # to the daemon even after starting it.
189
def connect
  connection = nil

  # First attempt: only a shared lock is needed to talk to a daemon that
  # is (hopefully) already running.
  @lock_file.shared_lock do
    begin
      connection = yield
    rescue *ALLOWED_CONNECT_EXCEPTIONS
      connection = nil
    end
  end
  return connection unless connection.nil?

  # Second attempt: take the exclusive lock, start the daemon if it is
  # not running and try the block once more.
  @lock_file.exclusive_lock do
    start_without_locking unless daemon_is_running?

    failure = nil
    begin
      connection = yield
    rescue *ALLOWED_CONNECT_EXCEPTIONS => e
      connection = nil
      failure = e
    end
    return connection unless connection.nil?

    # Daemon is running but we couldn't connect to it. Possible
    # reasons:
    # - The daemon froze.
    # - Bizarre security restrictions.
    # - There's a bug in the yielded code.
    if failure
      raise ConnectError, "Cannot connect to the daemon: #{failure} (#{failure.class})"
    end
    raise ConnectError, "Cannot connect to the daemon"
  end
end
229
+
230
+ # Stop the daemon and wait until it has exited.
231
+ #
232
+ # Raises:
233
+ # - StopError - the stop command failed.
234
+ # - StopTimeout - the daemon didn't stop in time.
235
def stop
  @lock_file.exclusive_lock do
    begin
      # Both issuing the stop request and waiting for the process to
      # disappear count towards @stop_timeout.
      Timeout.timeout(@stop_timeout, Timeout::Error) do
        kill_daemon
        wait_until { !daemon_is_running? }
      end
    rescue Timeout::Error
      raise StopTimeout, "Daemon '#{@identifier}' did not exit in time"
    end
  end
end
249
+
250
+ # Returns the daemon's PID, as reported by its PID file. Returns the PID
251
+ # as an integer, or nil if there is no valid PID in the PID file.
252
+ #
253
+ # This method doesn't check whether the daemon's actually running.
254
+ # Use #running? if you want to check whether it's actually running.
255
+ #
256
+ # Raises SystemCallError or IOError if something went wrong during
257
+ # reading of the PID file.
258
def pid
  # The explicit return makes the PID the method's result regardless of
  # what shared_lock itself returns.
  @lock_file.shared_lock { return read_pid_file }
end
263
+
264
+ # Checks whether the daemon is still running. This is done by reading
265
+ # the PID file and then checking whether there is a process with that
266
+ # PID.
267
+ #
268
+ # Raises SystemCallError or IOError if something went wrong during
269
+ # reading of the PID file.
270
def running?
  # The explicit return makes the check's outcome the method's result
  # regardless of what shared_lock itself returns.
  @lock_file.shared_lock { return daemon_is_running? }
end
275
+
276
+ private
277
# Starts the daemon without touching the lock file; #start and #connect
# call this while holding the exclusive lock.
#
# Returns true when the daemon was started and responds to pings.
#
# Raises:
# - AlreadyStarted - the daemon is already running.
# - StartError - the start command failed, or the daemon does not respond
#   to pings after startup.
# - StartTimeout - startup exceeded @start_timeout, or the log file showed
#   no activity for @log_file_activity_timeout seconds.
def start_without_locking
  if daemon_is_running?
    raise AlreadyStarted, "Daemon '#{@identifier}' is already started"
  end
  save_log_file_information
  delete_pid_file
  begin
    started = false
    # The before_start hook runs outside of the start timeout.
    before_start
    Timeout.timeout(@start_timeout, Timeout::Error) do
      spawn_daemon
      record_activity

      # We wait until the PID file is available and until
      # the daemon responds to pings, but we wait no longer
      # than @start_timeout seconds in total (including daemon
      # spawn time).
      # Furthermore, if the log file hasn't changed for
      # @log_file_activity_timeout seconds, and the PID file
      # still isn't available or the daemon still doesn't
      # respond to pings, then assume that the daemon has
      # terminated with an error.
      wait_until do
        check_startup_activity
        pid_file_available?
      end
      wait_until(@ping_interval) do
        check_startup_activity
        # Stop waiting as soon as the daemon answers or has died.
        run_ping_command || !daemon_is_running?
      end
      started = run_ping_command
    end
    result = started
  rescue Timeout::Error
    start_timed_out
    # Clean up a daemon that got far enough to write its PID file.
    kill_daemon_with_signal if pid_file_available?
    result = :timeout
  end

  # Prefer whatever the daemon appended to its log file as the error
  # message; fall back to a generic message otherwise.
  if !result
    raise(StartError, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start.")
  elsif result == :timeout
    raise(StartTimeout, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start in time.")
  else
    return true
  end
end

# Startup watchdog: records activity whenever the log file has changed, and
# raises Timeout::Error once there has been no activity for
# @log_file_activity_timeout seconds.
def check_startup_activity
  if log_file_has_changed?
    record_activity
  elsif no_activity?(@log_file_activity_timeout)
    raise Timeout::Error, "Daemon seems to have exited"
  end
end
336
+
337
# Invokes the optional :before_start callback, if one was configured.
def before_start
  @before_start.call if @before_start
end
342
+
343
# Runs the configured start command through run_command.
def spawn_daemon
  run_command(@start_command)
end
346
+
347
# Asks the daemon to stop: through the stop command when one is configured,
# otherwise by signalling the PID found in the PID file.
def kill_daemon
  return kill_daemon_with_signal unless @stop_command

  begin
    run_command(@stop_command)
  rescue StartError => failure
    # run_command reports every failed command as a StartError; translate
    # it for the stop case.
    raise StopError, failure.message
  end
end
358
+
359
# Sends SIGTERM to the PID stored in the PID file. A missing PID file, an
# invalid PID or an already vanished process are all silently ignored.
def kill_daemon_with_signal
  begin
    target = read_pid_file
    Process.kill('SIGTERM', target) if target
  rescue Errno::ESRCH, Errno::ENOENT
    nil
  end
end
366
+
367
# Tells whether the process named in the PID file exists. When the PID file
# points to a process that no longer exists, the stale PID file is deleted.
def daemon_is_running?
  known_pid = begin
    read_pid_file
  rescue Errno::ENOENT
    # The PID file may not exist, or another thread/process
    # executing #running? may have just deleted the PID file.
    nil
  end

  return false if known_pid.nil?
  return true if check_pid(known_pid)

  delete_pid_file
  false
end
385
+
386
# Reads the PID file and returns its content as an Integer, or nil when the
# whitespace-stripped content does not consist of digits only. Errors from
# File.read (e.g. Errno::ENOENT) are not rescued here.
def read_pid_file
  contents = File.read(@pid_file).strip
  contents =~ /\A\d+\Z/ ? contents.to_i : nil
end
394
+
395
# Removes the PID file on a best-effort basis: permission problems and a
# missing file are ignored.
def delete_pid_file
  begin
    File.unlink(@pid_file)
  rescue Errno::EPERM, Errno::EACCES, Errno::ENOENT
    nil
  end
end
399
+
400
# Probes for a process with the given PID by sending it signal 0.
# Returns true when the process exists (or cannot be probed because of
# missing permissions), false when there is no such process.
def check_pid(pid)
  begin
    Process.kill(0, pid)
    true
  rescue Errno::ESRCH
    false
  rescue Errno::EPERM
    # We didn't have permission to kill the process. Either the process
    # is owned by someone else, or the system has draconian security
    # settings and we aren't allowed to kill *any* process. Assume that
    # the process is running.
    true
  end
end
412
+
413
# Polls the given block until it returns a true value, sleeping
# +sleep_interval+ seconds after every unsuccessful attempt.
def wait_until(sleep_interval = 0.1)
  sleep(sleep_interval) until yield
end
418
+
419
# Blocks (polling every 0.1 seconds) until either the PID file is available
# or the log file has changed. Returns whether the PID file is available.
def wait_until_pid_file_is_available_or_log_file_has_changed
  sleep 0.1 until pid_file_available? || log_file_has_changed?
  # The predicate is named pid_file_available?; the former call to
  # pid_file_is_available? referred to a method that does not exist.
  pid_file_available?
end
425
+
426
# Blocks (polling every @ping_interval seconds) until the daemon answers a
# ping, is no longer running, or the log file has changed. Returns the
# result of one final ping.
def wait_until_daemon_responds_to_ping_or_has_exited_or_log_file_has_changed
  until run_ping_command || !daemon_is_running? || log_file_has_changed?
    sleep(@ping_interval)
  end
  run_ping_command
end
432
+
433
# Remembers the current time as the moment of the last observed activity;
# no_activity? measures against this timestamp.
def record_activity
  @last_activity_time = Time.now
end
436
+
437
+ # Check whether there has been no recorded activity in the past +seconds+ seconds.
438
def no_activity?(seconds)
  # Time since the last call to record_activity, in seconds.
  idle_time = Time.now - @last_activity_time
  idle_time > seconds
end
441
+
442
# Tells whether the PID file exists and is non-empty.
def pid_file_available?
  return false unless File.exist?(@pid_file)
  File.stat(@pid_file).size != 0
end
445
+
446
+ # This method does nothing and only serves as a hook for the unit test.
447
def start_timed_out
  # Intentionally empty: exists only so that the unit test can hook in.
end
449
+
450
# Takes a File.stat snapshot of the log file (nil when it cannot be
# stat'ed) and stores it both as the original and as the current state.
def save_log_file_information
  snapshot = begin
    File.stat(@log_file)
  rescue StandardError
    nil
  end
  @original_log_file_stat = snapshot
  @current_log_file_stat = snapshot
end
454
+
455
# Compares the log file's mtime and size with the most recent snapshot and
# then makes the fresh stat the new snapshot. Returns false when there is
# no snapshot, and true when the log file can no longer be stat'ed.
def log_file_has_changed?
  previous = @current_log_file_stat
  return false unless previous

  latest = begin
    File.stat(@log_file)
  rescue StandardError
    nil
  end
  return true unless latest

  @current_log_file_stat = latest
  previous.mtime != latest.mtime || previous.size != latest.size
end
470
+
471
# Returns the whitespace-stripped text that was appended to the log file
# after the original snapshot was taken. Returns nil when there is no
# snapshot, when nothing (or only whitespace) was appended, or when the log
# file does not exist.
def differences_in_log_file
  return nil unless @original_log_file_stat

  appended = File.open(@log_file, 'r') do |log|
    # Skip everything that was already there before the start attempt.
    log.seek(@original_log_file_stat.size, IO::SEEK_SET)
    log.read.strip
  end
  appended.empty? ? nil : appended
rescue Errno::ENOENT
  nil
end
488
+
489
# Builds the LockFile for this controller. Its path is the absolute PID
# file path with ".lock" appended; +identifier+ is not used here.
def determine_lock_file(identifier, pid_file)
  lock_path = File.expand_path(pid_file + ".lock")
  LockFile.new(lock_path)
end
492
+
493
# Tells whether Kernel#fork can be used on this platform.
#
# JRuby (RUBY_PLATFORM == "java") and native Windows builds are excluded.
# Besides "win32", the "mingw" and "mswin" platform strings are matched as
# well, and the interpreter itself must report that it implements
# Process.fork.
def self.fork_supported?
  return RUBY_PLATFORM != "java" &&
    RUBY_PLATFORM !~ /win32|mingw|mswin/ &&
    Process.respond_to?(:fork)
end
496
+
497
# Runs +command+ with stdin connected to /dev/null and with stdout and
# stderr captured in a temporary file, and waits until it exits.
#
# Returns nil. Raises StartError, with the captured output as message, when
# the command exits unsuccessfully. A Timeout::Error that interrupts the
# wait causes the command to be killed (SIGTERM, then SIGKILL after 5
# seconds) and is then re-raised.
def run_command(command)
  # Create tempfile for storing the command's output.
  tempfile = Tempfile.new('daemon-output')
  tempfile_path = tempfile.path
  File.chmod(0666, tempfile_path)
  tempfile.close

  if self.class.fork_supported? || Process.respond_to?(:spawn)
    if Process.respond_to?(:spawn)
      # stdout and stderr share a single open file. Opening the file
      # once per stream would give each stream its own file offset, so
      # that the two outputs overwrite each other.
      pid = Process.spawn(command,
        :in => "/dev/null",
        [:out, :err] => [tempfile_path, "w"],
        :close_others => true
      )
    else
      pid = safe_fork do
        ObjectSpace.each_object(IO) do |obj|
          obj.close rescue nil
        end
        STDIN.reopen("/dev/null", "r")
        STDOUT.reopen(tempfile_path, "w")
        # Share stdout's open file instead of opening the file a
        # second time (see above).
        STDERR.reopen(STDOUT)
        exec(command)
      end
    end

    # run_command might be running in a timeout block (like
    # in #start_without_locking).
    begin
      interruptable_waitpid(pid)
    rescue Errno::ECHILD
      # Maybe a background thread or whatever waitpid()'ed
      # this child process before we had the chance. There's
      # no way to obtain the exit status now. Assume that
      # it started successfully; if it didn't we'll know
      # that later by checking the PID file and by pinging
      # it.
      return
    rescue Timeout::Error
      # If the daemon doesn't fork into the background
      # in time, then kill it.
      begin
        Process.kill('SIGTERM', pid)
      rescue SystemCallError
      end
      begin
        Timeout.timeout(5, Timeout::Error) do
          begin
            interruptable_waitpid(pid)
          rescue SystemCallError
          end
        end
      rescue Timeout::Error
        begin
          Process.kill('SIGKILL', pid)
          interruptable_waitpid(pid)
        rescue SystemCallError
        end
      end
      raise
    end
    if $?.exitstatus != 0
      raise StartError, File.read(tempfile_path).strip
    end
  else
    cmd = "#{command} >\"#{tempfile_path}\""
    # PLATFORM no longer exists on Ruby 1.9+; RUBY_PLATFORM is available
    # everywhere. "2>&1" sends stderr to the file that stdout already
    # has open.
    cmd += " 2>&1" unless RUBY_PLATFORM =~ /mswin/
    if !system(cmd)
      raise StartError, File.read(tempfile_path).strip
    end
  end
ensure
  File.unlink(tempfile_path) rescue nil
end
572
+
573
# Pings the daemon once.
#
# A callable ping command is called; a result that responds to #close is
# closed (errors from #close are ignored) and the result is returned. Any
# of the ALLOWED_CONNECT_EXCEPTIONS raised by the callable yields false.
# Any other ping command is handed to Kernel#system.
def run_ping_command
  return system(@ping_command) unless @ping_command.respond_to?(:call)

  begin
    outcome = @ping_command.call
    if outcome.respond_to?(:close)
      begin
        outcome.close
      rescue StandardError
        nil
      end
    end
    outcome
  rescue *ALLOWED_CONNECT_EXCEPTIONS
    false
  end
end
588
+
589
# Forks and runs the given block in the child process. The parent gets the
# child's PID back. The child never returns from this method: it always
# ends with exit!, and any exception raised by the block is reported on
# STDERR together with its backtrace.
def safe_fork
  pid = fork
  return pid unless pid.nil?

  begin
    yield
  rescue Exception => e
    # Rescuing Exception is deliberate: nothing may propagate out of the
    # child process.
    message = "*** Exception #{e.class} " <<
      "(#{e}) (process #{$$}):\n" <<
      "\tfrom " << Array(e.backtrace).join("\n\tfrom ")
    # Write the formatted message (including the backtrace), not just
    # the bare exception.
    STDERR.write(message)
    STDERR.flush
  ensure
    exit!
  end
end
607
+
608
# interruptable_waitpid(pid) waits until the child process +pid+ has exited
# and returns the value of Process.waitpid.
if RUBY_VERSION < "1.9"
  def interruptable_waitpid(pid)
    Process.waitpid(pid)
  end
else
  # On Ruby 1.9, Thread#kill (which is called by timeout.rb) may
  # not be able to interrupt Process.waitpid. So here we use a
  # special version that's a bit less efficient but is at least
  # interruptable.
  def interruptable_waitpid(pid)
    loop do
      reaped = Process.waitpid(pid, Process::WNOHANG)
      return reaped if reaped
      sleep 0.01
    end
  end
end
626
+ end