daemon_controller 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
# Gem specification for daemon_controller 0.2.2.
#
# NOTE: the former `s.has_rdoc = true` assignment has been dropped on purpose:
# RubyGems deprecated that attribute and ignores its value, so setting it only
# produces a deprecation warning (or an error once the setter is removed).
Gem::Specification.new do |s|
  s.name        = "daemon_controller"
  s.version     = "0.2.2"
  s.date        = "2009-11-06"
  s.summary     = "A library for implementing daemon management capabilities"
  s.description = "A library for robust daemon management."
  s.email       = "hongli@phusion.nl"
  s.homepage    = "http://github.com/FooBarWidget/daemon_controller/tree/master"
  s.authors     = ["Hongli Lai"]

  # Every file that is shipped inside the packaged gem.
  s.files = [
    "README.markdown", "LICENSE.txt", "daemon_controller.gemspec",
    "lib/daemon_controller.rb",
    "lib/daemon_controller/lock_file.rb",
    "spec/test_helper.rb",
    "spec/daemon_controller_spec.rb",
    "spec/echo_server.rb"
  ]
end
@@ -0,0 +1,626 @@
1
+ # daemon_controller, library for robust daemon management
2
+ # Copyright (c) 2008 Phusion
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ # of this software and associated documentation files (the "Software"), to deal
6
+ # in the Software without restriction, including without limitation the rights
7
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included in
12
+ # all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ # THE SOFTWARE.
21
+
22
+ require 'tempfile'
23
+ require 'fcntl'
24
+ require 'timeout'
25
+ require File.expand_path(File.dirname(__FILE__) << '/daemon_controller/lock_file')
26
+
27
+ # Main daemon controller object. See the README for an introduction and tutorial.
28
+ class DaemonController
29
# Exceptions that are interpreted as "the daemon cannot be connected to"
# when raised by a connect block or by a ping Proc. Frozen so that the
# shared list cannot be modified by accident at runtime.
ALLOWED_CONNECT_EXCEPTIONS = [
  Errno::ECONNREFUSED, Errno::ENETUNREACH,
  Errno::ETIMEDOUT, Errno::ECONNRESET, Errno::EINVAL,
  Errno::EADDRNOTAVAIL
].freeze
32
+
33
# Root of the DaemonController exception hierarchy.
class Error < StandardError; end

# Common ancestor of the start/stop timeout errors.
class TimeoutError < Error; end

# Raised when a start is requested while the daemon is already running.
class AlreadyStarted < Error; end

# Raised when the start command fails.
class StartError < Error; end

# Raised when the daemon does not start in time.
class StartTimeout < TimeoutError; end

# Raised when the stop command fails.
class StopError < Error; end

# Raised when the daemon does not stop in time.
class StopTimeout < TimeoutError; end

# Raised when the daemon cannot be connected to, even after starting it.
class ConnectError < Error; end
49
+
50
+ # Create a new DaemonController object.
51
+ #
52
+ # === Mandatory options
53
+ #
54
+ # [:identifier]
55
+ # A human-readable, unique name for this daemon, e.g. "Sphinx search server".
56
+ # This identifier will be used in some error messages. On some platforms, it will
57
+ # be used for concurrency control: on such platforms, no two DaemonController
58
+ # objects will operate on the same identifier at the same time.
59
+ #
60
+ # [:start_command]
61
+ # The command to start the daemon. This must be a String, e.g.
62
+ # "mongrel_rails start -e production".
63
+ #
64
+ # [:ping_command]
65
+ # The ping command is used to check whether the daemon can be connected to.
66
+ # It is also used to ensure that #start only returns when the daemon can be
67
+ # connected to.
68
+ #
69
+ # The value may be a command string. This command must exit with an exit code of
70
+ # 0 if the daemon can be successfully connected to, or exit with a non-0 exit
71
+ # code on failure.
72
+ #
73
+ # The value may also be a Proc, which returns an expression that evaluates to
74
+ # true (indicating that the daemon can be connected to) or false (failure).
75
+ # If the Proc raises Errno::ECONNREFUSED, Errno::ENETUNREACH, Errno::ETIMEDOUT
76
+ # or Errno::ECONNRESET, Errno::EINVAL and Errno::EADDRNOTAVAIL then that also
77
+ # means that the daemon cannot be connected to.
78
+ # <b>NOTE:</b> if the ping command returns an object which responds to
79
+ # <tt>#close</tt>, then that method will be called on the return value.
80
+ # This makes it possible to specify a ping command such as
81
+ # <tt>lambda { TCPSocket.new('localhost', 1234) }</tt>, without having to worry
82
+ # about closing it afterwards.
83
+ # Any exceptions raised by #close are ignored.
84
+ #
85
+ # [:pid_file]
86
+ # The PID file that the daemon will write to. Used to check whether the daemon
87
+ # is running.
88
+ #
89
+ # [:log_file]
90
+ # The log file that the daemon will write to. It will be consulted to see
91
+ # whether the daemon has printed any error messages during startup.
92
+ #
93
+ # === Optional options
94
+ # [:stop_command]
95
+ # A command to stop the daemon with, e.g. "/etc/rc.d/nginx stop". If no stop
96
+ # command is given (i.e. +nil+), then DaemonController will stop the daemon
97
+ # by killing the PID written in the PID file.
98
+ #
99
+ # The default value is +nil+.
100
+ #
101
+ # [:before_start]
102
+ # This may be a Proc. It will be called just before running the start command.
103
+ # The before_start proc is not subject to the start timeout.
104
+ #
105
+ # [:start_timeout]
106
+ # The maximum amount of time, in seconds, that #start may take to start
107
+ # the daemon. Since #start also waits until the daemon can be connected to,
108
+ # that wait time is counted as well. If the daemon does not start in time,
109
+ # then #start will raise an exception.
110
+ #
111
+ # The default value is 15.
112
+ #
113
+ # [:stop_timeout]
114
+ # The maximum amount of time, in seconds, that #stop may take to stop
115
+ # the daemon. Since #stop also waits until the daemon is no longer running,
116
+ # that wait time is counted as well. If the daemon does not stop in time,
117
+ # then #stop will raise an exception.
118
+ #
119
+ # The default value is 15.
120
+ #
121
+ # [:log_file_activity_timeout]
122
+ # Once a daemon has gone into the background, it will become difficult to
123
+ # know for certain whether it is still initializing or whether it has
124
+ # failed and exited, until it has written its PID file. It's 99.9% probable
125
+ # that the daemon has terminated with an error if its start timeout has expired, but
126
+ # not many system administrators want to wait 15 seconds (the default start
127
+ # timeout) to be notified of whether the daemon has terminated with an error.
128
+ #
129
+ # An alternative way to check whether the daemon has terminated with an error,
130
+ # is by checking whether its log file has been recently updated. If, after the
131
+ # daemon has started, the log file hasn't been updated for the amount of seconds
132
+ # given by the :log_file_activity_timeout option, then the daemon is assumed to
133
+ # have terminated with an error.
134
+ #
135
+ # The default value is 7.
136
def initialize(options)
  # Reject the option set as soon as one mandatory key is absent.
  mandatory = [:identifier, :start_command, :ping_command, :pid_file, :log_file]
  mandatory.each do |name|
    next if options.has_key?(name)
    raise ArgumentError, "The ':#{name}' option is mandatory."
  end

  # Mandatory settings.
  @identifier    = options[:identifier]
  @start_command = options[:start_command]
  @ping_command  = options[:ping_command]
  @pid_file      = options[:pid_file]
  @log_file      = options[:log_file]

  # Optional settings; a missing (or nil/false) value selects the default.
  @stop_command  = options[:stop_command]
  @before_start  = options[:before_start]
  # :ping_interval is the sleep time (in seconds) between ping attempts.
  @ping_interval = options[:ping_interval] || 0.1
  @start_timeout = options[:start_timeout] || 15
  @stop_timeout  = options[:stop_timeout] || 15
  @log_file_activity_timeout = options[:log_file_activity_timeout] || 7

  @lock_file = determine_lock_file(@identifier, @pid_file)
end
155
+
156
# Starts the daemon while holding the exclusive lock, and waits until the
# daemon can be pinged.
#
# Raises:
# - AlreadyStarted - the daemon is already running.
# - StartError - the start command failed.
# - StartTimeout - the daemon did not start in time. This may also mean
#   that the daemon failed after it went into the background.
def start
  @lock_file.exclusive_lock { start_without_locking }
end
168
+
169
# Connects to the daemon by running the given block, which holds the
# connection logic. The daemon is started first if it isn't running yet.
#
# The block signals "cannot connect" by returning nil or by raising one of
# the exceptions in ALLOWED_CONNECT_EXCEPTIONS; any non-nil return value
# counts as a successful connection and is returned by #connect.
#
# Note that the block may be called more than once.
#
# Raises:
# - StartError - an attempt to start the daemon was made, but the start
#   command failed with an error.
# - StartTimeout - an attempt to start the daemon was made, but the daemon
#   did not start in time, or it failed after it went into the background.
# - ConnectError - the daemon could not be connected to, even after
#   making sure that it is started.
def connect
  connection = nil

  # First attempt: only a shared lock is needed to try connecting.
  @lock_file.shared_lock do
    connection = begin
      yield
    rescue *ALLOWED_CONNECT_EXCEPTIONS
      nil
    end
  end
  return connection unless connection.nil?

  # Second attempt: make sure the daemon runs, then try once more.
  @lock_file.exclusive_lock do
    start_without_locking unless daemon_is_running?

    failure = nil
    begin
      connection = yield
    rescue *ALLOWED_CONNECT_EXCEPTIONS => e
      connection = nil
      failure = e
    end
    return connection unless connection.nil?

    # The daemon is running but we couldn't connect to it. Possible
    # reasons:
    # - The daemon froze.
    # - Bizarre security restrictions.
    # - There's a bug in the yielded code.
    if failure
      raise ConnectError, "Cannot connect to the daemon: #{failure} (#{failure.class})"
    end
    raise ConnectError, "Cannot connect to the daemon"
  end
end
229
+
230
# Stops the daemon while holding the exclusive lock, and waits until it is
# no longer running. The whole operation is bounded by @stop_timeout.
#
# Raises:
# - StopError - the stop command failed.
# - StopTimeout - the daemon didn't stop in time.
def stop
  @lock_file.exclusive_lock do
    begin
      Timeout.timeout(@stop_timeout, Timeout::Error) do
        kill_daemon
        wait_until { !daemon_is_running? }
      end
    rescue Timeout::Error
      raise StopTimeout, "Daemon '#{@identifier}' did not exit in time"
    end
  end
end
249
+
250
# Returns the daemon's PID as reported by its PID file: an integer, or nil
# if the PID file doesn't contain a valid PID. The file is read while
# holding the shared lock.
#
# This method doesn't verify that the daemon is actually running; use
# #running? for that.
#
# Raises SystemCallError or IOError if something went wrong while reading
# the PID file.
def pid
  @lock_file.shared_lock { return read_pid_file }
end
263
+
264
# Tells whether the daemon is still running. While holding the shared lock,
# the PID file is read and it is checked whether a process with that PID
# exists.
#
# Raises SystemCallError or IOError if something went wrong while reading
# the PID file.
def running?
  @lock_file.shared_lock { return daemon_is_running? }
end
275
+
276
+ private
277
# Starts the daemon without acquiring the lock file; callers are expected
# to hold the exclusive lock already.
#
# Returns true once the daemon has been spawned, has written its PID file
# and responds to pings.
#
# Raises:
# - AlreadyStarted - the daemon is already running.
# - StartError - the daemon stopped running before it answered a ping, or
#   the start command itself failed.
# - StartTimeout - the start timeout expired, or the log file showed no
#   activity for @log_file_activity_timeout seconds.
def start_without_locking
  if daemon_is_running?
    raise AlreadyStarted, "Daemon '#{@identifier}' is already started"
  end
  save_log_file_information
  delete_pid_file
  begin
    started = false
    before_start
    Timeout.timeout(@start_timeout, Timeout::Error) do
      spawn_daemon
      record_activity

      # We wait until the PID file is available and until the daemon
      # responds to pings, but no longer than @start_timeout seconds in
      # total (including daemon spawn time). While waiting, a log file
      # that stays silent for too long aborts the wait.
      wait_until do
        track_log_file_activity
        pid_file_available?
      end
      wait_until(@ping_interval) do
        track_log_file_activity
        run_ping_command || !daemon_is_running?
      end
      # The wait above also ends when the daemon has exited, so ping once
      # more to find out which of the two happened.
      started = run_ping_command
    end
    result = started
  rescue Timeout::Error
    start_timed_out
    kill_daemon_with_signal if pid_file_available?
    result = :timeout
  end

  if !result
    raise(StartError, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start.")
  elsif result == :timeout
    raise(StartTimeout, differences_in_log_file ||
      "Daemon '#{@identifier}' failed to start in time.")
  else
    return true
  end
end

# Records activity when the log file has changed; otherwise raises
# Timeout::Error if there has been no activity for
# @log_file_activity_timeout seconds, in which case the daemon is assumed
# to have terminated with an error.
def track_log_file_activity
  if log_file_has_changed?
    record_activity
  elsif no_activity?(@log_file_activity_timeout)
    raise Timeout::Error, "Daemon seems to have exited"
  end
end
336
+
337
# Invokes the optional :before_start callback, if one was configured.
def before_start
  @before_start.call if @before_start
end
342
+
343
# Runs the configured start command through #run_command.
def spawn_daemon
  run_command(@start_command)
end
346
+
347
# Asks the daemon to stop: through the stop command when one is configured,
# otherwise by signalling the PID from the PID file. A StartError raised by
# #run_command is re-raised as a StopError with the same message.
def kill_daemon
  return kill_daemon_with_signal unless @stop_command

  begin
    run_command(@stop_command)
  rescue StartError => e
    raise StopError, e.message
  end
end
358
+
359
# Sends SIGTERM to the PID found in the PID file, if there is one.
def kill_daemon_with_signal
  pid = read_pid_file
  Process.kill('SIGTERM', pid) if pid
rescue Errno::ESRCH, Errno::ENOENT
  # No such process, or no PID file: there is nothing to kill.
end
366
+
367
# Tells whether a process with the PID from the PID file exists. A PID
# file that refers to a process which no longer exists is deleted.
def daemon_is_running?
  pid = begin
    read_pid_file
  rescue Errno::ENOENT
    # The PID file may not exist, or another thread/process executing
    # #running? may have just deleted it.
    nil
  end

  return false if pid.nil?
  return true if check_pid(pid)

  delete_pid_file
  false
end
385
+
386
# Reads the PID file and returns its content as an Integer, or nil when
# the stripped content does not consist solely of digits.
def read_pid_file
  contents = File.read(@pid_file).strip
  (contents =~ /\A\d+\Z/) ? contents.to_i : nil
end
394
+
395
# Removes the PID file. Failures caused by missing permissions or by a
# file that doesn't exist are ignored.
def delete_pid_file
  File.delete(@pid_file)
rescue Errno::EPERM, Errno::EACCES, Errno::ENOENT
  # Deliberately ignored.
end
399
+
400
# Tells whether a process with the given PID exists, by sending it
# signal 0.
def check_pid(pid)
  Process.kill(0, pid)
  true
rescue Errno::ESRCH
  false
rescue Errno::EPERM
  # We aren't allowed to signal the process: either it is owned by someone
  # else, or the system doesn't let us signal *any* process. Assume that
  # the process is running.
  true
end
412
+
413
# Calls the block repeatedly until it returns a true value, sleeping
# +sleep_interval+ seconds after every failed attempt.
def wait_until(sleep_interval = 0.1)
  sleep(sleep_interval) until yield
end
418
+
419
# Blocks until the PID file is available or the log file has changed,
# polling every 0.1 seconds. Returns whether the PID file is available.
def wait_until_pid_file_is_available_or_log_file_has_changed
  sleep 0.1 until pid_file_available? || log_file_has_changed?
  # The original called the nonexistent #pid_file_is_available? here,
  # which made this method always end in a NoMethodError.
  pid_file_available?
end
425
+
426
# Blocks until the daemon answers a ping, is no longer running, or the log
# file has changed, sleeping @ping_interval seconds between checks.
# Returns the result of a final ping.
def wait_until_daemon_responds_to_ping_or_has_exited_or_log_file_has_changed
  until run_ping_command || !daemon_is_running? || log_file_has_changed?
    sleep(@ping_interval)
  end
  run_ping_command
end
432
+
433
# Remembers the current time as the moment of the last observed activity.
def record_activity
  @last_activity_time = Time.now
end
436
+
437
# Tells whether more than +seconds+ seconds have passed since the last
# recorded activity.
def no_activity?(seconds)
  idle_time = Time.now - @last_activity_time
  idle_time > seconds
end
441
+
442
# Tells whether the PID file exists and is not empty.
#
# File.size? answers both questions with a single call and returns nil for
# a missing or empty file. The former exist?-then-stat sequence raised
# Errno::ENOENT when the file disappeared between the two calls.
def pid_file_available?
  !File.size?(@pid_file).nil?
end
445
+
446
# Intentionally empty: it exists only as a hook for the unit test.
def start_timed_out; end
449
+
450
# Takes a File.stat snapshot of the log file and stores it as both the
# original and the current state. nil is stored if the file can't be
# stat'ed.
def save_log_file_information
  snapshot = begin
    File.stat(@log_file)
  rescue StandardError
    nil
  end
  @original_log_file_stat = snapshot
  @current_log_file_stat = snapshot
end
454
+
455
# Tells whether the log file's mtime or size differs from the most recent
# snapshot, and makes the fresh stat the new snapshot.
#
# Returns false when no snapshot exists, and true when the log file can no
# longer be stat'ed.
def log_file_has_changed?
  return false unless @current_log_file_stat

  latest = begin
    File.stat(@log_file)
  rescue StandardError
    nil
  end
  return true unless latest

  previous = @current_log_file_stat
  @current_log_file_stat = latest
  previous.mtime != latest.mtime || previous.size != latest.size
end
470
+
471
# Returns the stripped text that was appended to the log file since the
# original snapshot was taken. Returns nil when there is no snapshot, when
# nothing but whitespace was appended, or when the log file doesn't exist.
def differences_in_log_file
  return nil unless @original_log_file_stat

  appended = File.open(@log_file, 'r') do |f|
    # Skip everything that was already in the file at snapshot time.
    f.seek(@original_log_file_stat.size, IO::SEEK_SET)
    f.read.strip
  end
  appended.empty? ? nil : appended
rescue Errno::ENOENT
  nil
end
488
+
489
# Builds the LockFile for this daemon. It is located next to the PID file:
# the PID file's expanded path with a ".lock" suffix. The +identifier+
# argument is not used here.
def determine_lock_file(identifier, pid_file)
  lock_path = File.expand_path(pid_file + ".lock")
  return LockFile.new(lock_path)
end
492
+
493
# Tells whether this Ruby platform is expected to support fork().
#
# JRuby ("java") and Windows builds are excluded. Windows Rubies report
# platform names such as "i386-mswin32" or "x64-mingw32"; the latter does
# not contain "win32", so the MinGW and mswin names are matched explicitly.
def self.fork_supported?
  RUBY_PLATFORM != "java" && RUBY_PLATFORM !~ /win32|mswin|mingw/
end
496
+
497
# Runs +command+ and waits for it to exit, with its output redirected to a
# temporary file.
#
# Raises StartError, with the captured output as message, when the command
# exits with a non-zero status. When a surrounding Timeout block expires
# while waiting, the command is killed and the Timeout::Error is re-raised.
def run_command(command)
  # Create tempfile for storing the command's output.
  tempfile = Tempfile.new('daemon-output')
  tempfile_path = tempfile.path
  # NOTE(review): this makes the output file world-writable; presumably so
  # that a command which switches user can still write to it - confirm.
  File.chmod(0666, tempfile_path)
  tempfile.close

  if self.class.fork_supported? || Process.respond_to?(:spawn)
    if Process.respond_to?(:spawn)
      pid = Process.spawn(command,
        :in => "/dev/null",
        :out => tempfile_path,
        :err => tempfile_path,
        :close_others => true
      )
    else
      pid = safe_fork do
        # Close every inherited IO object before re-opening the standard
        # streams for the command.
        ObjectSpace.each_object(IO) do |obj|
          obj.close rescue nil
        end
        STDIN.reopen("/dev/null", "r")
        STDOUT.reopen(tempfile_path, "w")
        STDERR.reopen(tempfile_path, "w")
        exec(command)
      end
    end

    # run_command might be running in a timeout block (like
    # in #start_without_locking).
    begin
      interruptable_waitpid(pid)
    rescue Errno::ECHILD
      # Maybe a background thread or whatever waitpid()'ed this child
      # process before we had the chance. There's no way to obtain the
      # exit status now. Assume that it started successfully; if it
      # didn't we'll know that later by checking the PID file and by
      # pinging it.
      return
    rescue Timeout::Error
      # If the daemon doesn't fork into the background in time, then
      # kill it and let the caller see the timeout.
      terminate_unresponsive_command(pid)
      raise
    end
    if $?.exitstatus != 0
      raise StartError, File.read(tempfile_path).strip
    end
  else
    cmd = "#{command} >\"#{tempfile_path}\""
    # RUBY_PLATFORM replaces the legacy PLATFORM constant, which newer
    # Ruby versions no longer define (referencing it raised NameError).
    cmd += " 2>\"#{tempfile_path}\"" unless RUBY_PLATFORM =~ /mswin/
    if !system(cmd)
      raise StartError, File.read(tempfile_path).strip
    end
  end
ensure
  File.unlink(tempfile_path) rescue nil
end

# Sends SIGTERM to the process +pid+ and waits up to 5 seconds for it to
# exit; if it still hasn't, sends SIGKILL and waits again. System call
# errors (such as "no such process") are ignored.
def terminate_unresponsive_command(pid)
  begin
    Process.kill('SIGTERM', pid)
  rescue SystemCallError
  end
  begin
    Timeout.timeout(5, Timeout::Error) do
      begin
        interruptable_waitpid(pid)
      rescue SystemCallError
      end
    end
  rescue Timeout::Error
    begin
      Process.kill('SIGKILL', pid)
      interruptable_waitpid(pid)
    rescue SystemCallError
    end
  end
end
572
+
573
# Pings the daemon and returns the ping result.
#
# A ping command that responds to #call is invoked; its return value is
# returned after being closed when it responds to #close (errors raised by
# #close are ignored). One of ALLOWED_CONNECT_EXCEPTIONS raised by the call
# yields false. Any other ping command is run through Kernel#system.
def run_ping_command
  return system(@ping_command) unless @ping_command.respond_to?(:call)

  begin
    reply = @ping_command.call
    if reply.respond_to?(:close)
      begin
        reply.close
      rescue StandardError
        nil
      end
    end
    reply
  rescue *ALLOWED_CONNECT_EXCEPTIONS
    false
  end
end
588
+
589
# Forks a child process that runs the given block, and returns the child's
# PID in the parent.
#
# The child never returns from this method: any exception raised by the
# block is reported on STDERR (class, message, PID and backtrace), and the
# child always terminates through exit!.
def safe_fork
  pid = fork
  return pid unless pid.nil?

  begin
    yield
  rescue Exception => e
    # Catching Exception is deliberate: nothing may propagate out of the
    # child, which exits immediately below.
    message = "*** Exception #{e.class} " <<
      "(#{e}) (process #{$$}):\n" <<
      "\tfrom " << (e.backtrace || []).join("\n\tfrom ") << "\n"
    # The original built this message but then wrote only the bare
    # exception, losing the class name and the backtrace.
    STDERR.write(message)
    STDERR.flush
  ensure
    exit!
  end
end
607
+
608
if RUBY_VERSION < "1.9"
  # Waits for the child process +pid+ to exit and returns its PID.
  def interruptable_waitpid(pid)
    Process.waitpid(pid)
  end
else
  # On Ruby 1.9, Thread#kill (which is called by timeout.rb) may not be
  # able to interrupt Process.waitpid. This variant therefore polls with
  # WNOHANG every 0.01 seconds: a bit less efficient, but interruptable.
  # Returns the PID of the reaped child.
  def interruptable_waitpid(pid)
    loop do
      reaped = Process.waitpid(pid, Process::WNOHANG)
      return reaped if reaped
      sleep 0.01
    end
  end
end
626
+ end