evented_bluepill 0.0.46

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,422 @@
1
+ require "state_machine"
2
+ require "daemons"
3
+
4
+ module Bluepill
5
+ class Process
6
# Names of the per-process settings. Each one receives an accessor on the
# class and may be supplied through the options hash passed to #initialize.
CONFIGURABLE_ATTRIBUTES = [
  # commands used to control the process
  :start_command, :stop_command, :restart_command,

  # standard stream redirection
  :stdout, :stderr, :stdin,

  # how and where the process is launched
  :daemonize, :pid_file, :working_dir, :environment,

  # seconds to wait after each kind of command
  :start_grace_time, :stop_grace_time, :restart_grace_time,

  # user / group handed to the system command helpers
  :uid, :gid,

  # child process monitoring
  :monitor_children, :child_process_template
]
30
+
31
# Read/write state that is not part of the configurable option set.
attr_accessor :name
attr_accessor :watches
attr_accessor :triggers
attr_accessor :logger
attr_accessor :skip_ticks_until

# One accessor pair per configurable setting.
attr_accessor(*CONFIGURABLE_ATTRIBUTES)

# Read-only collaborators created in #initialize.
attr_reader :children
attr_reader :statistics
34
+
35
state_machine :initial => :unmonitored do
  # Idle states: nothing happens until an event (external or internal) fires.
  # :down means we know the process is not running; :unmonitored means we do
  # not care whether it is running.
  state :unmonitored, :up, :down

  # Transitional states: the process is expected to leave them after a while.
  state :starting, :stopping, :restarting

  event :tick do
    # For :stopping this also covers a process that failed to die after the
    # stop was issued: the state is changed to reflect reality.
    [:starting, :up, :stopping].each do |from|
      transition from => :up, :if => :process_running?
      transition from => :down, :unless => :process_running?
    end

    # A process that is down and still not running is started again.
    transition :down => :up, :if => :process_running?
    transition :down => :starting, :unless => :process_running?

    transition :restarting => :up, :if => :process_running?
    transition :restarting => :down, :unless => :process_running?
  end

  event :start do
    transition [:unmonitored, :down] => :starting
  end

  event :stop do
    transition :up => :stopping
  end

  event :unmonitor do
    transition any => :unmonitored
  end

  event :restart do
    transition [:up, :down] => :restarting
  end

  # Triggers hear about every transition before it happens.
  before_transition any => any, :do => :notify_triggers

  # Entering a transitional state runs the matching system command.
  after_transition any => :starting, :do => :start_process
  after_transition any => :stopping, :do => :stop_process
  after_transition any => :restarting, :do => :restart_process

  # Bookkeeping runs after every transition.
  after_transition any => any, :do => :record_transition
end
87
+
88
# Builds a process wrapper.
#
# process_name - stored as the process name.
# options      - Hash; only keys listed in CONFIGURABLE_ATTRIBUTES are read,
#                each one being assigned through its writer.
def initialize(process_name, options = {})
  @name = process_name
  @event_mutex = Monitor.new
  @transition_history = Util::RotationalArray.new(10)
  @watches = []
  @triggers = []
  @children = []
  @statistics = ProcessStatistics.new

  # Defaults; any of these is overridden below when present in options.
  @monitor_children = false
  @start_grace_time = 3
  @stop_grace_time = 3
  @restart_grace_time = 3
  @environment = {}

  CONFIGURABLE_ATTRIBUTES.each do |attribute_name|
    next unless options.has_key?(attribute_name)
    send("#{attribute_name}=", options[attribute_name])
  end

  # Let state_machine do its own initialization; passing no arguments is
  # intentional.
  super()
end
109
+
110
# Runs one monitoring cycle: fires the state machine's tick event and, when
# the process ends up in the up state, runs the watches and (optionally)
# ticks every known child. Does nothing while ticks are being skipped.
def tick
  return if skipping_ticks?
  self.skip_ticks_until = nil

  # The running-check result is memoized per tick; forget the previous one.
  @process_running = nil

  # Run the state machine transitions.
  super

  return unless up?
  run_watches

  return unless monitor_children?
  refresh_children!
  children.each { |child| child.tick }
end
129
+
130
# Stores the logger and hands the same logger to every watch and trigger
# currently registered.
def logger=(logger)
  @logger = logger
  [watches, triggers].each do |listeners|
    listeners.each { |listener| listener.logger = logger }
  end
end
135
+
136
+ # State machine methods
137
# Records the event (with its optional reason) in the statistics and then
# invokes the method named after the event, all while holding the event
# mutex. Returns whatever the event method returns.
def dispatch!(event, reason = nil)
  @event_mutex.synchronize do
    @statistics.record_event(event, reason)
    send(event.to_s)
  end
end
143
+
144
# after_transition callback. Loopback transitions are ignored; any real state
# change flags @transitioned, clears the history of every watch, empties the
# child list when children are monitored, and logs the change.
def record_transition(transition)
  return if transition.loopback?

  @transitioned = true

  # A state change invalidates whatever the watches remembered.
  watches.each { |watch| watch.clear_history! }

  # The child list is emptied on a state change as well.
  if monitor_children?
    logger.warning "Clearing child list"
    children.clear
  end

  logger.info "Going from #{transition.from_name} => #{transition.to_name}"
end
159
+
160
# before_transition callback: passes the transition to every trigger.
def notify_triggers(transition)
  triggers.each do |listener|
    listener.notify(transition)
  end
end
163
+
164
+ # Watch related methods
165
# Appends a ConditionWatch built from the name and options; the process
# logger is merged into the options under :logger.
def add_watch(name, options = {})
  watch_options = options.merge(:logger => logger)
  watches << ConditionWatch.new(name, watch_options)
end
168
+
169
# Looks the trigger class up through Trigger[name], instantiates it with this
# process and the options (process logger merged in under :logger), and
# appends it to the trigger list.
def add_trigger(name, options = {})
  trigger_class = Trigger[name]
  trigger_options = options.merge(:logger => logger)
  triggers << trigger_class.new(self, trigger_options)
end
172
+
173
# Runs every watch in its own thread, gathers the events each one reports
# and dispatches them in watch order. Dispatching stops as soon as one of the
# events has caused a state transition (@transitioned is set by
# #record_transition).
def run_watches
  now = Time.now.to_i

  workers = watches.map do |watch|
    [watch, Thread.new { Thread.current[:events] = watch.run(actual_pid, now) }]
  end

  @transitioned = false

  pending = []
  workers.each do |watch, worker|
    worker.join
    reported = worker[:events]
    next unless reported.size > 0

    logger.info "#{watch.name} dispatched: #{reported.join(',')}"
    reported.each { |event| pending << [event, watch.to_s] }
  end

  pending.each do |event, reason|
    break if @transitioned
    dispatch!(event, reason)
  end
end
196
+
197
# Sets the state directly from a forced liveness check: 'up' when the process
# is running, 'down' otherwise.
def determine_initial_state
  # TODO: or "unmonitored" if bluepill was started in no auto-start mode.
  self.state = process_running?(true) ? 'up' : 'down'
end
205
+
206
# Executes a command issued by the user. Recognised values of cmd are the
# strings "start", "stop", "restart" and "unmonitor"; anything else is
# ignored.
def handle_user_command(cmd)
  case cmd
  when "start"
    if process_running?(true)
      return logger.warning("Refusing to re-run start command on an already running process.")
    end
    dispatch!(:start, "user initiated")
  when "stop"
    # Stop directly, then stop caring about the process.
    stop_process
    dispatch!(:unmonitor, "user initiated")
  when "restart"
    restart_process
  when "unmonitor"
    # Reset the triggers first so that any scheduled events get cleared.
    triggers.each { |trigger| trigger.reset! }
    dispatch!(:unmonitor, "user initiated")
  end
end
226
+
227
+ # System Process Methods
228
# Reports whether the monitored process answers signal 0.
#
# The answer is memoized in @process_running until it is reset to nil (done
# by #tick) or until force is true. A negative answer is memoized as well, so
# the guards of a single tick all see the same result instead of re-probing
# a dead process on every evaluation.
#
# force - when true, discard the memoized answer and probe again.
#
# Returns true or false. Whenever the answer is false the cached pid is
# cleared.
def process_running?(force = false)
  @process_running = nil if force # clear existing state if forced

  # `||=` would re-probe every time the cached answer is false; only probe
  # when nothing has been cached yet.
  @process_running = signal_process(0) if @process_running.nil?

  # The process isn't running, so the pid we hold is stale.
  clear_pid unless @process_running
  @process_running
end
236
+
237
# Runs the start command. With daemonize enabled the command is handed to
# System.daemonize; otherwise it is executed blocking under the start grace
# timeout and a non-zero exit code is logged. Afterwards ticks are skipped
# for the start grace time.
def start_process
  logger.warning "Executing start command: #{start_command}"

  if daemonize?
    System.daemonize(start_command, system_command_options)
  else
    # This is a self-daemonizing process.
    with_timeout(start_grace_time) do
      outcome = System.execute_blocking(start_command, system_command_options)
      next if outcome[:exit_code].zero?

      logger.warning "Start command execution returned non-zero exit code:"
      logger.warning outcome.inspect
    end
  end

  skip_ticks_for(start_grace_time)
end
257
+
258
# Stops the process. A configured stop_command is run (after {{PID}}
# substitution) blocking under the stop grace timeout, logging a non-zero
# exit code; without one a TERM signal is sent. The pid file is then removed
# and ticks are skipped for the stop grace time.
def stop_process
  if stop_command
    command_line = prepare_command(stop_command)
    logger.warning "Executing stop command: #{command_line}"

    with_timeout(stop_grace_time) do
      outcome = System.execute_blocking(command_line, system_command_options)
      next if outcome[:exit_code].zero?

      logger.warning "Stop command execution returned non-zero exit code:"
      logger.warning outcome.inspect
    end
  else
    logger.warning "Executing default stop command. Sending TERM signal to #{actual_pid}"
    signal_process("TERM")
  end

  # TODO: we only write the pid file if we daemonize, should we only unlink it if we daemonize?
  unlink_pid

  skip_ticks_for(stop_grace_time)
end
280
+
281
# Restarts the process. A configured restart_command is run (after {{PID}}
# substitution) blocking under the restart grace timeout, logging a non-zero
# exit code, and ticks are skipped for the restart grace time. Without one
# the process is only stopped; a later tick brings it back.
def restart_process
  unless restart_command
    logger.warning "No restart_command specified. Must stop and start to restart"
    # The tick will bring it back.
    return stop_process
  end

  command_line = prepare_command(restart_command)
  logger.warning "Executing restart command: #{command_line}"

  with_timeout(restart_grace_time) do
    outcome = System.execute_blocking(command_line, system_command_options)
    next if outcome[:exit_code].zero?

    logger.warning "Restart command execution returned non-zero exit code:"
    logger.warning outcome.inspect
  end

  skip_ticks_for(restart_grace_time)
end
303
+
304
# Strict boolean view of the daemonize setting.
def daemonize?
  daemonize ? true : false
end
307
+
308
# Strict boolean view of the monitor_children setting.
def monitor_children?
  monitor_children ? true : false
end
311
+
312
# Sends the signal to the pid reported by #actual_pid.
#
# Returns true when Process.kill succeeds and false when it raises any
# StandardError.
def signal_process(code)
  begin
    ::Process.kill(code, actual_pid)
  rescue
    return false
  end
  true
end
318
+
319
# Returns the pid of the monitored process, or nil when it is unknown.
#
# A pid assigned through #actual_pid= (or read earlier) is returned as is.
# Otherwise the pid is read from pid_file, when one is configured:
#   * a missing file logs a warning and yields nil;
#   * content that does not parse to a positive integer yields nil. This
#     matters because pid 0 and negative pids address process groups (or
#     every process) when passed to Process.kill, so they must never be
#     treated as "the" process.
# A nil result is not memoized, so the file is consulted again next time.
def actual_pid
  @actual_pid ||= begin
    if pid_file
      # File.exist? is used because File.exists? no longer exists in
      # current Ruby versions.
      if File.exist?(pid_file)
        pid = File.read(pid_file).to_i
        pid if pid > 0
      else
        logger.warning("pid_file #{pid_file} does not exist or cannot be read")
        nil
      end
    end
  end
end
332
+
333
# Overrides the cached pid; #actual_pid returns this value instead of
# reading the pid file for as long as it is non-nil.
def actual_pid=(pid)
  instance_variable_set(:@actual_pid, pid)
end
336
+
337
# Forgets the cached pid so that the next #actual_pid call looks it up again.
def clear_pid
  self.actual_pid = nil
end
340
+
341
# Deletes the pid file when one is configured and present.
#
# File.exist? is used because File.exists? no longer exists in current Ruby
# versions. The file may disappear between the check and the unlink (for
# example when the stopped process removes its own pid file); that is
# treated the same as "already gone".
#
# Returns nil when there was nothing to delete.
def unlink_pid
  return unless pid_file

  File.unlink(pid_file) if File.exist?(pid_file)
rescue Errno::ENOENT
  nil
end
344
+
345
+ # Internal State Methods
346
# Extends the tick-skipping deadline by the given number of seconds
# (converted with to_i). The extension is added to the existing deadline
# when there is one and to the current time otherwise.
def skip_ticks_for(seconds)
  # TODO: should this be additive or longest wins?
  # i.e. if two calls for skip_ticks_for come in for 5 and 10, should it skip for 10 or 15?
  starting_point = skip_ticks_until || Time.now.to_i
  self.skip_ticks_until = starting_point + seconds.to_i
end
351
+
352
# Truthy while a skip deadline is set and still lies in the future.
def skipping_ticks?
  deadline = skip_ticks_until
  deadline && deadline > Time.now.to_i
end
355
+
356
# Brings the child list in line with what System.get_children reports for
# this process: children that are no longer running are dropped, and every
# newly found pid gets a wrapper copied from child_process_template that is
# placed directly in the "up" state.
def refresh_children!
  # Drop the children that have died.
  @children.reject! { |child| !child.process_running?(true) }

  # Whatever the system reports beyond the known pids is new.
  known_pids = @children.map { |child| child.actual_pid }
  new_children_pids = System.get_children(actual_pid) - known_pids

  unless new_children_pids.empty?
    logger.info "Existing children: #{known_pids.join(",")}. Got new children: #{new_children_pids.inspect} for #{actual_pid}"
  end

  # Wrap every newly found child.
  new_children_pids.each do |child_pid|
    wrapper = child_process_template.deep_copy

    wrapper.name = "<child(pid:#{child_pid})>"
    wrapper.actual_pid = child_pid
    wrapper.logger = logger.prefix_with(wrapper.name)

    wrapper.initialize_state_machines
    wrapper.state = "up"

    @children << wrapper
  end
end
381
+
382
# Returns an independent copy of this process, produced by a Marshal round
# trip.
#
# The event mutex and the triggers are detached while marshalling: the copy
# receives a fresh Monitor, and its triggers come from calling deep_copy on
# each of this process's triggers. The originals are put back in an `ensure`
# block, so a failing Marshal.dump cannot leave this object without its
# mutex and triggers.
#
# Raises whatever Marshal.dump / Marshal.load raise (for example TypeError
# for state that cannot be dumped).
def deep_copy
  # TODO: This is a kludge. Ideally, process templates
  # would be factories, and not a template object.
  mutex, triggers = @event_mutex, @triggers
  @event_mutex = nil
  @triggers = nil

  begin
    copy = Marshal.load(Marshal.dump(self))
  ensure
    @event_mutex = mutex
    @triggers = triggers
  end

  copy.instance_variable_set("@event_mutex", Monitor.new)
  copy.instance_variable_set("@triggers", triggers.collect { |t| t.deep_copy })
  copy
end
393
+
394
# Returns the command as a string with every {{PID}} placeholder replaced by
# the current pid (an empty string when the pid is unknown).
def prepare_command(command)
  pid_text = actual_pid.to_s
  command.to_s.gsub("{{PID}}", pid_text)
end
397
+
398
# Collects the settings that are passed to the System command helpers into a
# Hash keyed by setting name.
def system_command_options
  forwarded = [:uid, :gid, :working_dir, :environment, :pid_file, :logger, :stdin, :stdout, :stderr]

  options = {}
  forwarded.each { |setting| options[setting] = send(setting) }
  options
end
411
+
412
# Runs the block under Timeout.timeout(secs.to_f) and returns its result.
# When the timeout fires, two error lines are logged and the process is
# unmonitored through dispatch!.
def with_timeout(secs, &blk)
  begin
    Timeout.timeout(secs.to_f, &blk)
  rescue Timeout::Error
    logger.err("Execution is taking longer than expected. Unmonitoring.")
    logger.err("Did you forget to tell bluepill to daemonize this process?")
    dispatch!("unmonitor")
  end
end
420
+ end
421
+ end
422
+
@@ -0,0 +1,17 @@
1
module Bluepill
  module ProcessConditions
    # Condition whose measurement is the constant 1 and whose check always
    # passes.
    class AlwaysTrue < ProcessCondition
      # options - Hash; :below is stored but not consulted by this class.
      def initialize(options = {})
        @below = options[:below]
      end

      # Returns 1 regardless of the pid.
      def run(pid)
        1
      end

      # Returns true regardless of the value.
      def check(value)
        true
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Bluepill
  module ProcessConditions
    # Condition that passes while the CPU usage reported for the process
    # stays below a threshold.
    class CpuUsage < ProcessCondition
      # options - Hash; :below is the threshold the measurement is compared
      #           against.
      def initialize(options = {})
        @below = options[:below]
      end

      # Returns System.cpu_usage(pid) converted to a Float.
      # NOTE(review): the original comment says this is the third column of
      # `ps axu` output - confirm against System.cpu_usage.
      def run(pid)
        System.cpu_usage(pid).to_f
      end

      # True when the measured value is strictly below the threshold.
      def check(value)
        value < @below
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module Bluepill
  module ProcessConditions
    # Condition that performs an HTTP GET and passes when the response is of
    # the expected kind and, optionally, its body matches a pattern.
    class Http < ProcessCondition
      # options - Hash:
      #   :url          - URL to request (parsed with URI.parse).
      #   :kind         - expected response: an Integer status code, or a
      #                   String/Symbol naming a Net::HTTP* response class;
      #                   anything else means Net::HTTPSuccess.
      #   :pattern      - optional object matched against the body with ===.
      #   :open_timeout,
      #   :read_timeout - seconds; each falls back to :timeout, then to 5.
      def initialize(options = {})
        @uri = URI.parse(options[:url])
        @kind = case options[:kind]
                # Integer rather than Fixnum: Fixnum no longer exists in
                # current Ruby versions, and every Fixnum is an Integer.
                when Integer then Net::HTTPResponse::CODE_TO_OBJ[options[:kind].to_s]
                when String, Symbol then Net.const_get("HTTP#{options[:kind].to_s.camelize}")
                else
                  Net::HTTPSuccess
                end
        @pattern = options[:pattern]
        @open_timeout = (options[:open_timeout] || options[:timeout] || 5).to_i
        @read_timeout = (options[:read_timeout] || options[:timeout] || 5).to_i
      end

      # Performs the request. The pid is not used.
      #
      # Returns the Net::HTTPResponse, or the raised exception object when
      # the request fails (which #check then rejects because it is not a
      # response of the expected kind).
      def run(pid)
        session = Net::HTTP.new(@uri.host, @uri.port)
        session.open_timeout = @open_timeout
        session.read_timeout = @read_timeout
        hide_net_http_bug do
          session.start do |http|
            # request_uri keeps the query string and is "/" for a URL
            # without a path, where the bare path would be empty.
            http.get(@uri.request_uri)
          end
        end
      rescue => error
        error
      end

      # True when value is a response of the expected kind and - if a
      # pattern was given - the response class permits a body and the body
      # matches the pattern.
      def check(value)
        return false unless value.kind_of?(@kind)
        return true unless @pattern
        return false unless value.class.body_permitted?
        @pattern === value.body
      end

      private

      # Translates the NoMethodError that Net::HTTP raises on a refused
      # connection ("undefined method `closed?' for nil:NilClass") into
      # Errno::ECONNREFUSED; any other NoMethodError is re-raised untouched.
      def hide_net_http_bug
        yield
      rescue NoMethodError => e
        if e.to_s =~ /#{Regexp.escape(%q|undefined method `closed?' for nil:NilClass|)}/
          raise Errno::ECONNREFUSED, "Connection refused attempting to contact #{@uri.scheme}://#{@uri.host}:#{@uri.port}"
        else
          raise
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Bluepill
  module ProcessConditions
    # Condition that passes while the memory usage reported for the process
    # stays below a threshold.
    class MemUsage < ProcessCondition
      MB = 1024 ** 2
      FORMAT_STR = "%d%s"
      MB_LABEL = "MB"
      KB_LABEL = "KB"

      # options - Hash; :below is the threshold that value.kilobytes is
      #           compared against (presumably a byte count - confirm).
      def initialize(options = {})
        @below = options[:below]
      end

      # Returns System.memory_usage(pid) converted to a Float.
      # NOTE(review): the original comment says RSS is the fifth column of
      # the underlying output - confirm against System.memory_usage.
      def run(pid)
        System.memory_usage(pid).to_f
      end

      # True when the measurement, scaled with #kilobytes, is strictly below
      # the threshold.
      def check(value)
        value.kilobytes < @below
      end

      # Renders the measurement with a KB label, switching to whole MB
      # (value / 1024, rounded) once value.kilobytes reaches MB.
      def format_value(value)
        return FORMAT_STR % [value, KB_LABEL] unless value.kilobytes >= MB

        FORMAT_STR % [(value / 1024).round, MB_LABEL]
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Bluepill
  module ProcessConditions
    # Base class for process conditions. Subclasses provide #run (take a
    # measurement for a pid) and #check (judge a measurement).
    class ProcessCondition
      # Keeps the options hash for later use.
      def initialize(options = {})
        @options = options
      end

      # Must be overridden; the base implementation always raises.
      def run(pid)
        raise RuntimeError, "Implement in subclass!"
      end

      # Must be overridden; the base implementation always raises.
      def check(value)
        raise RuntimeError, "Implement in subclass!"
      end

      # Presentation hook; the base implementation returns the value as is.
      def format_value(value)
        value
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Bluepill
  module ProcessConditions
    class << self
      # Resolves a condition name to the constant of the same name in this
      # module, after converting the name with String#camelcase.
      def [](name)
        const_get(name.to_s.camelcase)
      end
    end
  end
end
8
+
9
+ require "bluepill/process_conditions/process_condition"
10
+ Dir["#{File.dirname(__FILE__)}/process_conditions/*.rb"].each do |pc|
11
+ require pc
12
+ end
13
+
@@ -0,0 +1,24 @@
1
module Bluepill
  # Keeps a short history of the events dispatched to a process and renders
  # it as text.
  class ProcessStatistics
    # month/day/year hour:minute:second. Minutes are %M; %I would repeat the
    # hour on a 12-hour clock.
    STRFTIME = "%m/%d/%Y %H:%M:%S"

    # possibly persist this data.
    def initialize
      @events = Util::RotationalArray.new(10)
    end

    # Stores the event together with its reason and the current time.
    def record_event(event, reason)
      @events.push([event, reason, Time.now])
    end

    # Returns the history as text: an "event history:" header followed by
    # one line per event, most recently recorded first. A missing reason is
    # shown as "unspecified". Returns an empty string when nothing has been
    # recorded.
    def to_s
      lines = []
      @events.each do |(event, reason, time)|
        lines << " #{event} at #{time.strftime(STRFTIME)} - #{reason || "unspecified"}"
      end
      # The header is appended last because the list is reversed below.
      lines << "event history:" unless lines.empty?
      lines.reverse.join("\n")
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'socket'
2
+
3
module Bluepill
  # Helpers for the UNIX domain socket used to talk to a running daemon.
  module Socket
    # Seconds a client waits for the server's answer.
    TIMEOUT = 10

    extend self

    # Opens a client connection to the socket of the named application and
    # passes the block on to UNIXSocket.open.
    def client(base_dir, name, &b)
      UNIXSocket.open(socket_path(base_dir, name), &b)
    end

    # Sends the command over a client connection and returns the
    # unmarshalled reply. Aborts the program when the reply does not arrive
    # within TIMEOUT seconds or the stream ends early.
    def client_command(base_dir, name, command)
      client(base_dir, name) do |connection|
        Timeout.timeout(TIMEOUT) do
          connection.puts command
          Marshal.load(connection)
        end
      end
    rescue EOFError, Timeout::Error
      abort("Socket Timeout: Server may not be responding")
    end

    # Opens the server socket of the named application.
    #
    # When the address is already in use, a connection attempt decides what
    # to do: a refused connection means the socket file is stale, so it is
    # deleted and the server socket is opened again; a successful connection
    # means a server is already running, which is reported before exiting
    # with status 7.
    def server(base_dir, name)
      path = socket_path(base_dir, name)
      UNIXServer.open(path)
    rescue Errno::EADDRINUSE
      # The sock file has been created; test to see if there is a server.
      begin
        UNIXSocket.open(path)
      rescue Errno::ECONNREFUSED
        File.delete(path)
        UNIXServer.open(path)
      else
        # NOTE(review): no `logger` method is defined in the visible part of
        # this module - confirm it is provided elsewhere.
        logger.err("Server is already running!")
        exit(7)
      end
    end

    # Path of the socket file: <base_dir>/socks/<name>.sock
    def socket_path(base_dir, name)
      File.join(base_dir, 'socks', name + ".sock")
    end
  end
end
47
+