evented_bluepill 0.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,422 @@
1
+ require "state_machine"
2
+ require "daemons"
3
+
4
module Bluepill
  # Wraps one monitored operating-system process: its configuration, a state
  # machine describing whether it is up/down/transitioning, the watches and
  # triggers attached to it, and the commands used to start/stop/restart it.
  class Process
    # Attributes that may be supplied through the options hash given to #new.
    CONFIGURABLE_ATTRIBUTES = [
      :start_command,
      :stop_command,
      :restart_command,

      :stdout,
      :stderr,
      :stdin,

      :daemonize,
      :pid_file,
      :working_dir,
      :environment,

      :start_grace_time,
      :stop_grace_time,
      :restart_grace_time,

      :uid,
      :gid,

      :monitor_children,
      :child_process_template
    ]

    attr_accessor :name, :watches, :triggers, :logger, :skip_ticks_until
    attr_accessor(*CONFIGURABLE_ATTRIBUTES)
    attr_reader :children, :statistics

    state_machine :initial => :unmonitored do
      # These are the idle states, i.e. only an event (either external or internal) will trigger a transition.
      # The distinction between down and unmonitored is that down
      # means we know it is not running and unmonitored is that we don't care if it's running.
      state :unmonitored, :up, :down

      # These are transitionary states, we expect the process to change state after a certain period of time.
      state :starting, :stopping, :restarting

      event :tick do
        transition :starting => :up, :if => :process_running?
        transition :starting => :down, :unless => :process_running?

        transition :up => :up, :if => :process_running?
        transition :up => :down, :unless => :process_running?

        # The process failed to die after entering the stopping state. Change the state to reflect
        # reality.
        transition :stopping => :up, :if => :process_running?
        transition :stopping => :down, :unless => :process_running?

        transition :down => :up, :if => :process_running?
        transition :down => :starting, :unless => :process_running?

        transition :restarting => :up, :if => :process_running?
        transition :restarting => :down, :unless => :process_running?
      end

      event :start do
        transition [:unmonitored, :down] => :starting
      end

      event :stop do
        transition :up => :stopping
      end

      event :unmonitor do
        transition any => :unmonitored
      end

      event :restart do
        transition [:up, :down] => :restarting
      end

      before_transition any => any, :do => :notify_triggers

      after_transition any => :starting, :do => :start_process
      after_transition any => :stopping, :do => :stop_process
      after_transition any => :restarting, :do => :restart_process

      after_transition any => any, :do => :record_transition
    end

    # process_name - name stored in #name.
    # options      - hash keyed by CONFIGURABLE_ATTRIBUTES; only keys that are
    #                present are assigned, everything else keeps its default.
    def initialize(process_name, options = {})
      @name = process_name
      @event_mutex = Monitor.new
      @transition_history = Util::RotationalArray.new(10)
      @watches = []
      @triggers = []
      @children = []
      @statistics = ProcessStatistics.new

      # These defaults are overridden below if it's configured to be something else.
      @monitor_children = false
      @start_grace_time = @stop_grace_time = @restart_grace_time = 3
      @environment = {}

      CONFIGURABLE_ATTRIBUTES.each do |attribute_name|
        self.send("#{attribute_name}=", options[attribute_name]) if options.has_key?(attribute_name)
      end

      # Let state_machine do its initialization stuff
      super() # no arguments intentional
    end

    # Runs one monitoring cycle: fires the state machine's tick event, then,
    # if the process is up, runs the watches and (when enabled) ticks the
    # child processes. Does nothing while ticks are being skipped.
    def tick
      return if self.skipping_ticks?
      self.skip_ticks_until = nil

      # clear the memoization per tick
      @process_running = nil

      # run state machine transitions
      super

      if self.up?
        self.run_watches

        if self.monitor_children?
          refresh_children!
          children.each { |child| child.tick }
        end
      end
    end

    # Sets the logger on this process and on every watch and trigger that is
    # already attached to it.
    def logger=(logger)
      @logger = logger
      self.watches.each { |w| w.logger = logger }
      self.triggers.each { |t| t.logger = logger }
    end

    # State machine methods

    # Records the event in the statistics and then fires it on the state
    # machine, all while holding the event monitor.
    #
    # event  - event name (symbol or string), e.g. :start.
    # reason - optional description stored with the statistics entry.
    def dispatch!(event, reason = nil)
      @event_mutex.synchronize do
        @statistics.record_event(event, reason)
        self.send("#{event}")
      end
    end

    # after_transition callback. Ignores loopback transitions; for a real
    # state change it flags the change for #run_watches, clears the history
    # of every watch and empties the child list when children are monitored.
    def record_transition(transition)
      unless transition.loopback?
        @transitioned = true

        # When a process changes state, we should clear the memory of all the watches
        self.watches.each { |w| w.clear_history! }

        # Also, when a process changes state, we should re-populate its child list
        if self.monitor_children?
          self.logger.warning "Clearing child list"
          self.children.clear
        end
        logger.info "Going from #{transition.from_name} => #{transition.to_name}"
      end
    end

    # before_transition callback: forwards the transition to every trigger.
    def notify_triggers(transition)
      self.triggers.each { |trigger| trigger.notify(transition) }
    end

    # Watch related methods

    # Attaches a ConditionWatch built from name/options, sharing this
    # process's logger.
    def add_watch(name, options = {})
      self.watches << ConditionWatch.new(name, options.merge(:logger => self.logger))
    end

    # Attaches a trigger looked up through Trigger[name], sharing this
    # process's logger.
    def add_trigger(name, options = {})
      self.triggers << Trigger[name].new(self, options.merge(:logger => self.logger))
    end

    # Runs every watch in its own thread, gathers the events they return and
    # dispatches them in watch order. Dispatching stops as soon as one of the
    # events has caused a real state transition (see #record_transition).
    def run_watches
      now = Time.now.to_i

      threads = self.watches.collect do |watch|
        [watch, Thread.new { Thread.current[:events] = watch.run(self.actual_pid, now) }]
      end

      @transitioned = false

      threads.inject([]) do |events, (watch, thread)|
        thread.join
        if thread[:events].size > 0
          logger.info "#{watch.name} dispatched: #{thread[:events].join(',')}"
          thread[:events].each do |event|
            events << [event, watch.to_s]
          end
        end
        events
      end.each do |(event, reason)|
        break if @transitioned
        self.dispatch!(event, reason)
      end
    end

    # Sets the state directly to 'up' or 'down' from a fresh liveness check,
    # without firing any state machine event.
    def determine_initial_state
      if self.process_running?(true)
        self.state = 'up'
      else
        # TODO: or "unmonitored" if bluepill was started in no auto-start mode.
        self.state = 'down'
      end
    end

    # Handles a command string ("start", "stop", "restart", "unmonitor").
    # Any other value is ignored.
    def handle_user_command(cmd)
      case cmd
      when "start"
        if self.process_running?(true)
          logger.warning("Refusing to re-run start command on an already running process.")
        else
          dispatch!(:start, "user initiated")
        end
      when "stop"
        stop_process
        dispatch!(:unmonitor, "user initiated")
      when "restart"
        restart_process
      when "unmonitor"
        # When the user issues an unmonitor cmd, reset any triggers so that
        # scheduled events gets cleared
        triggers.each { |t| t.reset! }
        dispatch!(:unmonitor, "user initiated")
      end
    end

    # System Process Methods

    # Returns true when signal 0 can be delivered to the process.
    #
    # force - when true, the cached answer is discarded first.
    #
    # Only a truthy answer stays cached: `||=` evaluates the signal check
    # again whenever the stored value is nil or false. A negative answer also
    # clears the cached pid, so the next check re-reads the pid file.
    def process_running?(force = false)
      @process_running = nil if force # clear existing state if forced

      @process_running ||= signal_process(0)
      # the process isn't running, so we should clear the PID
      self.clear_pid unless @process_running
      @process_running
    end

    # Runs the start command (daemonized by bluepill, or executed directly
    # under the start grace timeout) and then skips ticks for the grace time.
    def start_process
      logger.warning "Executing start command: #{start_command}"

      if self.daemonize?
        System.daemonize(start_command, self.system_command_options)

      else
        # This is a self-daemonizing process
        with_timeout(start_grace_time) do
          result = System.execute_blocking(start_command, self.system_command_options)

          unless result[:exit_code].zero?
            logger.warning "Start command execution returned non-zero exit code:"
            logger.warning result.inspect
          end
        end
      end

      self.skip_ticks_for(start_grace_time)
    end

    # Runs the configured stop command under the stop grace timeout, or sends
    # TERM when no stop command is configured; then removes the pid file and
    # skips ticks for the grace time.
    def stop_process
      if stop_command
        cmd = self.prepare_command(stop_command)
        logger.warning "Executing stop command: #{cmd}"

        with_timeout(stop_grace_time) do
          result = System.execute_blocking(cmd, self.system_command_options)

          unless result[:exit_code].zero?
            logger.warning "Stop command execution returned non-zero exit code:"
            logger.warning result.inspect
          end
        end

      else
        logger.warning "Executing default stop command. Sending TERM signal to #{actual_pid}"
        signal_process("TERM")
      end
      self.unlink_pid # TODO: we only write the pid file if we daemonize, should we only unlink it if we daemonize?

      self.skip_ticks_for(stop_grace_time)
    end

    # Runs the configured restart command under the restart grace timeout.
    # Without a restart command the process is only stopped; a later tick is
    # expected to start it again.
    def restart_process
      if restart_command
        cmd = self.prepare_command(restart_command)

        logger.warning "Executing restart command: #{cmd}"

        with_timeout(restart_grace_time) do
          result = System.execute_blocking(cmd, self.system_command_options)

          unless result[:exit_code].zero?
            logger.warning "Restart command execution returned non-zero exit code:"
            logger.warning result.inspect
          end
        end

        self.skip_ticks_for(restart_grace_time)
      else
        logger.warning "No restart_command specified. Must stop and start to restart"
        self.stop_process
        # the tick will bring it back.
      end
    end

    # Boolean form of the daemonize setting.
    def daemonize?
      !!self.daemonize
    end

    # Boolean form of the monitor_children setting.
    def monitor_children?
      !!self.monitor_children
    end

    # Sends the given signal to the process. Returns true on success and
    # false when Process.kill raises (including when no pid is known).
    def signal_process(code)
      ::Process.kill(code, actual_pid)
      true
    rescue
      false
    end

    # Returns the pid of the process, reading it from pid_file when it is not
    # cached. Returns nil when there is no pid file, or when it does not
    # contain a positive integer.
    def actual_pid
      @actual_pid ||= begin
        if pid_file
          if File.exist?(pid_file)
            pid = File.read(pid_file).to_i
            # A blank or non-numeric pid file parses as 0. Pid 0 (and negative
            # pids) address whole process groups in Process.kill, so they must
            # never be handed to signal_process as if they were our process.
            pid if pid > 0
          else
            logger.warning("pid_file #{pid_file} does not exist or cannot be read")
            nil
          end
        end
      end
    end

    # Overrides the cached pid.
    def actual_pid=(pid)
      @actual_pid = pid
    end

    # Forgets the cached pid so the next #actual_pid call re-reads the file.
    def clear_pid
      @actual_pid = nil
    end

    # Deletes the pid file if one is configured and present.
    def unlink_pid
      File.unlink(pid_file) if pid_file && File.exist?(pid_file)
    end

    # Internal State Methods

    # Extends the period during which #tick is a no-op by the given number of
    # seconds, starting from the current deadline (or from now if none).
    def skip_ticks_for(seconds)
      # TODO: should this be additive or longest wins?
      #       i.e. if two calls for skip_ticks_for come in for 5 and 10, should it skip for 10 or 15?
      self.skip_ticks_until = (self.skip_ticks_until || Time.now.to_i) + seconds.to_i
    end

    # True while the skip deadline lies in the future.
    def skipping_ticks?
      self.skip_ticks_until && self.skip_ticks_until > Time.now.to_i
    end

    # Drops children that are no longer running and wraps every newly found
    # child pid in a copy of child_process_template, marked as "up".
    def refresh_children!
      # First prune the list of dead children
      @children.delete_if { |child| !child.process_running?(true) }

      # Add new found children to the list
      new_children_pids = System.get_children(self.actual_pid) - @children.map { |child| child.actual_pid }

      unless new_children_pids.empty?
        logger.info "Existing children: #{@children.collect{|c| c.actual_pid}.join(",")}. Got new children: #{new_children_pids.inspect} for #{actual_pid}"
      end

      # Construct a new process wrapper for each new found children
      new_children_pids.each do |child_pid|
        child = self.child_process_template.deep_copy

        child.name = "<child(pid:#{child_pid})>"
        child.actual_pid = child_pid
        child.logger = self.logger.prefix_with(child.name)

        child.initialize_state_machines
        child.state = "up"

        @children << child
      end
    end

    # Returns an independent copy of this process with its own monitor and
    # its own copies of the triggers.
    def deep_copy
      # TODO: This is a kludge. Ideally, process templates
      # would be factories, and not a template object.
      mutex, triggers, @event_mutex, @triggers = @event_mutex, @triggers, nil, nil
      begin
        clone = Marshal.load(Marshal.dump(self))
        clone.instance_variable_set("@event_mutex", Monitor.new)
        # NOTE(review): the trigger copies are made while @event_mutex and
        # @triggers are still nil, exactly as before — presumably because the
        # triggers reference this process; confirm before reordering.
        clone.instance_variable_set("@triggers", triggers.collect { |t| t.deep_copy })
        clone
      ensure
        # Always put the template back together, even if marshalling failed.
        @event_mutex = mutex
        @triggers = triggers
      end
    end

    # Substitutes the current pid for every {{PID}} placeholder in command.
    def prepare_command(command)
      command.to_s.gsub("{{PID}}", actual_pid.to_s)
    end

    # Options hash handed to the System helpers when running commands.
    def system_command_options
      {
        :uid         => self.uid,
        :gid         => self.gid,
        :working_dir => self.working_dir,
        :environment => self.environment,
        :pid_file    => self.pid_file,
        :logger      => self.logger,
        :stdin       => self.stdin,
        :stdout      => self.stdout,
        :stderr      => self.stderr
      }
    end

    # Runs the block with a timeout of secs seconds. On timeout the failure
    # is logged and the process is unmonitored.
    def with_timeout(secs, &blk)
      Timeout.timeout(secs.to_f, &blk)

    rescue Timeout::Error
      logger.err "Execution is taking longer than expected. Unmonitoring."
      logger.err "Did you forget to tell bluepill to daemonize this process?"
      self.dispatch!("unmonitor")
    end
  end
end
422
+
@@ -0,0 +1,17 @@
1
module Bluepill
  module ProcessConditions
    # Condition that ignores the process entirely: #run always yields 1 and
    # #check always passes.
    class AlwaysTrue < ProcessCondition
      # options - hash; only :below is read, and it is stored but not
      #           consulted by #run or #check.
      def initialize(options = {})
        @below = options[:below]
      end

      # Returns the constant sample value 1; pid is not used.
      def run(pid)
        1
      end

      # Returns true for any value.
      def check(value)
        true
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Bluepill
  module ProcessConditions
    # Condition on the CPU usage reported by System.cpu_usage for a pid.
    class CpuUsage < ProcessCondition
      # options - hash; :below is the threshold a sample must stay under.
      def initialize(options = {})
        @below = options[:below]
      end

      # Samples the CPU usage of pid and returns it as a Float.
      def run(pid)
        # third col in the ps axu output
        usage = System.cpu_usage(pid)
        usage.to_f
      end

      # True when the sampled value is under the configured threshold.
      def check(value)
        value < @below
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module Bluepill
  module ProcessConditions
    # Condition that performs an HTTP GET and checks the kind of response
    # (and optionally its body) that comes back.
    class Http < ProcessCondition
      # options - hash:
      #   :url          - URL to request (required).
      #   :kind         - expected response: an Integer status code, or a
      #                   String/Symbol naming a Net::HTTP* class; defaults
      #                   to Net::HTTPSuccess.
      #   :pattern      - optional object matched against the body with ===.
      #   :open_timeout / :read_timeout / :timeout - seconds, default 5.
      def initialize(options = {})
        @uri = URI.parse(options[:url])
        @kind = case options[:kind]
                # Integer covers Fixnum on old Rubies and is the only integer
                # class on Rubies where the Fixnum constant no longer exists.
                when Integer then Net::HTTPResponse::CODE_TO_OBJ[options[:kind].to_s]
                when String, Symbol then Net.const_get("HTTP#{options[:kind].to_s.camelize}")
                else
                  Net::HTTPSuccess
                end
        @pattern = options[:pattern]
        @open_timeout = (options[:open_timeout] || options[:timeout] || 5).to_i
        @read_timeout = (options[:read_timeout] || options[:timeout] || 5).to_i
      end

      # Performs the GET request. Returns the response object, or the raised
      # exception when the request fails (so that #check can reject it).
      # pid is not used.
      def run(pid)
        session = Net::HTTP.new(@uri.host, @uri.port)
        session.open_timeout = @open_timeout
        session.read_timeout = @read_timeout
        hide_net_http_bug do
          session.start do |http|
            http.get(request_path)
          end
        end
      rescue => e
        e
      end

      # True when value is a response of the expected kind and, if a pattern
      # was configured, the response may carry a body that matches it.
      def check(value)
        return false unless value.kind_of?(@kind)
        return true unless @pattern
        return false unless value.class.body_permitted?
        @pattern === value.body
      end

      private

      # Path to request, including the query string. Falls back to "/" when
      # the URL has no path, because Net::HTTP rejects an empty request path.
      def request_path
        return @uri.request_uri if @uri.respond_to?(:request_uri)
        @uri.path.to_s.empty? ? "/" : @uri.path
      end

      # Translates the NoMethodError that Net::HTTP raises for `closed?` on
      # nil into Errno::ECONNREFUSED; any other NoMethodError is re-raised.
      def hide_net_http_bug
        yield
      rescue NoMethodError => e
        if e.to_s =~ /#{Regexp.escape(%q|undefined method `closed?' for nil:NilClass|)}/
          raise Errno::ECONNREFUSED, "Connection refused attempting to contact #{@uri.scheme}://#{@uri.host}:#{@uri.port}"
        else
          raise
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Bluepill
  module ProcessConditions
    # Condition on the memory usage reported by System.memory_usage for a pid.
    class MemUsage < ProcessCondition
      MB = 1024 ** 2
      FORMAT_STR = "%d%s"
      MB_LABEL = "MB"
      KB_LABEL = "KB"

      # options - hash; :below is the threshold a sample must stay under.
      def initialize(options = {})
        @below = options[:below]
      end

      # Samples the memory usage of pid and returns it as a Float.
      def run(pid)
        # rss is on the 5th col
        usage = System.memory_usage(pid)
        usage.to_f
      end

      # True when the sample, converted with #kilobytes, is under the
      # configured threshold.
      def check(value)
        value.kilobytes < @below
      end

      # Renders the sample with a KB label, or as rounded MB once it reaches
      # one megabyte.
      def format_value(value)
        return FORMAT_STR % [value, KB_LABEL] unless value.kilobytes >= MB

        megabytes = (value / 1024).round
        FORMAT_STR % [megabytes, MB_LABEL]
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Bluepill
  module ProcessConditions
    # Base class for process conditions. Subclasses supply #run (take a
    # sample for a pid) and #check (decide whether a sample is acceptable).
    class ProcessCondition
      # options - hash kept as-is in @options.
      def initialize(options = {})
        @options = options
      end

      # Must be overridden; raises RuntimeError here.
      def run(pid)
        raise RuntimeError, "Implement in subclass!"
      end

      # Must be overridden; raises RuntimeError here.
      def check(value)
        raise RuntimeError, "Implement in subclass!"
      end

      # Display form of a sample; the base implementation returns it unchanged.
      def format_value(value)
        value
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Bluepill
  module ProcessConditions
    # Looks up a condition class in this module by name: the name is
    # stringified, converted with String#camelcase and resolved as a constant.
    def self.[](name)
      class_name = name.to_s.camelcase
      const_get(class_name)
    end
  end
end
8
+
9
+ require "bluepill/process_conditions/process_condition"
10
+ Dir["#{File.dirname(__FILE__)}/process_conditions/*.rb"].each do |pc|
11
+ require pc
12
+ end
13
+
@@ -0,0 +1,24 @@
1
module Bluepill
  # Event log for a process: stores [event, reason, time] entries and renders
  # them as text.
  class ProcessStatistics
    # Timestamp layout: month/day/year hour:minute:second on a 24-hour clock.
    # (%M is minutes; %I would repeat the hour on a 12-hour clock.)
    STRFTIME = "%m/%d/%Y %H:%M:%S"

    # possibly persist this data.
    def initialize
      # NOTE(review): presumably RotationalArray.new(10) keeps only the 10
      # newest entries — confirm against Util::RotationalArray.
      @events = Util::RotationalArray.new(10)
    end

    # Appends an entry for event with the given reason (may be nil),
    # timestamped with the current time.
    def record_event(event, reason)
      @events.push([event, reason, Time.now])
    end

    # Returns "" when nothing has been recorded. Otherwise returns an
    # "event history:" header followed by one line per entry, in the reverse
    # of the order in which @events yields them.
    def to_s
      str = []
      @events.each do |(event, reason, time)|
        str << " #{event} at #{time.strftime(STRFTIME)} - #{reason || "unspecified"}"
      end
      # The header is appended last because the whole list is reversed below.
      str << "event history:" if str.size > 0
      str.reverse.join("\n")
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'socket'
2
+
3
module Bluepill
  # Helpers for the UNIX domain socket used to talk to a running daemon.
  # All methods are callable on the module itself (extend self).
  module Socket
    # Seconds a client waits for the server's reply.
    TIMEOUT = 10

    extend self

    # Opens a client connection to the socket for base_dir/name. With a
    # block, the socket is passed to the block (UNIXSocket.open semantics).
    def client(base_dir, name, &b)
      UNIXSocket.open(socket_path(base_dir, name), &b)
    end

    # Sends command over a fresh client connection and returns the reply,
    # which is read with Marshal.load. Aborts the program when the reply
    # does not arrive within TIMEOUT seconds or the stream ends early.
    def client_command(base_dir, name, command)
      client(base_dir, name) do |socket|
        Timeout.timeout(TIMEOUT) do
          socket.puts command
          Marshal.load(socket)
        end
      end
    rescue EOFError, Timeout::Error
      abort("Socket Timeout: Server may not be responding")
    end

    # Opens and returns the server socket for base_dir/name. If the socket
    # file already exists, a connection attempt decides whether it is stale
    # (the file is deleted and the server opened again) or owned by a live
    # server (a message is printed and the program exits with status 7).
    def server(base_dir, name)
      socket_path = self.socket_path(base_dir, name)
      begin
        UNIXServer.open(socket_path)
      rescue Errno::EADDRINUSE
        # if sock file has been created.  test to see if there is a server
        begin
          probe = UNIXSocket.open(socket_path)
        rescue Errno::ECONNREFUSED
          File.delete(socket_path)
          return UNIXServer.open(socket_path)
        else
          probe.close
          # This module has no logger of its own, so report on stderr.
          $stderr.puts("Server is already running!")
          exit(7)
        end
      end
    end

    # Path of the socket file: <base_dir>/socks/<name>.sock
    def socket_path(base_dir, name)
      File.join(base_dir, 'socks', "#{name}.sock")
    end
  end
end
47
+