scout_agent 3.0.7 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -109,6 +109,9 @@ module ScoutAgent
109
109
 
110
110
  # maintain the queue database
111
111
  db.maintain
112
+ status_database.maintain
113
+ # clean out old logs
114
+ ScoutAgent.remove_old_log_files(log)
112
115
 
113
116
  log.info("Messages queued successfully.")
114
117
  end
@@ -108,8 +108,11 @@ module ScoutAgent
108
108
  "#{run_time} seconds." )
109
109
  end
110
110
 
111
- # maintain the snapshots database
111
+ # maintain the databases
112
112
  db.maintain
113
+ status_database.maintain
114
+ # clean out old logs
115
+ ScoutAgent.remove_old_log_files(log)
113
116
 
114
117
  log.info("Snapshot complete.")
115
118
  this_file.flock(File::LOCK_UN) # release our snapshot lock
@@ -74,10 +74,7 @@ module ScoutAgent
74
74
  lifelines = agents.map { |agent| Lifeline.new(agent, log) }
75
75
  %w[TERM INT].each do |signal|
76
76
  trap(signal) do
77
- Thread.new do
78
- lifelines.each { |line| line.terminate }
79
- Process.waitall
80
- end
77
+ lifelines.each { |line| line.terminate }
81
78
  end
82
79
  end
83
80
  lifelines.each do |line|
@@ -88,6 +85,8 @@ module ScoutAgent
88
85
  lifelines.each do |line|
89
86
  line.join
90
87
  end
88
+ # wait for all children to obey our stop command
89
+ Process.waitall
91
90
  end
92
91
 
93
92
  #######
@@ -13,20 +13,29 @@ module ScoutAgent
13
13
  # to terminate.
14
14
  #
15
15
  class Stop < Assignment
16
+ #
17
+ # The number of periods between stop checks after a process has been
18
+ # signaled. Checks will be made until it's obvious the process obeyed the
19
+ # request or until this number of checks has been made.
20
+ #
21
+ WAIT_COUNT = 10
22
+ #
23
+ # The pause in seconds between stop checks after a process has been
24
+ # signaled.
25
+ #
26
+ WAIT_DELAY = 0.5
27
+
16
28
  # Runs the stop command.
17
29
  def execute
18
30
  @agent = IDCard.new(:lifeline)
19
31
  if @agent.pid_file.exist?
20
- puts "Stopping #{ScoutAgent.proper_agent_name} (PID #{@agent.pid})..."
21
- signal_and_wait("TERM")
22
- if @agent.pid_file.exist?
23
- puts "TERM signal was ignored, sending KILL..."
24
- signal_and_wait("KILL")
25
- if @agent.pid_file.exist?
26
- abort_with_failed_to_stop
27
- end
32
+ signal_all("TERM")
33
+ if Plan.pid_dir.entries.any? { |pid| pid.to_s =~ /\w+\.pid\z/ }
34
+ puts "\nTERM signals were ignored, sending KILL signals.\n\n"
35
+ signal_all("KILL")
36
+ abort_with_failed_to_stop
28
37
  end
29
- puts "Stopped."
38
+ puts "All processes stopped."
30
39
  else
31
40
  abort_with_not_running_notice
32
41
  end
@@ -37,28 +46,49 @@ module ScoutAgent
37
46
  #######
38
47
 
39
48
  #
40
- # Sends +signal_name+ (which is expected to be some kind of stop request
41
- # to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
42
- # of +wait_delay+ seconds checking between waits to see if the agent has
43
- # complied. It returns when the agent has exited or the total wait period
44
- # has expired.
49
+ # Sends +signal_name+ (assumed to be a stop request) to all subprocesses
50
+ # of the agent. The first process signaled is the lifeline process and it
51
+ # is given a few seconds to shut everything down smoothly as it is
52
+ # supposed to do. If that fails, the stray processes will be sent
53
+ # +signal_name+ directly.
45
54
  #
46
- def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
47
- # signal the main process
48
- @agent.signal(signal_name)
49
- # wait for it to stop
50
- wait_count.times do
51
- sleep wait_delay
52
- break unless @agent.pid_file.exist?
53
- end
54
- # signal other stray processes
55
+ def signal_all(signal_name)
56
+ # start with the Lifeline process as that should properly stop everyone
57
+ signal_and_wait(@agent, signal_name)
58
+ # signal any other stray processes
55
59
  Plan.pid_dir.each_entry do |process|
56
60
  if process.to_s =~ /(\w+)\.pid\z/
57
- IDCard.new($1).signal(signal_name)
61
+ signal_and_wait(IDCard.new($1), signal_name) unless $1 == "lifeline"
62
+ end
63
+ end
64
+ end
65
+
66
+ #
67
+ # Sends +signal_name+ (which is expected to be some kind of stop request)
68
+ # to the +id_card+. This method will then wait +WAIT_COUNT+ periods of
69
+ # +WAIT_DELAY+ seconds checking between waits to see if the agent has
70
+ # complied. It returns when the signaled process has exited or the total
71
+ # wait period has expired. The wait period is skipped for the +KILL+
72
+ # signal, since the process is not allowed to respond.
73
+ #
74
+ def signal_and_wait(id_card, signal_name)
75
+ puts "Sending #{signal_name} signal to the agent's " +
76
+ "#{id_card.process_name} process."
77
+ # signal the process identified by id_card
78
+ begin
79
+ id_card.signal(signal_name)
80
+ rescue Errno::ECHILD, Errno::ESRCH # no such process
81
+ # do nothing: it stopped
82
+ rescue Errno::EPERM # we don't have permission
83
+ abort_with_no_permission
84
+ end
85
+ unless signal_name == "KILL" # process cannot respond, so don't wait
86
+ # wait for it to stop
87
+ WAIT_COUNT.times do
88
+ sleep WAIT_DELAY
89
+ break unless id_card.pid_file.exist?
58
90
  end
59
91
  end
60
- rescue Errno::EPERM # we don't have permission
61
- abort_with_no_permission
62
92
  end
63
93
 
64
94
  #
@@ -75,6 +105,7 @@ module ScoutAgent
75
105
  #
76
106
  def abort_with_no_permission
77
107
  abort <<-END_PERMISSION.trim
108
+
78
109
  Unable to signal the daemon. Please rerun this command with
79
110
  super user privileges:
80
111
 
@@ -84,13 +115,16 @@ module ScoutAgent
84
115
  end
85
116
 
86
117
  #
87
- # Abort with an error message to the user that says we don't have enough
88
- # permission to stop the agent due to how it was started.
118
+ # Abort with an error message to the user that says we killed the agent
119
+ # but there are some stray files hanging around.
89
120
  #
90
121
  def abort_with_failed_to_stop
91
122
  abort <<-END_FAILED.trim
92
- Unable to stop the daemon. You may need to use the PID files
93
- in #{Plan.pid_dir} to clean up stay processes.
123
+
124
+ KILL signals were sent to all active processes and they
125
+ should be stopped now. You may wish to check the PID
126
+ files in #{Plan.pid_dir} to be sure. The agent should
127
+ clean up these old files as it relaunches.
94
128
  END_FAILED
95
129
  end
96
130
  end
@@ -7,15 +7,26 @@ module ScoutAgent
7
7
  # execution and to signal other processes.
8
8
  #
9
9
  class IDCard
10
- class << self
11
- #
12
- # This global attribute should contain the name of the current process.
13
- # It is set during a successful authorization.
14
- #
15
- # <b>Warning:</b> Be sure to clear this attribute immediately after a
16
- # fork() so you don't keep the parent's identity.
17
- #
18
- attr_accessor :me
10
+ #
11
+ # This global attribute should contain the IDCard for this process. It is
12
+ # set during a successful authorization.
13
+ #
14
+ # <b>Warning:</b> Be sure to clear this attribute immediately after a
15
+ # fork(), with a call to me=(), so you don't keep the parent's identity.
16
+ #
17
+ def self.me
18
+ @me ||= nil
19
+ end
20
+
21
+ #
22
+ # A setter for the identity of this process. This is set automatically as
23
+ # part of an authorization.
24
+ #
25
+ # <b>Warning:</b> Be sure to clear this attribute immediately after a
26
+ # fork() so you don't keep the parent's identity.
27
+ #
28
+ def self.me=(id_card)
29
+ @me = id_card
19
30
  end
20
31
 
21
32
  #
@@ -76,10 +87,10 @@ module ScoutAgent
76
87
  # <tt>IDCard::me()</tt> has been updated and an exit handle has been
77
88
  # installed to revoke() this claim as the process ends.
78
89
  #
79
- def authorize
90
+ def authorize(&block)
80
91
  File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |pid|
81
92
  pid.flock(File::LOCK_EX)
82
- if not block_given? or yield # allows for daemonization
93
+ if block.nil? or block.call # allows for daemonization
83
94
  pid.puts Process.pid
84
95
  else
85
96
  pid.flock(File::LOCK_UN)
@@ -105,7 +116,7 @@ module ScoutAgent
105
116
  # stale PID file found, clearing it and reloading
106
117
  if revoke
107
118
  pid.flock(File::LOCK_UN) # release the lock before we recurse
108
- return authorize # try again
119
+ return authorize(&block) # try again
109
120
  end
110
121
  rescue Errno::EACCES # don't have permission
111
122
  # nothing we can do so give up
@@ -2,16 +2,46 @@
2
2
  # encoding: UTF-8
3
3
 
4
4
  module ScoutAgent
5
+ #
6
+ # This class is a monitor for an Agent subprocess of the platform. It
7
+ # launches the Agent code and makes sure it continues to check-in at regular
8
+ # intervals, restarting the subprocess when it fails to do so.
9
+ #
5
10
  class Lifeline
11
+ #
12
+ # The number of seconds allowed to pass before the Agent subprocess is
13
+ # considered unresponsive.
14
+ #
6
15
  NO_CONTACT_TIMEOUT = 5
7
- CHECK_IN_FREQUENCY = 0.99 # gives us five check-ins before a cutoff
16
+ #
17
+ # The frequency with which the subprocess is expected to check-in. This is
18
+ # purposely set to a little under a second to give one more check-in
19
+ # possibility before the <tt>NO_CONTACT_TIMEOUT</tt> is reached.
20
+ #
21
+ CHECK_IN_FREQUENCY = 0.99
22
+ #
23
+ # The number of seconds the monitor will wait for a process to exit cleanly
24
+ # before forcing a stop.
25
+ #
8
26
  TERM_TO_KILL_PAUSE = 1
27
+ #
28
+ # The sequence of seconds this monitor will wait between restarts of the
29
+ # subprocess. The initial values are short, to try and get running again as
30
+ # soon as possible. However, this timeout grows larger up to a point to
31
+ # reduce strain on a server experiencing long term problems. The sequence
32
+ # will reset after a successful relaunch that runs for at least as long as
33
+ # the next number in the sequence (or the max).
34
+ #
9
35
  RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]
10
36
 
11
37
  #################
12
38
  ### Interface ###
13
39
  #################
14
40
 
41
+ #
42
+ # Prepares a monitor for the code specified by +agent+. You may also set
43
+ # the +log+ that log() messages will be appended to.
44
+ #
15
45
  def initialize(agent, log = WireTap.new(nil))
16
46
  @agent = agent
17
47
  @log = log
@@ -20,6 +50,7 @@ module ScoutAgent
20
50
  @reader = nil
21
51
  @writer = nil
22
52
  @launch_and_monitor_thread = nil
53
+ @termination_thread = nil
23
54
  @check_in_with_parent_thread = nil
24
55
  @code = nil
25
56
  @last_launch = nil
@@ -32,8 +63,13 @@ module ScoutAgent
32
63
 
33
64
  include Tracked
34
65
 
66
+ # The log file this monitor writes tracking information to.
35
67
  attr_reader :log
36
68
 
69
+ #
70
+ # This method outlines the process used to monitor an Agent. It is roughly:
71
+ # launch, monitor, kill as needed, and restart the process.
72
+ #
37
73
  def launch_and_monitor
38
74
  @launch_and_monitor_thread = Thread.new do
39
75
  Thread.current.abort_on_exception = true
@@ -48,22 +84,38 @@ module ScoutAgent
48
84
  end
49
85
  end
50
86
 
87
+ #
88
+ # Begins a termination of the Agent subprocess in a separate Thread. This
89
+ # monitor's join() method will also wait on this termination Thread to
90
+ # ensure everything gets the order to shutdown before we exit.
91
+ #
51
92
  def terminate
52
- if Process.pid == @parent_pid
53
- # stop monitoring
54
- log.info("Stopping the monitoring for '#{@agent}'.")
55
- @launch_and_monitor_thread.exit if @launch_and_monitor_thread
56
- # ask child process to exit
57
- log.info("Asking '#{@agent}' to stop.")
58
- IDCard.new(@agent).signal("TERM")
93
+ @termination_thread = Thread.new do
94
+ if Process.pid == @parent_pid
95
+ # stop monitoring
96
+ log.info("Stopping the monitoring for '#{@agent}'.")
97
+ @launch_and_monitor_thread.exit if @launch_and_monitor_thread
98
+ # ask child process to exit
99
+ log.info("Asking '#{@agent}' to stop.")
100
+ IDCard.new(@agent).signal("TERM")
101
+ end
59
102
  end
60
103
  rescue Errno::ESRCH # no such process
61
104
  # if already exited, so we are fine
62
105
  end
63
106
 
107
+ #
108
+ # Waits for the monitor Thread to be stopped by a natural termination before
109
+ # returning. If terminate() is called to start the shutdown, this method
110
+ # will also wait on the Thread spawned by that method to ensure everything
111
+ # gets the signal to stop.
112
+ #
64
113
  def join
65
114
  if Process.pid == @parent_pid and @launch_and_monitor_thread
66
- @launch_and_monitor_thread.join
115
+ @launch_and_monitor_thread.join # wait on the monitor to stop
116
+ if @termination_thread
117
+ @termination_thread.join # wait on us to stop the subprocess
118
+ end
67
119
  end
68
120
  end
69
121
 
@@ -71,10 +123,15 @@ module ScoutAgent
71
123
  private
72
124
  #######
73
125
 
74
- ##############
75
- ### Parent ###
76
- ##############
126
+ ###############
127
+ ### Monitor ###
128
+ ###############
77
129
 
130
+ #
131
+ # This method just rests for the proper amount of time between launches to
132
+ # ensure we're not overworking the server due to continuing issues. See
133
+ # +RELAUNCH_FREQUENCIES+ for details.
134
+ #
78
135
  def wait_for_launch
79
136
  if @last_launch
80
137
  seconds_ran = Time.now - @last_launch
@@ -95,11 +152,19 @@ module ScoutAgent
95
152
  end
96
153
  end
97
154
  end
98
-
155
+
156
+ #
157
+ # Creates a two-ended pipe for one way communication from the Agent checking
158
+ # in with the monitor.
159
+ #
99
160
  def prepare_pipe
100
161
  @reader, @writer = IO.pipe
101
162
  end
102
163
 
164
+ #
165
+ # This method fork()'s the subprocess and outlines the work done there:
166
+ # loading, starting the check-in Thread, and running the Agent code.
167
+ #
103
168
  def launch_child
104
169
  log.info("Launching '#{@agent}'.")
105
170
  status(@agent)
@@ -113,7 +178,8 @@ module ScoutAgent
113
178
  run_code
114
179
  end
115
180
  end
116
-
181
+
182
+ # Ensure that the writing end of the pipe is closed.
117
183
  def close_writer
118
184
  @writer.close
119
185
  rescue IOError # already closed
@@ -122,24 +188,36 @@ module ScoutAgent
122
188
  # it wasn't set so there's nothing to close
123
189
  end
124
190
 
191
+ #
192
+ # An infinite loop that just reads check-in messages from the Agent. This
193
+ # method will return when the Agent fails to report within
194
+ # +NO_CONTACT_TIMEOUT+.
195
+ #
125
196
  def monitor_child
126
197
  loop do
127
198
  check_in = nil
128
199
  begin
129
- Timeout.timeout(NO_CONTACT_TIMEOUT) { check_in = @reader.gets }
200
+ Timeout.timeout(NO_CONTACT_TIMEOUT) do
201
+ check_in = @reader.gets
202
+ end
130
203
  log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
131
204
  rescue Timeout::Error
132
205
  # check_in will stay nil
133
206
  log.error("'#{@agent}' failed to check-in in time.")
134
207
  end
135
208
  unless check_in.to_s =~
136
- /\A#{@child_pid}: \d{4}-\d{2}-\d{2} \d{2}:\d{2}\Z/
209
+ /\A#{@child_pid}:\s*\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\Z/
137
210
  log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
138
211
  break
139
212
  end
140
213
  end
141
214
  end
142
215
 
216
+ #
217
+ # This method is called after a monitor cycle fails, so it needs to ensure
218
+ # the Agent gets shutdown. It will also stop a running plugin, if found,
219
+ # for the master agent only.
220
+ #
143
221
  def restart_child
144
222
  log.info("Stopping '#{@agent}'.")
145
223
  status(@agent, :restarting)
@@ -152,6 +230,10 @@ module ScoutAgent
152
230
  end
153
231
  end
154
232
 
233
+ #
234
+ # This method overrides the base status() setter from Tracked, to add a way
235
+ # to combine messages when restarting multiple processes.
236
+ #
155
237
  def status(process, restarting = false)
156
238
  if db = status_database
157
239
  db.write_to_sqlite do |sqlite|
@@ -181,18 +263,27 @@ module ScoutAgent
181
263
  end
182
264
 
183
265
  #############
184
- ### Child ###
266
+ ### Agent ###
185
267
  #############
186
268
 
269
+ #
270
+ # Installs appropriate signal handlers for the Agent and clears the identity
271
+ # of the monitor.
272
+ #
187
273
  def reset_environment
188
274
  # swap out our parent's signal handlers
189
- install_shutdown_handler { finish_code }
190
- trap("ALRM") { alert_code }
275
+ install_shutdown_handler do
276
+ finish_code
277
+ end
278
+ trap("ALRM") do
279
+ alert_code
280
+ end
191
281
 
192
282
  # clear the parent's identity
193
283
  IDCard.me = nil
194
284
  end
195
285
 
286
+ # Ensure that the reading end of the pipe is closed.
196
287
  def close_reader
197
288
  @reader.close
198
289
  rescue IOError # already closed
@@ -201,16 +292,29 @@ module ScoutAgent
201
292
  # it wasn't set so there's nothing to close
202
293
  end
203
294
 
295
+ #
296
+ # Loads the code for the Agent to be monitored and fetches the object built
297
+ # by that code.
298
+ #
204
299
  def load_code
205
300
  require LIB_DIR + "agent"
206
301
  require LIB_DIR + "agent/#{@agent}_agent"
207
302
  @code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
208
303
  end
209
304
 
305
+ #
306
+ # Ensures that this Agent is authorized to run because a copy is not
307
+ # currently active.
308
+ #
210
309
  def authorize_code
211
310
  @code.authorize
212
311
  end
213
312
 
313
+ #
314
+ # An infinite loop that just writes check-in messages to the monitoring
315
+ # process. This code will trigger its own shutdown if the parent
316
+ # disappears (closing the pipe).
317
+ #
214
318
  def check_in_with_parent
215
319
  @check_in_with_parent_thread = Thread.new do
216
320
  Thread.current.abort_on_exception = true
@@ -226,10 +330,15 @@ module ScoutAgent
226
330
  end
227
331
  end
228
332
 
333
+ # Invokes the main code of the Agent.
229
334
  def run_code
230
335
  @code.run
231
336
  end
232
337
 
338
+ #
339
+ # Closes the pipe and invokes the finishing code of the Agent in a separate
340
+ # Thread.
341
+ #
233
342
  def finish_code
234
343
  close_writer
235
344
 
@@ -242,6 +351,7 @@ module ScoutAgent
242
351
  end
243
352
  end
244
353
 
354
+ # Invokes the code for the Agent to notice changes from the outside world.
245
355
  def alert_code
246
356
  if @code
247
357
  Thread.new do