scout_agent 3.0.7 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,6 +109,9 @@ module ScoutAgent
109
109
 
110
110
  # maintain the queue database
111
111
  db.maintain
112
+ status_database.maintain
113
+ # clean out old logs
114
+ ScoutAgent.remove_old_log_files(log)
112
115
 
113
116
  log.info("Messages queued successfully.")
114
117
  end
@@ -108,8 +108,11 @@ module ScoutAgent
108
108
  "#{run_time} seconds." )
109
109
  end
110
110
 
111
- # maintain the snapshots database
111
+ # maintain the databases
112
112
  db.maintain
113
+ status_database.maintain
114
+ # clean out old logs
115
+ ScoutAgent.remove_old_log_files(log)
113
116
 
114
117
  log.info("Snapshot complete.")
115
118
  this_file.flock(File::LOCK_UN) # release our snapshot lock
@@ -74,10 +74,7 @@ module ScoutAgent
74
74
  lifelines = agents.map { |agent| Lifeline.new(agent, log) }
75
75
  %w[TERM INT].each do |signal|
76
76
  trap(signal) do
77
- Thread.new do
78
- lifelines.each { |line| line.terminate }
79
- Process.waitall
80
- end
77
+ lifelines.each { |line| line.terminate }
81
78
  end
82
79
  end
83
80
  lifelines.each do |line|
@@ -88,6 +85,8 @@ module ScoutAgent
88
85
  lifelines.each do |line|
89
86
  line.join
90
87
  end
88
+ # wait for all children to obey our stop command
89
+ Process.waitall
91
90
  end
92
91
 
93
92
  #######
@@ -13,20 +13,29 @@ module ScoutAgent
13
13
  # to terminate.
14
14
  #
15
15
  class Stop < Assignment
16
+ #
17
+ # The number of periods between stop checks after a process has been
18
+ # signaled. Checks will be made until it's obvious the process obeyed the
19
+ # request or until this number of checks has been made.
20
+ #
21
+ WAIT_COUNT = 10
22
+ #
23
+ # The pause in seconds between stop checks after a process has been
24
+ # signaled.
25
+ #
26
+ WAIT_DELAY = 0.5
27
+
16
28
  # Runs the stop command.
17
29
  def execute
18
30
  @agent = IDCard.new(:lifeline)
19
31
  if @agent.pid_file.exist?
20
- puts "Stopping #{ScoutAgent.proper_agent_name} (PID #{@agent.pid})..."
21
- signal_and_wait("TERM")
22
- if @agent.pid_file.exist?
23
- puts "TERM signal was ignored, sending KILL..."
24
- signal_and_wait("KILL")
25
- if @agent.pid_file.exist?
26
- abort_with_failed_to_stop
27
- end
32
+ signal_all("TERM")
33
+ if Plan.pid_dir.entries.any? { |pid| pid.to_s =~ /\w+\.pid\z/ }
34
+ puts "\nTERM signals were ignored, sending KILL signals.\n\n"
35
+ signal_all("KILL")
36
+ abort_with_failed_to_stop
28
37
  end
29
- puts "Stopped."
38
+ puts "All processes stopped."
30
39
  else
31
40
  abort_with_not_running_notice
32
41
  end
@@ -37,28 +46,49 @@ module ScoutAgent
37
46
  #######
38
47
 
39
48
  #
40
- # Sends +signal_name+ (which is expected to be some kind of stop request
41
- # to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
42
- # of +wait_delay+ seconds checking between waits to see if the agent has
43
- # complied. It returns when the agent has exited or the total wait period
44
- # has expired.
49
+ # Sends +signal_name+ (assumed to be a stop request) to all subprocesses
50
+ # of the agent. The first process signaled is the lifeline process and it
51
+ # is given a few seconds to shut everything down smoothly as it is
52
+ # supposed to do. If that fails, the stray processes will be sent
53
+ # +signal_name+ directly.
45
54
  #
46
- def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
47
- # signal the main process
48
- @agent.signal(signal_name)
49
- # wait for it to stop
50
- wait_count.times do
51
- sleep wait_delay
52
- break unless @agent.pid_file.exist?
53
- end
54
- # signal other stray processes
55
+ def signal_all(signal_name)
56
+ # start with the Lifeline process as that should properly stop everyone
57
+ signal_and_wait(@agent, signal_name)
58
+ # signal any other stray processes
55
59
  Plan.pid_dir.each_entry do |process|
56
60
  if process.to_s =~ /(\w+)\.pid\z/
57
- IDCard.new($1).signal(signal_name)
61
+ signal_and_wait(IDCard.new($1), signal_name) unless $1 == "lifeline"
62
+ end
63
+ end
64
+ end
65
+
66
+ #
67
+ # Sends +signal_name+ (which is expected to be some kind of stop request)
68
+ # to the +id_card+. This method will then wait +WAIT_COUNT+ periods of
69
+ # +WAIT_DELAY+ seconds checking between waits to see if the agent has
70
+ # complied. It returns when the signaled process has exited or the total
71
+ # wait period has expired. The wait period is skipped for the +KILL+
72
+ # signal, since the process is not allowed to respond.
73
+ #
74
+ def signal_and_wait(id_card, signal_name)
75
+ puts "Sending #{signal_name} signal to the agent's " +
76
+ "#{id_card.process_name} process."
77
+ # signal the main process
78
+ begin
79
+ id_card.signal(signal_name)
80
+ rescue Errno::ECHILD, Errno::ESRCH # no such process
81
+ # do nothing: it stopped
82
+ rescue Errno::EPERM # we don't have permission
83
+ abort_with_no_permission
84
+ end
85
+ unless signal_name == "KILL" # process cannot respond, so don't wait
86
+ # wait for it to stop
87
+ WAIT_COUNT.times do
88
+ sleep WAIT_DELAY
89
+ break unless id_card.pid_file.exist?
58
90
  end
59
91
  end
60
- rescue Errno::EPERM # we don't have permission
61
- abort_with_no_permission
62
92
  end
63
93
 
64
94
  #
@@ -75,6 +105,7 @@ module ScoutAgent
75
105
  #
76
106
  def abort_with_no_permission
77
107
  abort <<-END_PERMISSION.trim
108
+
78
109
  Unable to signal the daemon. Please rerun this command with
79
110
  super user privileges:
80
111
 
@@ -84,13 +115,16 @@ module ScoutAgent
84
115
  end
85
116
 
86
117
  #
87
- # Abort with an error message to the user that says we don't have enough
88
- # permission to stop the agent due to how it was started.
118
+ # Abort with an error message to the user that says we killed the agent
119
+ # but there are some stray files hanging around.
89
120
  #
90
121
  def abort_with_failed_to_stop
91
122
  abort <<-END_FAILED.trim
92
- Unable to stop the daemon. You may need to use the PID files
93
- in #{Plan.pid_dir} to clean up stay processes.
123
+
124
+ KILL signals were sent to all active processes and they
125
+ should be stopped now. You may wish to check the PID
126
+ files in #{Plan.pid_dir} to be sure. The agent should
127
+ clean up these old files as it relaunches.
94
128
  END_FAILED
95
129
  end
96
130
  end
@@ -7,15 +7,26 @@ module ScoutAgent
7
7
  # execution and to signal other processes.
8
8
  #
9
9
  class IDCard
10
- class << self
11
- #
12
- # This global attribute should contain the name of the current process.
13
- # It is set during a successful authorization.
14
- #
15
- # <b>Warning:</b> Be sure to clear this attribute immediately after a
16
- # fork() so you don't keep the parent's identity.
17
- #
18
- attr_accessor :me
10
+ #
11
+ # This global attribute should contain the IDCard for this process. It is
12
+ # set during a successful authorization.
13
+ #
14
+ # <b>Warning:</b> Be sure to clear this attribute immediately after a
15
+ # fork(), with a call to me=(), so you don't keep the parent's identity.
16
+ #
17
+ def self.me
18
+ @me ||= nil
19
+ end
20
+
21
+ #
22
+ # A setter for the identity of this process. This is set automatically as
23
+ # part of an authorization.
24
+ #
25
+ # <b>Warning:</b> Be sure to clear this attribute immediately after a
26
+ # fork() so you don't keep the parent's identity.
27
+ #
28
+ def self.me=(id_card)
29
+ @me = id_card
19
30
  end
20
31
 
21
32
  #
@@ -76,10 +87,10 @@ module ScoutAgent
76
87
  # <tt>IDCard::me()</tt> has been updated and an exit handle has been
77
88
  # installed to revoke() this claim as the process ends.
78
89
  #
79
- def authorize
90
+ def authorize(&block)
80
91
  File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |pid|
81
92
  pid.flock(File::LOCK_EX)
82
- if not block_given? or yield # allows for daemonization
93
+ if block.nil? or block.call # allows for daemonization
83
94
  pid.puts Process.pid
84
95
  else
85
96
  pid.flock(File::LOCK_UN)
@@ -105,7 +116,7 @@ module ScoutAgent
105
116
  # stale PID file found, clearing it and reloading
106
117
  if revoke
107
118
  pid.flock(File::LOCK_UN) # release the lock before we recurse
108
- return authorize # try again
119
+ return authorize(&block) # try again
109
120
  end
110
121
  rescue Errno::EACCES # don't have permission
111
122
  # nothing we can do so give up
@@ -2,16 +2,46 @@
2
2
  # encoding: UTF-8
3
3
 
4
4
  module ScoutAgent
5
+ #
6
+ # This class is a monitor for an Agent subprocess of the platform. It
7
+ # launches the Agent code and makes sure it continues to check-in at regular
8
+ # intervals, restarting the subprocess when it fails to do so.
9
+ #
5
10
  class Lifeline
11
+ #
12
+ # The number of seconds allowed to pass before the Agent subprocess is
13
+ # considered unresponsive.
14
+ #
6
15
  NO_CONTACT_TIMEOUT = 5
7
- CHECK_IN_FREQUENCY = 0.99 # gives us five check-ins before a cutoff
16
+ #
17
+ # The frequency with which the subprocess is expected to check-in. This is
18
+ # purposely set to a little under a second to give one more check-in
19
+ # possibility before the <tt>NO_CONTACT_TIMEOUT</tt> is reached.
20
+ #
21
+ CHECK_IN_FREQUENCY = 0.99
22
+ #
23
+ # The number of seconds the monitor will wait for a process to exit cleanly
24
+ # before forcing a stop.
25
+ #
8
26
  TERM_TO_KILL_PAUSE = 1
27
+ #
28
+ # The sequence of seconds this monitor will wait between restarts of the
29
+ # subprocess. The initial values are short, to try and get running again as
30
+ # soon as possible. However, this timeout grows larger up to a point to
31
+ # reduce strain on a server experiencing long term problems. The sequence
32
+ # will reset after a successful relaunch that runs for at least as long as
33
+ # the next number in the sequence (or the max).
34
+ #
9
35
  RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]
10
36
 
11
37
  #################
12
38
  ### Interface ###
13
39
  #################
14
40
 
41
+ #
42
+ # Prepares a monitor for the code specified by +agent+. You may also set
43
+ # the +log+ that messages will be appended to.
44
+ #
15
45
  def initialize(agent, log = WireTap.new(nil))
16
46
  @agent = agent
17
47
  @log = log
@@ -20,6 +50,7 @@ module ScoutAgent
20
50
  @reader = nil
21
51
  @writer = nil
22
52
  @launch_and_monitor_thread = nil
53
+ @termination_thread = nil
23
54
  @check_in_with_parent_thread = nil
24
55
  @code = nil
25
56
  @last_launch = nil
@@ -32,8 +63,13 @@ module ScoutAgent
32
63
 
33
64
  include Tracked
34
65
 
66
+ # The log file this monitor writes tracking information to.
35
67
  attr_reader :log
36
68
 
69
+ #
70
+ # This method outlines the process used to monitor an Agent. It is roughly:
71
+ # launch, monitor, kill as needed, and restart the process.
72
+ #
37
73
  def launch_and_monitor
38
74
  @launch_and_monitor_thread = Thread.new do
39
75
  Thread.current.abort_on_exception = true
@@ -48,22 +84,38 @@ module ScoutAgent
48
84
  end
49
85
  end
50
86
 
87
+ #
88
+ # Begins a termination of the Agent subprocess in a separate Thread. This
89
+ # monitor's join() method will also wait on this termination Thread to
90
+ # ensure everything gets the order to shutdown before we exit.
91
+ #
51
92
  def terminate
52
- if Process.pid == @parent_pid
53
- # stop monitoring
54
- log.info("Stopping the monitoring for '#{@agent}'.")
55
- @launch_and_monitor_thread.exit if @launch_and_monitor_thread
56
- # ask child process to exit
57
- log.info("Asking '#{@agent}' to stop.")
58
- IDCard.new(@agent).signal("TERM")
93
+ @termination_thread = Thread.new do
94
+ if Process.pid == @parent_pid
95
+ # stop monitoring
96
+ log.info("Stopping the monitoring for '#{@agent}'.")
97
+ @launch_and_monitor_thread.exit if @launch_and_monitor_thread
98
+ # ask child process to exit
99
+ log.info("Asking '#{@agent}' to stop.")
100
+ IDCard.new(@agent).signal("TERM")
101
+ end
59
102
  end
60
103
  rescue Errno::ESRCH # no such process
61
104
  # it already exited, so we are fine
62
105
  end
63
106
 
107
+ #
108
+ # Waits for the monitor Thread to be stopped by a natural termination before
109
+ # returning. If terminate() is called to start the shutdown, this method
110
+ # will also wait on the Thread spawned by that method to ensure everything
111
+ # gets the signal to stop.
112
+ #
64
113
  def join
65
114
  if Process.pid == @parent_pid and @launch_and_monitor_thread
66
- @launch_and_monitor_thread.join
115
+ @launch_and_monitor_thread.join # wait on the monitor to stop
116
+ if @termination_thread
117
+ @termination_thread.join # wait on us to stop the subprocess
118
+ end
67
119
  end
68
120
  end
69
121
 
@@ -71,10 +123,15 @@ module ScoutAgent
71
123
  private
72
124
  #######
73
125
 
74
- ##############
75
- ### Parent ###
76
- ##############
126
+ ###############
127
+ ### Monitor ###
128
+ ###############
77
129
 
130
+ #
131
+ # This method just rests for the proper amount of time between launches to
132
+ # ensure we're not overworking the server due to continuing issues. See
133
+ # +RELAUNCH_FREQUENCIES+ for details.
134
+ #
78
135
  def wait_for_launch
79
136
  if @last_launch
80
137
  seconds_ran = Time.now - @last_launch
@@ -95,11 +152,19 @@ module ScoutAgent
95
152
  end
96
153
  end
97
154
  end
98
-
155
+
156
+ #
157
+ # Creates a two-ended pipe for one way communication from the Agent checking
158
+ # in with the monitor.
159
+ #
99
160
  def prepare_pipe
100
161
  @reader, @writer = IO.pipe
101
162
  end
102
163
 
164
+ #
165
+ # This method fork()'s the subprocess and outlines the work done there:
166
+ # loading, starting the check-in Thread, and running the Agent code.
167
+ #
103
168
  def launch_child
104
169
  log.info("Launching '#{@agent}'.")
105
170
  status(@agent)
@@ -113,7 +178,8 @@ module ScoutAgent
113
178
  run_code
114
179
  end
115
180
  end
116
-
181
+
182
+ # Ensure that the writing end of the pipe is closed.
117
183
  def close_writer
118
184
  @writer.close
119
185
  rescue IOError # already closed
@@ -122,24 +188,36 @@ module ScoutAgent
122
188
  # it wasn't set so there's nothing to close
123
189
  end
124
190
 
191
+ #
192
+ # An infinite loop that just reads check-in messages from the Agent. This
193
+ # method will return when the Agent fails to report within
194
+ # +NO_CONTACT_TIMEOUT+.
195
+ #
125
196
  def monitor_child
126
197
  loop do
127
198
  check_in = nil
128
199
  begin
129
- Timeout.timeout(NO_CONTACT_TIMEOUT) { check_in = @reader.gets }
200
+ Timeout.timeout(NO_CONTACT_TIMEOUT) do
201
+ check_in = @reader.gets
202
+ end
130
203
  log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
131
204
  rescue Timeout::Error
132
205
  # check_in will stay nil
133
206
  log.error("'#{@agent}' failed to check-in in time.")
134
207
  end
135
208
  unless check_in.to_s =~
136
- /\A#{@child_pid}: \d{4}-\d{2}-\d{2} \d{2}:\d{2}\Z/
209
+ /\A#{@child_pid}:\s*\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\Z/
137
210
  log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
138
211
  break
139
212
  end
140
213
  end
141
214
  end
142
215
 
216
+ #
217
+ # This method is called after a monitor cycle fails, so it needs to ensure
218
+ # the Agent gets shutdown. It will also stop a running plugin, if found,
219
+ # for the master agent only.
220
+ #
143
221
  def restart_child
144
222
  log.info("Stopping '#{@agent}'.")
145
223
  status(@agent, :restarting)
@@ -152,6 +230,10 @@ module ScoutAgent
152
230
  end
153
231
  end
154
232
 
233
+ #
234
+ # This method overrides the base status() setter from Tracked, to add a way
235
+ # the combine messages when restarting multiple processes.
236
+ #
155
237
  def status(process, restarting = false)
156
238
  if db = status_database
157
239
  db.write_to_sqlite do |sqlite|
@@ -181,18 +263,27 @@ module ScoutAgent
181
263
  end
182
264
 
183
265
  #############
184
- ### Child ###
266
+ ### Agent ###
185
267
  #############
186
268
 
269
+ #
270
+ # Installs appropriate signal handlers for the Agent and clears the identity
271
+ # of the monitor.
272
+ #
187
273
  def reset_environment
188
274
  # swap out our parent's signal handlers
189
- install_shutdown_handler { finish_code }
190
- trap("ALRM") { alert_code }
275
+ install_shutdown_handler do
276
+ finish_code
277
+ end
278
+ trap("ALRM") do
279
+ alert_code
280
+ end
191
281
 
192
282
  # clear the parent's identity
193
283
  IDCard.me = nil
194
284
  end
195
285
 
286
+ # Ensure that the reading end of the pipe is closed.
196
287
  def close_reader
197
288
  @reader.close
198
289
  rescue IOError # already closed
@@ -201,16 +292,29 @@ module ScoutAgent
201
292
  # it wasn't set so there's nothing to close
202
293
  end
203
294
 
295
+ #
296
+ # Loads the code for the Agent to be monitored and fetches the object built
297
+ # by that code.
298
+ #
204
299
  def load_code
205
300
  require LIB_DIR + "agent"
206
301
  require LIB_DIR + "agent/#{@agent}_agent"
207
302
  @code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
208
303
  end
209
304
 
305
+ #
306
+ # Ensures that this Agent is authorized to run because a copy is not
307
+ # currently active.
308
+ #
210
309
  def authorize_code
211
310
  @code.authorize
212
311
  end
213
312
 
313
+ #
314
+ # An infinite loop that just writes check-in messages to the monitoring
315
+ # process. This code will trigger its own shutdown if the parent
316
+ # disappears (closing the pipe).
317
+ #
214
318
  def check_in_with_parent
215
319
  @check_in_with_parent_thread = Thread.new do
216
320
  Thread.current.abort_on_exception = true
@@ -226,10 +330,15 @@ module ScoutAgent
226
330
  end
227
331
  end
228
332
 
333
+ # Invokes the main code of the Agent.
229
334
  def run_code
230
335
  @code.run
231
336
  end
232
337
 
338
+ #
339
+ # Closes the pipe and invokes the finishing code of the Agent in a separate
340
+ # Thread.
341
+ #
233
342
  def finish_code
234
343
  close_writer
235
344
 
@@ -242,6 +351,7 @@ module ScoutAgent
242
351
  end
243
352
  end
244
353
 
354
+ # Invokes the code for the Agent to notice changes from the outside world.
245
355
  def alert_code
246
356
  if @code
247
357
  Thread.new do