scout_agent 3.0.7 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -0
- data/README +44 -2
- data/Rakefile +1 -1
- data/TODO +7 -1
- data/lib/scout_agent.rb +26 -1
- data/lib/scout_agent/agent/communication_agent.rb +75 -2
- data/lib/scout_agent/agent/master_agent.rb +132 -31
- data/lib/scout_agent/assignment/queue.rb +3 -0
- data/lib/scout_agent/assignment/snapshot.rb +4 -1
- data/lib/scout_agent/assignment/start.rb +3 -4
- data/lib/scout_agent/assignment/stop.rb +64 -30
- data/lib/scout_agent/id_card.rb +23 -12
- data/lib/scout_agent/lifeline.rb +129 -19
- data/lib/scout_agent/mission.rb +221 -68
- data/lib/scout_agent/order.rb +2 -2
- data/lib/scout_agent/plan.rb +38 -18
- metadata +2 -2
@@ -108,8 +108,11 @@ module ScoutAgent
|
|
108
108
|
"#{run_time} seconds." )
|
109
109
|
end
|
110
110
|
|
111
|
-
# maintain the
|
111
|
+
# maintain the databases
|
112
112
|
db.maintain
|
113
|
+
status_database.maintain
|
114
|
+
# clean out old logs
|
115
|
+
ScoutAgent.remove_old_log_files(log)
|
113
116
|
|
114
117
|
log.info("Snapshot complete.")
|
115
118
|
this_file.flock(File::LOCK_UN) # release our snapshot lock
|
@@ -74,10 +74,7 @@ module ScoutAgent
|
|
74
74
|
lifelines = agents.map { |agent| Lifeline.new(agent, log) }
|
75
75
|
%w[TERM INT].each do |signal|
|
76
76
|
trap(signal) do
|
77
|
-
|
78
|
-
lifelines.each { |line| line.terminate }
|
79
|
-
Process.waitall
|
80
|
-
end
|
77
|
+
lifelines.each { |line| line.terminate }
|
81
78
|
end
|
82
79
|
end
|
83
80
|
lifelines.each do |line|
|
@@ -88,6 +85,8 @@ module ScoutAgent
|
|
88
85
|
lifelines.each do |line|
|
89
86
|
line.join
|
90
87
|
end
|
88
|
+
# wait for all children to obey our stop command
|
89
|
+
Process.waitall
|
91
90
|
end
|
92
91
|
|
93
92
|
#######
|
@@ -13,20 +13,29 @@ module ScoutAgent
|
|
13
13
|
# to terminate.
|
14
14
|
#
|
15
15
|
class Stop < Assignment
|
16
|
+
#
|
17
|
+
# The number of periods between stop checks after a process has been
|
18
|
+
# signaled. Checks will be made until it's obvious the process obeyed the
|
19
|
+
# request or until this number of checks has been made.
|
20
|
+
#
|
21
|
+
WAIT_COUNT = 10
|
22
|
+
#
|
23
|
+
# The pause in seconds between stop checks after a process has been
|
24
|
+
# signaled.
|
25
|
+
#
|
26
|
+
WAIT_DELAY = 0.5
|
27
|
+
|
16
28
|
# Runs the stop command.
|
17
29
|
def execute
|
18
30
|
@agent = IDCard.new(:lifeline)
|
19
31
|
if @agent.pid_file.exist?
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
if @agent.pid_file.exist?
|
26
|
-
abort_with_failed_to_stop
|
27
|
-
end
|
32
|
+
signal_all("TERM")
|
33
|
+
if Plan.pid_dir.entries.any? { |pid| pid.to_s =~ /\w+\.pid\z/ }
|
34
|
+
puts "\nTERM signals were ignored, sending KILL signals.\n\n"
|
35
|
+
signal_all("KILL")
|
36
|
+
abort_with_failed_to_stop
|
28
37
|
end
|
29
|
-
puts "
|
38
|
+
puts "All processes stopped."
|
30
39
|
else
|
31
40
|
abort_with_not_running_notice
|
32
41
|
end
|
@@ -37,28 +46,49 @@ module ScoutAgent
|
|
37
46
|
#######
|
38
47
|
|
39
48
|
#
|
40
|
-
# Sends +signal_name+ (
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
49
|
+
# Sends +signal_name+ (assumed to be a stop request) to all subprocesses
|
50
|
+
# of the agent. The first process signaled is the lifeline process and it
|
51
|
+
# is given a few seconds to shut everything down smoothly as it is
|
52
|
+
# supposed to do. If that fails, the stray processes will be sent
|
53
|
+
# +signal_name+ directly.
|
45
54
|
#
|
46
|
-
def
|
47
|
-
#
|
48
|
-
@agent
|
49
|
-
#
|
50
|
-
wait_count.times do
|
51
|
-
sleep wait_delay
|
52
|
-
break unless @agent.pid_file.exist?
|
53
|
-
end
|
54
|
-
# signal other stray processes
|
55
|
+
def signal_all(signal_name)
|
56
|
+
# start with the Lifeline process as that should properly stop everyone
|
57
|
+
signal_and_wait(@agent, signal_name)
|
58
|
+
# signal any other stray processes
|
55
59
|
Plan.pid_dir.each_entry do |process|
|
56
60
|
if process.to_s =~ /(\w+)\.pid\z/
|
57
|
-
IDCard.new($1)
|
61
|
+
signal_and_wait(IDCard.new($1), signal_name) unless $1 == "lifeline"
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
#
|
67
|
+
# Sends +signal_name+ (which is expected to be some kind of stop request)
|
68
|
+
# to the +id_card+. This method will then wait +WAIT_COUNT+ periods of
|
69
|
+
# +WAIT_DELAY+ seconds checking between waits to see if the agent has
|
70
|
+
# complied. It returns when the signaled process has exited or the total
|
71
|
+
# wait period has expired. The wait period is skipped for the +KILL+
|
72
|
+
# signal, since the process is not allowed to respond.
|
73
|
+
#
|
74
|
+
def signal_and_wait(id_card, signal_name)
|
75
|
+
puts "Sending #{signal_name} signal to the agent's " +
|
76
|
+
"#{id_card.process_name} process."
|
77
|
+
# signal the main process
|
78
|
+
begin
|
79
|
+
id_card.signal(signal_name)
|
80
|
+
rescue Errno::ECHILD, Errno::ESRCH # no such process
|
81
|
+
# do nothing: it stopped
|
82
|
+
rescue Errno::EPERM # we don't have permission
|
83
|
+
abort_with_no_permission
|
84
|
+
end
|
85
|
+
unless signal_name == "KILL" # process cannot respond, so don't wait
|
86
|
+
# wait for it to stop
|
87
|
+
WAIT_COUNT.times do
|
88
|
+
sleep WAIT_DELAY
|
89
|
+
break unless id_card.pid_file.exist?
|
58
90
|
end
|
59
91
|
end
|
60
|
-
rescue Errno::EPERM # we don't have permission
|
61
|
-
abort_with_no_permission
|
62
92
|
end
|
63
93
|
|
64
94
|
#
|
@@ -75,6 +105,7 @@ module ScoutAgent
|
|
75
105
|
#
|
76
106
|
def abort_with_no_permission
|
77
107
|
abort <<-END_PERMISSION.trim
|
108
|
+
|
78
109
|
Unable to signal the daemon. Please rerun this command with
|
79
110
|
super user privileges:
|
80
111
|
|
@@ -84,13 +115,16 @@ module ScoutAgent
|
|
84
115
|
end
|
85
116
|
|
86
117
|
#
|
87
|
-
# Abort with an error message to the user that says we
|
88
|
-
#
|
118
|
+
# Abort with an error message to the user that says we killed the agent
|
119
|
+
# but there are some stray files hanging around.
|
89
120
|
#
|
90
121
|
def abort_with_failed_to_stop
|
91
122
|
abort <<-END_FAILED.trim
|
92
|
-
|
93
|
-
|
123
|
+
|
124
|
+
KILL signals were sent to all active processes and they
|
125
|
+
should be stopped now. You may wish to check the PID
|
126
|
+
files in #{Plan.pid_dir} to be sure. The agent should
|
127
|
+
clean up these old files as it relaunches.
|
94
128
|
END_FAILED
|
95
129
|
end
|
96
130
|
end
|
data/lib/scout_agent/id_card.rb
CHANGED
@@ -7,15 +7,26 @@ module ScoutAgent
|
|
7
7
|
# execution and to signal other processes.
|
8
8
|
#
|
9
9
|
class IDCard
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
10
|
+
#
|
11
|
+
# This global attribute should contain the IDCard for this process. It is
|
12
|
+
# set during a successful authorization.
|
13
|
+
#
|
14
|
+
# <b>Warning:</b> Be sure to clear this attribute immediately after a
|
15
|
+
# fork(), with a call to me=(), so you don't keep the parent's identity.
|
16
|
+
#
|
17
|
+
def self.me
|
18
|
+
@me ||= nil
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# A setter for the identity of this process. This is set automatically as
|
23
|
+
# part of an authorization.
|
24
|
+
#
|
25
|
+
# <b>Warning:</b> Be sure to clear this attribute immediately after a
|
26
|
+
# fork() so you don't keep the parent's identity.
|
27
|
+
#
|
28
|
+
def self.me=(id_card)
|
29
|
+
@me = id_card
|
19
30
|
end
|
20
31
|
|
21
32
|
#
|
@@ -76,10 +87,10 @@ module ScoutAgent
|
|
76
87
|
# <tt>IDCard::me()</tt> has been updated and an exit handler has been
|
77
88
|
# installed to revoke() this claim as the process ends.
|
78
89
|
#
|
79
|
-
def authorize
|
90
|
+
def authorize(&block)
|
80
91
|
File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |pid|
|
81
92
|
pid.flock(File::LOCK_EX)
|
82
|
-
if
|
93
|
+
if block.nil? or block.call # allows for daemonization
|
83
94
|
pid.puts Process.pid
|
84
95
|
else
|
85
96
|
pid.flock(File::LOCK_UN)
|
@@ -105,7 +116,7 @@ module ScoutAgent
|
|
105
116
|
# stale PID file found, clearing it and reloading
|
106
117
|
if revoke
|
107
118
|
pid.flock(File::LOCK_UN) # release the lock before we recurse
|
108
|
-
return authorize
|
119
|
+
return authorize(&block) # try again
|
109
120
|
end
|
110
121
|
rescue Errno::EACCES # don't have permission
|
111
122
|
# nothing we can do so give up
|
data/lib/scout_agent/lifeline.rb
CHANGED
@@ -2,16 +2,46 @@
|
|
2
2
|
# encoding: UTF-8
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
|
+
#
|
6
|
+
# This class is a monitor for an Agent subprocess of the platform. It
|
7
|
+
# launches the Agent code and makes sure it continues to check-in at regular
|
8
|
+
# intervals, restarting the subprocess when it fails to do so.
|
9
|
+
#
|
5
10
|
class Lifeline
|
11
|
+
#
|
12
|
+
# The number of seconds allowed to pass before the Agent subprocess is
|
13
|
+
# considered unresponsive.
|
14
|
+
#
|
6
15
|
NO_CONTACT_TIMEOUT = 5
|
7
|
-
|
16
|
+
#
|
17
|
+
# The frequency with which the subprocess is expected to check-in. This is
|
18
|
+
# purposely set to a little under a second to give one more check-in
|
19
|
+
# possibility before the <tt>NO_CONTACT_TIMEOUT</tt> is reached.
|
20
|
+
#
|
21
|
+
CHECK_IN_FREQUENCY = 0.99
|
22
|
+
#
|
23
|
+
# The number of seconds the monitor will wait for a process to exit cleanly
|
24
|
+
# before forcing a stop.
|
25
|
+
#
|
8
26
|
TERM_TO_KILL_PAUSE = 1
|
27
|
+
#
|
28
|
+
# The sequence of seconds this monitor will wait between restarts of the
|
29
|
+
# subprocess. The initial values are short, to try and get running again as
|
30
|
+
# soon as possible. However, this timeout grows larger up to a point to
|
31
|
+
# reduce strain on a server experiencing long term problems. The sequence
|
32
|
+
# will reset after a successful relaunch that runs for at least as long as
|
33
|
+
# the next number in the sequence (or the max).
|
34
|
+
#
|
9
35
|
RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]
|
10
36
|
|
11
37
|
#################
|
12
38
|
### Interface ###
|
13
39
|
#################
|
14
40
|
|
41
|
+
#
|
42
|
+
# Prepares a monitor for the code specified by +agent+. You may also set
|
43
|
+
# the +log+ that log() messages will be appended to.
|
44
|
+
#
|
15
45
|
def initialize(agent, log = WireTap.new(nil))
|
16
46
|
@agent = agent
|
17
47
|
@log = log
|
@@ -20,6 +50,7 @@ module ScoutAgent
|
|
20
50
|
@reader = nil
|
21
51
|
@writer = nil
|
22
52
|
@launch_and_monitor_thread = nil
|
53
|
+
@termination_thread = nil
|
23
54
|
@check_in_with_parent_thread = nil
|
24
55
|
@code = nil
|
25
56
|
@last_launch = nil
|
@@ -32,8 +63,13 @@ module ScoutAgent
|
|
32
63
|
|
33
64
|
include Tracked
|
34
65
|
|
66
|
+
# The log file this monitor writes tracking information to.
|
35
67
|
attr_reader :log
|
36
68
|
|
69
|
+
#
|
70
|
+
# This method outlines the process used to monitor an Agent. It is roughly:
|
71
|
+
# launch, monitor, kill as needed, and restart the process.
|
72
|
+
#
|
37
73
|
def launch_and_monitor
|
38
74
|
@launch_and_monitor_thread = Thread.new do
|
39
75
|
Thread.current.abort_on_exception = true
|
@@ -48,22 +84,38 @@ module ScoutAgent
|
|
48
84
|
end
|
49
85
|
end
|
50
86
|
|
87
|
+
#
|
88
|
+
# Begins a termination of the Agent subprocess in a separate Thread. This
|
89
|
+
# monitor's join() method will also wait on this termination Thread to
|
90
|
+
# ensure everything gets the order to shutdown before we exit.
|
91
|
+
#
|
51
92
|
def terminate
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
93
|
+
@termination_thread = Thread.new do
|
94
|
+
if Process.pid == @parent_pid
|
95
|
+
# stop monitoring
|
96
|
+
log.info("Stopping the monitoring for '#{@agent}'.")
|
97
|
+
@launch_and_monitor_thread.exit if @launch_and_monitor_thread
|
98
|
+
# ask child process to exit
|
99
|
+
log.info("Asking '#{@agent}' to stop.")
|
100
|
+
IDCard.new(@agent).signal("TERM")
|
101
|
+
end
|
59
102
|
end
|
60
103
|
rescue Errno::ESRCH # no such process
|
61
104
|
# it already exited, so we are fine
|
62
105
|
end
|
63
106
|
|
107
|
+
#
|
108
|
+
# Waits for the monitor Thread to be stopped by a natural termination before
|
109
|
+
# returning. If terminate() is called to start the shutdown, this method
|
110
|
+
# will also wait on the Thread spawned by that method to ensure everything
|
111
|
+
# gets the signal to stop.
|
112
|
+
#
|
64
113
|
def join
|
65
114
|
if Process.pid == @parent_pid and @launch_and_monitor_thread
|
66
|
-
@launch_and_monitor_thread.join
|
115
|
+
@launch_and_monitor_thread.join # wait on the monitor to stop
|
116
|
+
if @termination_thread
|
117
|
+
@termination_thread.join # wait on us to stop the subprocess
|
118
|
+
end
|
67
119
|
end
|
68
120
|
end
|
69
121
|
|
@@ -71,10 +123,15 @@ module ScoutAgent
|
|
71
123
|
private
|
72
124
|
#######
|
73
125
|
|
74
|
-
|
75
|
-
###
|
76
|
-
|
126
|
+
###############
|
127
|
+
### Monitor ###
|
128
|
+
###############
|
77
129
|
|
130
|
+
#
|
131
|
+
# This method just rests for the proper amount of time between launches to
|
132
|
+
# ensure we're not overworking the server due to continuing issues. See
|
133
|
+
# +RELAUNCH_FREQUENCIES+ for details.
|
134
|
+
#
|
78
135
|
def wait_for_launch
|
79
136
|
if @last_launch
|
80
137
|
seconds_ran = Time.now - @last_launch
|
@@ -95,11 +152,19 @@ module ScoutAgent
|
|
95
152
|
end
|
96
153
|
end
|
97
154
|
end
|
98
|
-
|
155
|
+
|
156
|
+
#
|
157
|
+
# Creates a two-ended pipe for one way communication from the Agent checking
|
158
|
+
# in with the monitor.
|
159
|
+
#
|
99
160
|
def prepare_pipe
|
100
161
|
@reader, @writer = IO.pipe
|
101
162
|
end
|
102
163
|
|
164
|
+
#
|
165
|
+
# This method fork()'s the subprocess and outlines the work done there:
|
166
|
+
# loading, starting the check-in Thread, and running the Agent code.
|
167
|
+
#
|
103
168
|
def launch_child
|
104
169
|
log.info("Launching '#{@agent}'.")
|
105
170
|
status(@agent)
|
@@ -113,7 +178,8 @@ module ScoutAgent
|
|
113
178
|
run_code
|
114
179
|
end
|
115
180
|
end
|
116
|
-
|
181
|
+
|
182
|
+
# Ensure that the writing end of the pipe is closed.
|
117
183
|
def close_writer
|
118
184
|
@writer.close
|
119
185
|
rescue IOError # already closed
|
@@ -122,24 +188,36 @@ module ScoutAgent
|
|
122
188
|
# it wasn't set so there's nothing to close
|
123
189
|
end
|
124
190
|
|
191
|
+
#
|
192
|
+
# An infinite loop that just reads check-in messages from the Agent. This
|
193
|
+
# method will return when the Agent fails to report within
|
194
|
+
# +NO_CONTACT_TIMEOUT+.
|
195
|
+
#
|
125
196
|
def monitor_child
|
126
197
|
loop do
|
127
198
|
check_in = nil
|
128
199
|
begin
|
129
|
-
Timeout.timeout(NO_CONTACT_TIMEOUT)
|
200
|
+
Timeout.timeout(NO_CONTACT_TIMEOUT) do
|
201
|
+
check_in = @reader.gets
|
202
|
+
end
|
130
203
|
log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
|
131
204
|
rescue Timeout::Error
|
132
205
|
# check_in will stay nil
|
133
206
|
log.error("'#{@agent}' failed to check-in in time.")
|
134
207
|
end
|
135
208
|
unless check_in.to_s =~
|
136
|
-
/\A#{@child_pid}
|
209
|
+
/\A#{@child_pid}:\s*\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\Z/
|
137
210
|
log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
|
138
211
|
break
|
139
212
|
end
|
140
213
|
end
|
141
214
|
end
|
142
215
|
|
216
|
+
#
|
217
|
+
# This method is called after a monitor cycle fails, so it needs to ensure
|
218
|
+
# the Agent gets shutdown. It will also stop a running plugin, if found,
|
219
|
+
# for the master agent only.
|
220
|
+
#
|
143
221
|
def restart_child
|
144
222
|
log.info("Stopping '#{@agent}'.")
|
145
223
|
status(@agent, :restarting)
|
@@ -152,6 +230,10 @@ module ScoutAgent
|
|
152
230
|
end
|
153
231
|
end
|
154
232
|
|
233
|
+
#
|
234
|
+
# This method overrides the base status() setter from Tracked, to add a way
|
235
|
+
# to combine messages when restarting multiple processes.
|
236
|
+
#
|
155
237
|
def status(process, restarting = false)
|
156
238
|
if db = status_database
|
157
239
|
db.write_to_sqlite do |sqlite|
|
@@ -181,18 +263,27 @@ module ScoutAgent
|
|
181
263
|
end
|
182
264
|
|
183
265
|
#############
|
184
|
-
###
|
266
|
+
### Agent ###
|
185
267
|
#############
|
186
268
|
|
269
|
+
#
|
270
|
+
# Installs appropriate signal handlers for the Agent and clears the identity
|
271
|
+
# of the monitor.
|
272
|
+
#
|
187
273
|
def reset_environment
|
188
274
|
# swap out our parent's signal handlers
|
189
|
-
install_shutdown_handler
|
190
|
-
|
275
|
+
install_shutdown_handler do
|
276
|
+
finish_code
|
277
|
+
end
|
278
|
+
trap("ALRM") do
|
279
|
+
alert_code
|
280
|
+
end
|
191
281
|
|
192
282
|
# clear the parent's identity
|
193
283
|
IDCard.me = nil
|
194
284
|
end
|
195
285
|
|
286
|
+
# Ensure that the reading end of the pipe is closed.
|
196
287
|
def close_reader
|
197
288
|
@reader.close
|
198
289
|
rescue IOError # already closed
|
@@ -201,16 +292,29 @@ module ScoutAgent
|
|
201
292
|
# it wasn't set so there's nothing to close
|
202
293
|
end
|
203
294
|
|
295
|
+
#
|
296
|
+
# Loads the code for the Agent to be monitored and fetches the object built
|
297
|
+
# by that code.
|
298
|
+
#
|
204
299
|
def load_code
|
205
300
|
require LIB_DIR + "agent"
|
206
301
|
require LIB_DIR + "agent/#{@agent}_agent"
|
207
302
|
@code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
|
208
303
|
end
|
209
304
|
|
305
|
+
#
|
306
|
+
# Ensures that this Agent is authorized to run because a copy is not
|
307
|
+
# currently active.
|
308
|
+
#
|
210
309
|
def authorize_code
|
211
310
|
@code.authorize
|
212
311
|
end
|
213
312
|
|
313
|
+
#
|
314
|
+
# An infinite loop that just writes check-in messages to the monitoring
|
315
|
+
# process. This code will trigger its own shutdown if the parent
|
316
|
+
# disappears (closing the pipe).
|
317
|
+
#
|
214
318
|
def check_in_with_parent
|
215
319
|
@check_in_with_parent_thread = Thread.new do
|
216
320
|
Thread.current.abort_on_exception = true
|
@@ -226,10 +330,15 @@ module ScoutAgent
|
|
226
330
|
end
|
227
331
|
end
|
228
332
|
|
333
|
+
# Invokes the main code of the Agent.
|
229
334
|
def run_code
|
230
335
|
@code.run
|
231
336
|
end
|
232
337
|
|
338
|
+
#
|
339
|
+
# Closes the pipe and invokes the finishing code of the Agent in a separate
|
340
|
+
# Thread.
|
341
|
+
#
|
233
342
|
def finish_code
|
234
343
|
close_writer
|
235
344
|
|
@@ -242,6 +351,7 @@ module ScoutAgent
|
|
242
351
|
end
|
243
352
|
end
|
244
353
|
|
354
|
+
# Invokes the code for the Agent to notice changes from the outside world.
|
245
355
|
def alert_code
|
246
356
|
if @code
|
247
357
|
Thread.new do
|