scout_agent 3.0.7 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/README +44 -2
- data/Rakefile +1 -1
- data/TODO +7 -1
- data/lib/scout_agent.rb +26 -1
- data/lib/scout_agent/agent/communication_agent.rb +75 -2
- data/lib/scout_agent/agent/master_agent.rb +132 -31
- data/lib/scout_agent/assignment/queue.rb +3 -0
- data/lib/scout_agent/assignment/snapshot.rb +4 -1
- data/lib/scout_agent/assignment/start.rb +3 -4
- data/lib/scout_agent/assignment/stop.rb +64 -30
- data/lib/scout_agent/id_card.rb +23 -12
- data/lib/scout_agent/lifeline.rb +129 -19
- data/lib/scout_agent/mission.rb +221 -68
- data/lib/scout_agent/order.rb +2 -2
- data/lib/scout_agent/plan.rb +38 -18
- metadata +2 -2
@@ -108,8 +108,11 @@ module ScoutAgent
|
|
108
108
|
"#{run_time} seconds." )
|
109
109
|
end
|
110
110
|
|
111
|
-
# maintain the
|
111
|
+
# maintain the databases
|
112
112
|
db.maintain
|
113
|
+
status_database.maintain
|
114
|
+
# clean out old logs
|
115
|
+
ScoutAgent.remove_old_log_files(log)
|
113
116
|
|
114
117
|
log.info("Snapshot complete.")
|
115
118
|
this_file.flock(File::LOCK_UN) # release our snapshot lock
|
@@ -74,10 +74,7 @@ module ScoutAgent
|
|
74
74
|
lifelines = agents.map { |agent| Lifeline.new(agent, log) }
|
75
75
|
%w[TERM INT].each do |signal|
|
76
76
|
trap(signal) do
|
77
|
-
|
78
|
-
lifelines.each { |line| line.terminate }
|
79
|
-
Process.waitall
|
80
|
-
end
|
77
|
+
lifelines.each { |line| line.terminate }
|
81
78
|
end
|
82
79
|
end
|
83
80
|
lifelines.each do |line|
|
@@ -88,6 +85,8 @@ module ScoutAgent
|
|
88
85
|
lifelines.each do |line|
|
89
86
|
line.join
|
90
87
|
end
|
88
|
+
# wait for all children to obey our stop command
|
89
|
+
Process.waitall
|
91
90
|
end
|
92
91
|
|
93
92
|
#######
|
@@ -13,20 +13,29 @@ module ScoutAgent
|
|
13
13
|
# to terminate.
|
14
14
|
#
|
15
15
|
class Stop < Assignment
|
16
|
+
#
|
17
|
+
# The number of periods between stop checks after a process has been
|
18
|
+
# signaled. Checks will be made until it's obvious the process obeyed the
|
19
|
+
# request or until this number of checks has been made.
|
20
|
+
#
|
21
|
+
WAIT_COUNT = 10
|
22
|
+
#
|
23
|
+
# The pause in seconds between stop checks after a process has been
|
24
|
+
# signaled.
|
25
|
+
#
|
26
|
+
WAIT_DELAY = 0.5
|
27
|
+
|
16
28
|
# Runs the stop command.
|
17
29
|
def execute
|
18
30
|
@agent = IDCard.new(:lifeline)
|
19
31
|
if @agent.pid_file.exist?
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
if @agent.pid_file.exist?
|
26
|
-
abort_with_failed_to_stop
|
27
|
-
end
|
32
|
+
signal_all("TERM")
|
33
|
+
if Plan.pid_dir.entries.any? { |pid| pid.to_s =~ /\w+\.pid\z/ }
|
34
|
+
puts "\nTERM signals were ignored, sending KILL signals.\n\n"
|
35
|
+
signal_all("KILL")
|
36
|
+
abort_with_failed_to_stop
|
28
37
|
end
|
29
|
-
puts "
|
38
|
+
puts "All processes stopped."
|
30
39
|
else
|
31
40
|
abort_with_not_running_notice
|
32
41
|
end
|
@@ -37,28 +46,49 @@ module ScoutAgent
|
|
37
46
|
#######
|
38
47
|
|
39
48
|
#
|
40
|
-
# Sends +signal_name+ (
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
49
|
+
# Sends +signal_name+ (assumed to be a stop request) to all subprocesses
|
50
|
+
# of the agent. The first process signaled is the lifeline process and it
|
51
|
+
# is given a few seconds to shut everything down smoothly as it is
|
52
|
+
# supposed to do. If that fails, the stray processes will be sent
|
53
|
+
# +signal_name+ directly.
|
45
54
|
#
|
46
|
-
def
|
47
|
-
#
|
48
|
-
@agent
|
49
|
-
#
|
50
|
-
wait_count.times do
|
51
|
-
sleep wait_delay
|
52
|
-
break unless @agent.pid_file.exist?
|
53
|
-
end
|
54
|
-
# signal other stray processes
|
55
|
+
def signal_all(signal_name)
|
56
|
+
# start with the Lifeline process as that should properly stop everyone
|
57
|
+
signal_and_wait(@agent, signal_name)
|
58
|
+
# signal any other stray processes
|
55
59
|
Plan.pid_dir.each_entry do |process|
|
56
60
|
if process.to_s =~ /(\w+)\.pid\z/
|
57
|
-
IDCard.new($1)
|
61
|
+
signal_and_wait(IDCard.new($1), signal_name) unless $1 == "lifeline"
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
#
|
67
|
+
# Sends +signal_name+ (which is expected to be some kind of stop request)
|
68
|
+
# to the +id_card+. This method will then wait +WAIT_COUNT+ periods of
|
69
|
+
# +WAIT_DELAY+ seconds checking between waits to see if the agent has
|
70
|
+
# complied. It returns when the signaled process has exited or the total
|
71
|
+
# wait period has expired. The wait period is skipped for the +KILL+
|
72
|
+
# signal, since the process is not allowed to respond.
|
73
|
+
#
|
74
|
+
def signal_and_wait(id_card, signal_name)
|
75
|
+
puts "Sending #{signal_name} signal to the agent's " +
|
76
|
+
"#{id_card.process_name} process."
|
77
|
+
# signal the main process
|
78
|
+
begin
|
79
|
+
id_card.signal(signal_name)
|
80
|
+
rescue Errno::ECHILD, Errno::ESRCH # no such process
|
81
|
+
# do nothing: it stopped
|
82
|
+
rescue Errno::EPERM # we don't have permission
|
83
|
+
abort_with_no_permission
|
84
|
+
end
|
85
|
+
unless signal_name == "KILL" # process cannot respond, so don't wait
|
86
|
+
# wait for it to stop
|
87
|
+
WAIT_COUNT.times do
|
88
|
+
sleep WAIT_DELAY
|
89
|
+
break unless id_card.pid_file.exist?
|
58
90
|
end
|
59
91
|
end
|
60
|
-
rescue Errno::EPERM # we don't have permission
|
61
|
-
abort_with_no_permission
|
62
92
|
end
|
63
93
|
|
64
94
|
#
|
@@ -75,6 +105,7 @@ module ScoutAgent
|
|
75
105
|
#
|
76
106
|
def abort_with_no_permission
|
77
107
|
abort <<-END_PERMISSION.trim
|
108
|
+
|
78
109
|
Unable to signal the daemon. Please rerun this command with
|
79
110
|
super user privileges:
|
80
111
|
|
@@ -84,13 +115,16 @@ module ScoutAgent
|
|
84
115
|
end
|
85
116
|
|
86
117
|
#
|
87
|
-
# Abort with an error message to the user that says we
|
88
|
-
#
|
118
|
+
# Abort with an error message to the user that says we killed the agent
|
119
|
+
# but there are some stray files hanging around.
|
89
120
|
#
|
90
121
|
def abort_with_failed_to_stop
|
91
122
|
abort <<-END_FAILED.trim
|
92
|
-
|
93
|
-
|
123
|
+
|
124
|
+
KILL signals were sent to all active processes and they
|
125
|
+
should be stopped now. You may wish to check the PID
|
126
|
+
files in #{Plan.pid_dir} to be sure. The agent should
|
127
|
+
clean up these old files as it relaunches.
|
94
128
|
END_FAILED
|
95
129
|
end
|
96
130
|
end
|
data/lib/scout_agent/id_card.rb
CHANGED
@@ -7,15 +7,26 @@ module ScoutAgent
|
|
7
7
|
# execution and to signal other processes.
|
8
8
|
#
|
9
9
|
class IDCard
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
10
|
+
#
|
11
|
+
# This global attribute should contain the IDCard for this process. It is
|
12
|
+
# set during a successful authorization.
|
13
|
+
#
|
14
|
+
# <b>Warning:</b> Be sure to clear this attribute immediately after a
|
15
|
+
# fork(), with a call to me=(), so you don't keep the parent's identity.
|
16
|
+
#
|
17
|
+
def self.me
|
18
|
+
@me ||= nil
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# A setter for the identity of this process. This is set automatically as
|
23
|
+
# part of an authorization.
|
24
|
+
#
|
25
|
+
# <b>Warning:</b> Be sure to clear this attribute immediately after a
|
26
|
+
# fork() so you don't keep the parent's identity.
|
27
|
+
#
|
28
|
+
def self.me=(id_card)
|
29
|
+
@me = id_card
|
19
30
|
end
|
20
31
|
|
21
32
|
#
|
@@ -76,10 +87,10 @@ module ScoutAgent
|
|
76
87
|
# <tt>IDCard::me()</tt> has been updated and an exit handle has been
|
77
88
|
# installed to revoke() this claim as the process ends.
|
78
89
|
#
|
79
|
-
def authorize
|
90
|
+
def authorize(&block)
|
80
91
|
File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |pid|
|
81
92
|
pid.flock(File::LOCK_EX)
|
82
|
-
if
|
93
|
+
if block.nil? or block.call # allows for daemonization
|
83
94
|
pid.puts Process.pid
|
84
95
|
else
|
85
96
|
pid.flock(File::LOCK_UN)
|
@@ -105,7 +116,7 @@ module ScoutAgent
|
|
105
116
|
# stale PID file found, clearing it and reloading
|
106
117
|
if revoke
|
107
118
|
pid.flock(File::LOCK_UN) # release the lock before we recurse
|
108
|
-
return authorize
|
119
|
+
return authorize(&block) # try again
|
109
120
|
end
|
110
121
|
rescue Errno::EACCES # don't have permission
|
111
122
|
# nothing we can do so give up
|
data/lib/scout_agent/lifeline.rb
CHANGED
@@ -2,16 +2,46 @@
|
|
2
2
|
# encoding: UTF-8
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
|
+
#
|
6
|
+
# This class is a monitor for an Agent subprocess of the platform. It
|
7
|
+
# launches the Agent code and makes sure it continues to check-in at regular
|
8
|
+
# intervals, restarting the subprocess when it fails to do so.
|
9
|
+
#
|
5
10
|
class Lifeline
|
11
|
+
#
|
12
|
+
# The number of seconds allowed to pass before the Agent subprocess is
|
13
|
+
# considered unresponsive.
|
14
|
+
#
|
6
15
|
NO_CONTACT_TIMEOUT = 5
|
7
|
-
|
16
|
+
#
|
17
|
+
# The frequency with which the subprocess is expected to check-in. This is
|
18
|
+
# purposely set to a little under a second to give one more check-in
|
19
|
+
# possibility before the <tt>NO_CONTACT_TIMEOUT</tt> is reached.
|
20
|
+
#
|
21
|
+
CHECK_IN_FREQUENCY = 0.99
|
22
|
+
#
|
23
|
+
# The number of seconds the monitor will wait for a process to exit cleanly
|
24
|
+
# before forcing a stop.
|
25
|
+
#
|
8
26
|
TERM_TO_KILL_PAUSE = 1
|
27
|
+
#
|
28
|
+
# The sequence of seconds this monitor will wait between restarts of the
|
29
|
+
# subprocess. The initial values are short, to try and get running again as
|
30
|
+
# soon as possible. However, this timeout grows larger up to a point to
|
31
|
+
# reduce strain on a server experiencing long term problems. The sequence
|
32
|
+
# will reset after a successful relaunch that runs for at least as long as
|
33
|
+
# the next number in the sequence (or the max).
|
34
|
+
#
|
9
35
|
RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]
|
10
36
|
|
11
37
|
#################
|
12
38
|
### Interface ###
|
13
39
|
#################
|
14
40
|
|
41
|
+
#
|
42
|
+
# Prepares a monitor for the code specified by +agent+. You may also set
|
43
|
+
# the log() that messages will be appended to.
|
44
|
+
#
|
15
45
|
def initialize(agent, log = WireTap.new(nil))
|
16
46
|
@agent = agent
|
17
47
|
@log = log
|
@@ -20,6 +50,7 @@ module ScoutAgent
|
|
20
50
|
@reader = nil
|
21
51
|
@writer = nil
|
22
52
|
@launch_and_monitor_thread = nil
|
53
|
+
@termination_thread = nil
|
23
54
|
@check_in_with_parent_thread = nil
|
24
55
|
@code = nil
|
25
56
|
@last_launch = nil
|
@@ -32,8 +63,13 @@ module ScoutAgent
|
|
32
63
|
|
33
64
|
include Tracked
|
34
65
|
|
66
|
+
# The log file this monitor writes tracking information to.
|
35
67
|
attr_reader :log
|
36
68
|
|
69
|
+
#
|
70
|
+
# This method outlines the process used to monitor an Agent. It is roughly:
|
71
|
+
# launch, monitor, kill as needed, and restart the process.
|
72
|
+
#
|
37
73
|
def launch_and_monitor
|
38
74
|
@launch_and_monitor_thread = Thread.new do
|
39
75
|
Thread.current.abort_on_exception = true
|
@@ -48,22 +84,38 @@ module ScoutAgent
|
|
48
84
|
end
|
49
85
|
end
|
50
86
|
|
87
|
+
#
|
88
|
+
# Begins a termination of the Agent subprocess in a separate Thread. This
|
89
|
+
# monitor's join() method will also wait on this termination Thread to
|
90
|
+
# ensure everything gets the order to shutdown before we exit.
|
91
|
+
#
|
51
92
|
def terminate
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
93
|
+
@termination_thread = Thread.new do
|
94
|
+
if Process.pid == @parent_pid
|
95
|
+
# stop monitoring
|
96
|
+
log.info("Stopping the monitoring for '#{@agent}'.")
|
97
|
+
@launch_and_monitor_thread.exit if @launch_and_monitor_thread
|
98
|
+
# ask child process to exit
|
99
|
+
log.info("Asking '#{@agent}' to stop.")
|
100
|
+
IDCard.new(@agent).signal("TERM")
|
101
|
+
end
|
59
102
|
end
|
60
103
|
rescue Errno::ESRCH # no such process
|
61
104
|
# it already exited, so we are fine
|
62
105
|
end
|
63
106
|
|
107
|
+
#
|
108
|
+
# Waits for the monitor Thread to be stopped by a natural termination before
|
109
|
+
# returning. If terminate() is called to start the shutdown, this method
|
110
|
+
# will also wait on the Thread spawned by that method to ensure everything
|
111
|
+
# gets the signal to stop.
|
112
|
+
#
|
64
113
|
def join
|
65
114
|
if Process.pid == @parent_pid and @launch_and_monitor_thread
|
66
|
-
@launch_and_monitor_thread.join
|
115
|
+
@launch_and_monitor_thread.join # wait on the monitor to stop
|
116
|
+
if @termination_thread
|
117
|
+
@termination_thread.join # wait on us to stop the subprocess
|
118
|
+
end
|
67
119
|
end
|
68
120
|
end
|
69
121
|
|
@@ -71,10 +123,15 @@ module ScoutAgent
|
|
71
123
|
private
|
72
124
|
#######
|
73
125
|
|
74
|
-
|
75
|
-
###
|
76
|
-
|
126
|
+
###############
|
127
|
+
### Monitor ###
|
128
|
+
###############
|
77
129
|
|
130
|
+
#
|
131
|
+
# This method just rests for the proper amount of time between launches to
|
132
|
+
# ensure we're not overworking the server due to continuing issues. See
|
133
|
+
# +RELAUNCH_FREQUENCIES+ for details.
|
134
|
+
#
|
78
135
|
def wait_for_launch
|
79
136
|
if @last_launch
|
80
137
|
seconds_ran = Time.now - @last_launch
|
@@ -95,11 +152,19 @@ module ScoutAgent
|
|
95
152
|
end
|
96
153
|
end
|
97
154
|
end
|
98
|
-
|
155
|
+
|
156
|
+
#
|
157
|
+
# Creates a two-ended pipe for one way communication from the Agent checking
|
158
|
+
# in with the monitor.
|
159
|
+
#
|
99
160
|
def prepare_pipe
|
100
161
|
@reader, @writer = IO.pipe
|
101
162
|
end
|
102
163
|
|
164
|
+
#
|
165
|
+
# This method fork()'s the subprocess and outlines the work done there:
|
166
|
+
# loading, starting the check-in Thread, and running the Agent code.
|
167
|
+
#
|
103
168
|
def launch_child
|
104
169
|
log.info("Launching '#{@agent}'.")
|
105
170
|
status(@agent)
|
@@ -113,7 +178,8 @@ module ScoutAgent
|
|
113
178
|
run_code
|
114
179
|
end
|
115
180
|
end
|
116
|
-
|
181
|
+
|
182
|
+
# Ensure that the writing end of the pipe is closed.
|
117
183
|
def close_writer
|
118
184
|
@writer.close
|
119
185
|
rescue IOError # already closed
|
@@ -122,24 +188,36 @@ module ScoutAgent
|
|
122
188
|
# it wasn't set so there's nothing to close
|
123
189
|
end
|
124
190
|
|
191
|
+
#
|
192
|
+
# An infinite loop that just reads check-in messages from the Agent. This
|
193
|
+
# method will return when the Agent fails to report within
|
194
|
+
# +NO_CONTACT_TIMEOUT+.
|
195
|
+
#
|
125
196
|
def monitor_child
|
126
197
|
loop do
|
127
198
|
check_in = nil
|
128
199
|
begin
|
129
|
-
Timeout.timeout(NO_CONTACT_TIMEOUT)
|
200
|
+
Timeout.timeout(NO_CONTACT_TIMEOUT) do
|
201
|
+
check_in = @reader.gets
|
202
|
+
end
|
130
203
|
log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
|
131
204
|
rescue Timeout::Error
|
132
205
|
# check_in will stay nil
|
133
206
|
log.error("'#{@agent}' failed to check-in in time.")
|
134
207
|
end
|
135
208
|
unless check_in.to_s =~
|
136
|
-
/\A#{@child_pid}
|
209
|
+
/\A#{@child_pid}:\s*\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\Z/
|
137
210
|
log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
|
138
211
|
break
|
139
212
|
end
|
140
213
|
end
|
141
214
|
end
|
142
215
|
|
216
|
+
#
|
217
|
+
# This method is called after a monitor cycle fails, so it needs to ensure
|
218
|
+
# the Agent gets shutdown. It will also stop a running plugin, if found,
|
219
|
+
# for the master agent only.
|
220
|
+
#
|
143
221
|
def restart_child
|
144
222
|
log.info("Stopping '#{@agent}'.")
|
145
223
|
status(@agent, :restarting)
|
@@ -152,6 +230,10 @@ module ScoutAgent
|
|
152
230
|
end
|
153
231
|
end
|
154
232
|
|
233
|
+
#
|
234
|
+
# This method overrides the base status() setter from Tracked, to add a way
|
235
|
+
# to combine messages when restarting multiple processes.
|
236
|
+
#
|
155
237
|
def status(process, restarting = false)
|
156
238
|
if db = status_database
|
157
239
|
db.write_to_sqlite do |sqlite|
|
@@ -181,18 +263,27 @@ module ScoutAgent
|
|
181
263
|
end
|
182
264
|
|
183
265
|
#############
|
184
|
-
###
|
266
|
+
### Agent ###
|
185
267
|
#############
|
186
268
|
|
269
|
+
#
|
270
|
+
# Installs appropriate signal handlers for the Agent and clears the identity
|
271
|
+
# of the monitor.
|
272
|
+
#
|
187
273
|
def reset_environment
|
188
274
|
# swap out our parent's signal handlers
|
189
|
-
install_shutdown_handler
|
190
|
-
|
275
|
+
install_shutdown_handler do
|
276
|
+
finish_code
|
277
|
+
end
|
278
|
+
trap("ALRM") do
|
279
|
+
alert_code
|
280
|
+
end
|
191
281
|
|
192
282
|
# clear the parent's identity
|
193
283
|
IDCard.me = nil
|
194
284
|
end
|
195
285
|
|
286
|
+
# Ensure that the reading end of the pipe is closed.
|
196
287
|
def close_reader
|
197
288
|
@reader.close
|
198
289
|
rescue IOError # already closed
|
@@ -201,16 +292,29 @@ module ScoutAgent
|
|
201
292
|
# it wasn't set so there's nothing to close
|
202
293
|
end
|
203
294
|
|
295
|
+
#
|
296
|
+
# Loads the code for the Agent to be monitored and fetches the object built
|
297
|
+
# by that code.
|
298
|
+
#
|
204
299
|
def load_code
|
205
300
|
require LIB_DIR + "agent"
|
206
301
|
require LIB_DIR + "agent/#{@agent}_agent"
|
207
302
|
@code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
|
208
303
|
end
|
209
304
|
|
305
|
+
#
|
306
|
+
# Ensures that this Agent is authorized to run because a copy is not
|
307
|
+
# currently active.
|
308
|
+
#
|
210
309
|
def authorize_code
|
211
310
|
@code.authorize
|
212
311
|
end
|
213
312
|
|
313
|
+
#
|
314
|
+
# An infinite loop that just writes check-in messages to the monitoring
|
315
|
+
# process. This code will trigger its own shutdown if the parent
|
316
|
+
# disappears (closing the pipe).
|
317
|
+
#
|
214
318
|
def check_in_with_parent
|
215
319
|
@check_in_with_parent_thread = Thread.new do
|
216
320
|
Thread.current.abort_on_exception = true
|
@@ -226,10 +330,15 @@ module ScoutAgent
|
|
226
330
|
end
|
227
331
|
end
|
228
332
|
|
333
|
+
# Invokes the main code of the Agent.
|
229
334
|
def run_code
|
230
335
|
@code.run
|
231
336
|
end
|
232
337
|
|
338
|
+
#
|
339
|
+
# Closes the pipe and invokes the finishing code of the Agent in a separate
|
340
|
+
# Thread.
|
341
|
+
#
|
233
342
|
def finish_code
|
234
343
|
close_writer
|
235
344
|
|
@@ -242,6 +351,7 @@ module ScoutAgent
|
|
242
351
|
end
|
243
352
|
end
|
244
353
|
|
354
|
+
# Invokes the code for the Agent to notice changes from the outside world.
|
245
355
|
def alert_code
|
246
356
|
if @code
|
247
357
|
Thread.new do
|