scout_agent 3.0.6 → 3.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +18 -0
- data/Rakefile +1 -1
- data/lib/scout_agent/agent/communication_agent.rb +5 -1
- data/lib/scout_agent/agent/master_agent.rb +6 -6
- data/lib/scout_agent/agent.rb +26 -5
- data/lib/scout_agent/api.rb +52 -12
- data/lib/scout_agent/assignment/configuration.rb +11 -0
- data/lib/scout_agent/assignment/identify.rb +26 -0
- data/lib/scout_agent/assignment/queue.rb +35 -0
- data/lib/scout_agent/assignment/reset.rb +30 -0
- data/lib/scout_agent/assignment/snapshot.rb +50 -4
- data/lib/scout_agent/assignment/start.rb +65 -2
- data/lib/scout_agent/assignment/status.rb +22 -3
- data/lib/scout_agent/assignment/stop.rb +42 -5
- data/lib/scout_agent/assignment/test.rb +366 -0
- data/lib/scout_agent/assignment/update.rb +20 -0
- data/lib/scout_agent/assignment/upload_log.rb +31 -1
- data/lib/scout_agent/assignment.rb +92 -13
- data/lib/scout_agent/core_extensions.rb +24 -5
- data/lib/scout_agent/dispatcher.rb +45 -1
- data/lib/scout_agent/lifeline.rb +7 -2
- data/lib/scout_agent/mission.rb +31 -11
- data/lib/scout_agent/order/check_in_order.rb +27 -1
- data/lib/scout_agent/order/snapshot_order.rb +16 -0
- data/lib/scout_agent/order.rb +57 -11
- data/lib/scout_agent/plan.rb +1 -1
- data/lib/scout_agent.rb +1 -1
- metadata +13 -5
@@ -3,19 +3,43 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# scout_agent snap [force]
|
10
|
+
#
|
11
|
+
# This command requests that a snapshot be taken of the environment the
|
12
|
+
# agent is running on. Snapshots are a collection of commands sent down
|
13
|
+
# from the Scout server that can be used to measure the current health of
|
14
|
+
# the environment. Their output, exit status, and run time are passed back
|
15
|
+
# up to the server in response to this request.
|
16
|
+
#
|
17
|
+
# Note that this is just a request. It may not be honored if enough time
|
18
|
+
# hasn't passed since the last snapshot. This is to protect your server
|
19
|
+
# from overmuch busy work, but you can choose to override this limitation
|
20
|
+
# with the optional "force" parameter.
|
21
|
+
#
|
6
22
|
class Snapshot < Assignment
|
23
|
+
# Runs the snapshot command.
|
7
24
|
def execute
|
25
|
+
# prepare the log
|
8
26
|
log = ScoutAgent.prepare_wire_tap(:snapshot, :skip_stdout)
|
9
27
|
|
28
|
+
# load the snapshot database
|
10
29
|
unless db = Database.load(:snapshots, log)
|
11
30
|
abort_with_missing_db
|
12
31
|
end
|
13
32
|
|
33
|
+
#
|
34
|
+
# lock on this file to ensure only one process can run a snapshot at a
|
35
|
+
# time
|
36
|
+
#
|
14
37
|
open(__FILE__) do |this_file|
|
15
38
|
unless this_file.flock(File::LOCK_EX | File::LOCK_NB)
|
16
39
|
exit # snapshot in progress
|
17
40
|
end
|
18
41
|
|
42
|
+
# record our status and set removal at_exit()
|
19
43
|
log.info("Building snapshot.")
|
20
44
|
status_database(log)
|
21
45
|
status("Building snapshot", :snapshot)
|
@@ -23,13 +47,16 @@ module ScoutAgent
|
|
23
47
|
clear_status(:snapshot)
|
24
48
|
end
|
25
49
|
|
50
|
+
# reset commands, if requested
|
26
51
|
if Array(other_args).shift == "force"
|
27
52
|
log.info("Clearing command run times to force a full snapshot.")
|
28
53
|
db.reset_all_commands
|
29
54
|
end
|
30
55
|
|
56
|
+
# read current commands
|
31
57
|
commands = db.current_commands
|
32
58
|
|
59
|
+
# bail out if there are no commands to run
|
33
60
|
if commands.empty?
|
34
61
|
if db.have_commands?
|
35
62
|
abort_with_too_recent
|
@@ -38,12 +65,14 @@ module ScoutAgent
|
|
38
65
|
abort_with_no_commands
|
39
66
|
end
|
40
67
|
end
|
41
|
-
|
68
|
+
|
69
|
+
# build snapshot
|
42
70
|
snapshot_started = Time.now
|
43
|
-
commands.each do |command|
|
71
|
+
commands.each do |command| # run each command
|
44
72
|
log.info("Running `#{command[:code]}`.")
|
45
73
|
command_started = Time.now
|
46
74
|
reader, writer = IO.pipe
|
75
|
+
# run the command in a child process
|
47
76
|
run = fork do
|
48
77
|
reader.close
|
49
78
|
STDOUT.reopen(writer)
|
@@ -57,6 +86,7 @@ module ScoutAgent
|
|
57
86
|
exit_status = nil
|
58
87
|
output = nil
|
59
88
|
writer.close
|
89
|
+
# make sure the child process stops in a reasonable time
|
60
90
|
begin
|
61
91
|
Timeout.timeout(command[:timeout]) do
|
62
92
|
output = reader.read
|
@@ -68,6 +98,7 @@ module ScoutAgent
|
|
68
98
|
output = "Error: This command took too long to run"
|
69
99
|
end
|
70
100
|
run_time = Time.now - command_started
|
101
|
+
# record results
|
71
102
|
db.complete_run( command,
|
72
103
|
output,
|
73
104
|
exit_status,
|
@@ -76,26 +107,41 @@ module ScoutAgent
|
|
76
107
|
log.debug( "`#{command[:code]}` exited (#{exit_status}) in " +
|
77
108
|
"#{run_time} seconds." )
|
78
109
|
end
|
79
|
-
|
110
|
+
|
111
|
+
# maintain the snapshots database
|
80
112
|
db.maintain
|
81
113
|
|
82
114
|
log.info("Snapshot complete.")
|
83
|
-
this_file.flock(File::LOCK_UN)
|
115
|
+
this_file.flock(File::LOCK_UN) # release our snapshot lock
|
84
116
|
end
|
85
117
|
end
|
86
118
|
|
119
|
+
#######
|
87
120
|
private
|
121
|
+
#######
|
88
122
|
|
123
|
+
#
|
124
|
+
# Abort with an error message to the user that says we cannot load the
|
125
|
+
# snapshot database.
|
126
|
+
#
|
89
127
|
def abort_with_missing_db
|
90
128
|
warn "Snapshots database could not be loaded."
|
91
129
|
exit 1
|
92
130
|
end
|
93
131
|
|
132
|
+
#
|
133
|
+
# Abort with an error message to the user that says a snapshot was
|
134
|
+
# recently taken and enough time has not yet passed to grab another.
|
135
|
+
#
|
94
136
|
def abort_with_too_recent
|
95
137
|
warn "A snapshot was recently taken."
|
96
138
|
exit 2
|
97
139
|
end
|
98
140
|
|
141
|
+
#
|
142
|
+
# Abort with an error message to the user that says we don't have any
|
143
|
+
# commands from the server yet to run as a snapshot.
|
144
|
+
#
|
99
145
|
def abort_with_no_commands
|
100
146
|
warn "No snapshot commands have been received from the server."
|
101
147
|
exit 3
|
@@ -3,25 +3,45 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# sudo scout_agent start
|
10
|
+
#
|
11
|
+
# This command starts the Scout agent, if it is not already running.
|
12
|
+
#
|
13
|
+
# The agent requires super user privileges to prepare its environment, but
|
14
|
+
# it will relinquish these privileges before it begins normal work.
|
15
|
+
#
|
16
|
+
# The agent daemonizes itself as part of the startup process (unless
|
17
|
+
# configured not to) and thus will detach from your terminal. It will still
|
18
|
+
# be running happily in the background. You can check up on it with the
|
19
|
+
# status command, if you like.
|
20
|
+
#
|
6
21
|
class Start < Assignment
|
7
22
|
choose_user true
|
8
23
|
choose_group true
|
9
24
|
|
25
|
+
# Runs the start command.
|
10
26
|
def execute
|
27
|
+
# build the directory for PID file storage, if needed
|
11
28
|
unless Plan.pid_dir.exist?
|
12
29
|
unless Plan.build_pid_dir(group.gid)
|
13
30
|
abort_with_missing_pid_dir
|
14
31
|
end
|
15
32
|
end
|
16
33
|
|
34
|
+
# switch to the selected user and group
|
17
35
|
unless switch_group_and_user
|
18
36
|
abort_with_wrong_group_or_user
|
19
37
|
end
|
20
38
|
|
39
|
+
# test our ability to reach the server
|
21
40
|
unless test_server_connection(:quiet)
|
22
41
|
abort_with_cannot_connect
|
23
42
|
end
|
24
43
|
|
44
|
+
# make a final check to see if everything looks properly prepared
|
25
45
|
unless Plan.valid? and
|
26
46
|
Plan.pid_dir.exist? and
|
27
47
|
Plan.pid_dir.readable? and
|
@@ -29,6 +49,7 @@ module ScoutAgent
|
|
29
49
|
abort_with_missing_resources
|
30
50
|
end
|
31
51
|
|
52
|
+
# make sure we are the only process running and daemonize, if configured
|
32
53
|
running_mode = Plan.run_as_daemon? ? " as a daemon" : ""
|
33
54
|
puts "Starting #{ScoutAgent.proper_agent_name}#{running_mode}..."
|
34
55
|
card = IDCard.new(:lifeline)
|
@@ -41,12 +62,15 @@ module ScoutAgent
|
|
41
62
|
end
|
42
63
|
end
|
43
64
|
|
65
|
+
# prepare the log
|
44
66
|
log = ScoutAgent.prepare_wire_tap(:lifeline)
|
45
67
|
log.info("Loading monitors.")
|
46
68
|
|
69
|
+
# load configured agents
|
47
70
|
agents = %w[master]
|
48
71
|
agents << "communication" if Plan.enable_xmpp?
|
49
72
|
|
73
|
+
# start each agent through a Lifeline monitor
|
50
74
|
lifelines = agents.map { |agent| Lifeline.new(agent, log) }
|
51
75
|
%w[TERM INT].each do |signal|
|
52
76
|
trap(signal) do
|
@@ -56,13 +80,24 @@ module ScoutAgent
|
|
56
80
|
end
|
57
81
|
end
|
58
82
|
end
|
59
|
-
lifelines.each
|
83
|
+
lifelines.each do |line|
|
84
|
+
line.launch_and_monitor
|
85
|
+
end
|
60
86
|
|
61
|
-
|
87
|
+
# wait for all monitors to finish
|
88
|
+
lifelines.each do |line|
|
89
|
+
line.join
|
90
|
+
end
|
62
91
|
end
|
63
92
|
|
93
|
+
#######
|
64
94
|
private
|
95
|
+
#######
|
65
96
|
|
97
|
+
#
|
98
|
+
# A typical Unix daemonization process. Returns +true+ if successful,
|
99
|
+
# +false+ otherwise.
|
100
|
+
#
|
66
101
|
def daemonize
|
67
102
|
exit!(0) if fork
|
68
103
|
Process.setsid
|
@@ -77,6 +112,10 @@ module ScoutAgent
|
|
77
112
|
false
|
78
113
|
end
|
79
114
|
|
115
|
+
#
|
116
|
+
# Switches the actual and effective user and group of this process as
|
117
|
+
# configured.
|
118
|
+
#
|
80
119
|
def switch_group_and_user
|
81
120
|
if Process.euid != user.uid or Process.egid != group.egid
|
82
121
|
Process.initgroups(user.name, group.gid) # prepare groups
|
@@ -88,6 +127,10 @@ module ScoutAgent
|
|
88
127
|
false
|
89
128
|
end
|
90
129
|
|
130
|
+
#
|
131
|
+
# Abort with an error message to the user that says we cannot prepare PID
|
132
|
+
# file storage because we don't have the privileges.
|
133
|
+
#
|
91
134
|
def abort_with_missing_pid_dir
|
92
135
|
abort <<-END_PID_DIR.trim
|
93
136
|
Unable to prepare PID file storage. Please start the daemon
|
@@ -99,6 +142,10 @@ module ScoutAgent
|
|
99
142
|
END_PID_DIR
|
100
143
|
end
|
101
144
|
|
145
|
+
#
|
146
|
+
# Abort with an error message to the user that warns of our inability to
|
147
|
+
# reach the server due to connectivity issues or a bad configuration.
|
148
|
+
#
|
102
149
|
def abort_with_cannot_connect
|
103
150
|
abort <<-END_CANNOT_CONNECT.trim
|
104
151
|
Unable to load a plan from the server at:
|
@@ -114,6 +161,10 @@ module ScoutAgent
|
|
114
161
|
END_CANNOT_CONNECT
|
115
162
|
end
|
116
163
|
|
164
|
+
#
|
165
|
+
# Abort with an error message to the user that says we were unable to
|
166
|
+
# switch to the configured user or group.
|
167
|
+
#
|
117
168
|
def abort_with_wrong_group_or_user
|
118
169
|
abort <<-END_GROUP_USER.trim
|
119
170
|
Unable to switch to the selected user and group. Please
|
@@ -125,6 +176,10 @@ module ScoutAgent
|
|
125
176
|
END_GROUP_USER
|
126
177
|
end
|
127
178
|
|
179
|
+
#
|
180
|
+
# Abort with an error message to the user that says we were unable to load
|
181
|
+
# all expected resources due to a damaged configuration.
|
182
|
+
#
|
128
183
|
def abort_with_missing_resources
|
129
184
|
abort <<-END_RESOURCES.trim
|
130
185
|
Some resources needed to complete the startup process are
|
@@ -141,10 +196,18 @@ module ScoutAgent
|
|
141
196
|
END_RESOURCES
|
142
197
|
end
|
143
198
|
|
199
|
+
#
|
200
|
+
# Abort with an error message to the user that informs of an agent already
|
201
|
+
# running on this system.
|
202
|
+
#
|
144
203
|
def abort_with_other_process_running(pid)
|
145
204
|
abort "The daemon is already running with the process ID of #{pid}."
|
146
205
|
end
|
147
206
|
|
207
|
+
#
|
208
|
+
# Abort with an error message to the user that says we are unable to
|
209
|
+
# daemonize this process.
|
210
|
+
#
|
148
211
|
def abort_with_failure_to_daemonize
|
149
212
|
abort "Unable to daemonize this process."
|
150
213
|
end
|
@@ -3,7 +3,18 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# scout_agent status
|
10
|
+
#
|
11
|
+
# This command dumps the status database to <tt>$stdout</tt>. This data
|
12
|
+
# will show what processes are running, what they are currently working on,
|
13
|
+
# and when that status was last updated. It can be useful in tracking down
|
14
|
+
# issues with the agent to see where it went wrong.
|
15
|
+
#
|
6
16
|
class Status < Assignment
|
17
|
+
# Runs the status command.
|
7
18
|
def execute
|
8
19
|
unless db = status_database
|
9
20
|
abort_with_missing_db
|
@@ -25,7 +36,8 @@ module ScoutAgent
|
|
25
36
|
%w[Last\ Updated last_updated_at]
|
26
37
|
].map { |title, data| [title] + statuses.map { |row| row[data] } }
|
27
38
|
sizes = columns.map { |column|
|
28
|
-
column.map { |field| field.to_s.size }.max
|
39
|
+
column.map { |field| field.to_s.size }.max
|
40
|
+
}
|
29
41
|
format = sizes.map { |size| "%-#{size}s" }.join(" ")
|
30
42
|
puts format % columns.map { |column| column.first }
|
31
43
|
puts format % sizes.map { |size| "-" * size }
|
@@ -36,9 +48,16 @@ module ScoutAgent
|
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
51
|
+
#######
|
52
|
+
private
|
53
|
+
#######
|
54
|
+
|
55
|
+
#
|
56
|
+
# Abort with an error message to the user that says we cannot load the
|
57
|
+
# status database.
|
58
|
+
#
|
39
59
|
def abort_with_missing_db
|
40
|
-
|
41
|
-
exit 1
|
60
|
+
abort "Statuses database could not be loaded."
|
42
61
|
end
|
43
62
|
end
|
44
63
|
end
|
@@ -3,7 +3,17 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# sudo scout_agent stop
|
10
|
+
#
|
11
|
+
# This command halts the currently running agent. The agent is asked nicely
|
12
|
+
# to stop. If it doesn't respond within a few seconds though, it is forced
|
13
|
+
# to terminate.
|
14
|
+
#
|
6
15
|
class Stop < Assignment
|
16
|
+
# Runs the stop command.
|
7
17
|
def execute
|
8
18
|
@agent = IDCard.new(:lifeline)
|
9
19
|
if @agent.pid_file.exist?
|
@@ -22,24 +32,47 @@ module ScoutAgent
|
|
22
32
|
end
|
23
33
|
end
|
24
34
|
|
35
|
+
#######
|
25
36
|
private
|
37
|
+
#######
|
26
38
|
|
39
|
+
#
|
40
|
+
# Sends +signal_name+ (which is expected to be some kind of stop request)
|
41
|
+
# to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
|
42
|
+
# of +wait_delay+ seconds checking between waits to see if the agent has
|
43
|
+
# complied. It returns when the agent has exited or the total wait period
|
44
|
+
# has expired.
|
45
|
+
#
|
27
46
|
def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
abort_with_no_permission
|
32
|
-
end
|
47
|
+
# signal the main process
|
48
|
+
@agent.signal(signal_name)
|
49
|
+
# wait for it to stop
|
33
50
|
wait_count.times do
|
34
51
|
sleep wait_delay
|
35
52
|
break unless @agent.pid_file.exist?
|
36
53
|
end
|
54
|
+
# signal other stray processes
|
55
|
+
Plan.pid_dir.each_entry do |process|
|
56
|
+
if process.to_s =~ /(\w+)\.pid\z/
|
57
|
+
IDCard.new($1).signal(signal_name)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
rescue Errno::EPERM # we don't have permission
|
61
|
+
abort_with_no_permission
|
37
62
|
end
|
38
63
|
|
64
|
+
#
|
65
|
+
# Abort with an error message to the user that says that the agent isn't
|
66
|
+
# currently running.
|
67
|
+
#
|
39
68
|
def abort_with_not_running_notice
|
40
69
|
puts "#{ScoutAgent.proper_agent_name} is not currently running."
|
41
70
|
end
|
42
71
|
|
72
|
+
#
|
73
|
+
# Abort with an error message to the user that says we don't have enough
|
74
|
+
# permission to stop the agent due to how it was started.
|
75
|
+
#
|
43
76
|
def abort_with_no_permission
|
44
77
|
abort <<-END_PERMISSION.trim
|
45
78
|
Unable to signal the daemon. Please rerun this command with
|
@@ -50,6 +83,10 @@ module ScoutAgent
|
|
50
83
|
END_PERMISSION
|
51
84
|
end
|
52
85
|
|
86
|
+
#
|
87
|
+
# Abort with an error message to the user that says we were unable to
|
88
|
+
# stop the daemon.
|
89
|
+
#
|
53
90
|
def abort_with_failed_to_stop
|
54
91
|
abort <<-END_FAILED.trim
|
55
92
|
Unable to stop the daemon. You may need to use the PID files
|