scout_agent 3.0.6 → 3.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +18 -0
- data/Rakefile +1 -1
- data/lib/scout_agent/agent/communication_agent.rb +5 -1
- data/lib/scout_agent/agent/master_agent.rb +6 -6
- data/lib/scout_agent/agent.rb +26 -5
- data/lib/scout_agent/api.rb +52 -12
- data/lib/scout_agent/assignment/configuration.rb +11 -0
- data/lib/scout_agent/assignment/identify.rb +26 -0
- data/lib/scout_agent/assignment/queue.rb +35 -0
- data/lib/scout_agent/assignment/reset.rb +30 -0
- data/lib/scout_agent/assignment/snapshot.rb +50 -4
- data/lib/scout_agent/assignment/start.rb +65 -2
- data/lib/scout_agent/assignment/status.rb +22 -3
- data/lib/scout_agent/assignment/stop.rb +42 -5
- data/lib/scout_agent/assignment/test.rb +366 -0
- data/lib/scout_agent/assignment/update.rb +20 -0
- data/lib/scout_agent/assignment/upload_log.rb +31 -1
- data/lib/scout_agent/assignment.rb +92 -13
- data/lib/scout_agent/core_extensions.rb +24 -5
- data/lib/scout_agent/dispatcher.rb +45 -1
- data/lib/scout_agent/lifeline.rb +7 -2
- data/lib/scout_agent/mission.rb +31 -11
- data/lib/scout_agent/order/check_in_order.rb +27 -1
- data/lib/scout_agent/order/snapshot_order.rb +16 -0
- data/lib/scout_agent/order.rb +57 -11
- data/lib/scout_agent/plan.rb +1 -1
- data/lib/scout_agent.rb +1 -1
- metadata +13 -5
@@ -3,19 +3,43 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# scout_agent snap [force]
|
10
|
+
#
|
11
|
+
# This command requests that a snapshot be taken of the environment the
|
12
|
+
# agent is running on. Snapshots are a collection of commands sent down
|
13
|
+
# from the Scout server that can be used to measure the current health of
|
14
|
+
# the environment. Their output, exit status, and run time are passed back
|
15
|
+
# up to the server in response to this request.
|
16
|
+
#
|
17
|
+
# Note that this is just a request. It may not be honored if enough time
|
18
|
+
# hasn't passed since the last snapshot. This is to protect your server
|
19
|
+
# from overmuch busy work, but you can choose to override this limitation
|
20
|
+
# with the optional "force" parameter.
|
21
|
+
#
|
6
22
|
class Snapshot < Assignment
|
23
|
+
# Runs the snapshot command.
|
7
24
|
def execute
|
25
|
+
# prepare the log
|
8
26
|
log = ScoutAgent.prepare_wire_tap(:snapshot, :skip_stdout)
|
9
27
|
|
28
|
+
# load the snapshot database
|
10
29
|
unless db = Database.load(:snapshots, log)
|
11
30
|
abort_with_missing_db
|
12
31
|
end
|
13
32
|
|
33
|
+
#
|
34
|
+
# lock on this file to ensure only one process can run a snapshot at a
|
35
|
+
# time
|
36
|
+
#
|
14
37
|
open(__FILE__) do |this_file|
|
15
38
|
unless this_file.flock(File::LOCK_EX | File::LOCK_NB)
|
16
39
|
exit # snapshot in progress
|
17
40
|
end
|
18
41
|
|
42
|
+
# record our status and set removal at_exit()
|
19
43
|
log.info("Building snapshot.")
|
20
44
|
status_database(log)
|
21
45
|
status("Building snapshot", :snapshot)
|
@@ -23,13 +47,16 @@ module ScoutAgent
|
|
23
47
|
clear_status(:snapshot)
|
24
48
|
end
|
25
49
|
|
50
|
+
# reset commands, if requested
|
26
51
|
if Array(other_args).shift == "force"
|
27
52
|
log.info("Clearing command run times to force a full snapshot.")
|
28
53
|
db.reset_all_commands
|
29
54
|
end
|
30
55
|
|
56
|
+
# read current commands
|
31
57
|
commands = db.current_commands
|
32
58
|
|
59
|
+
# bail out if there are no commands to run
|
33
60
|
if commands.empty?
|
34
61
|
if db.have_commands?
|
35
62
|
abort_with_too_recent
|
@@ -38,12 +65,14 @@ module ScoutAgent
|
|
38
65
|
abort_with_no_commands
|
39
66
|
end
|
40
67
|
end
|
41
|
-
|
68
|
+
|
69
|
+
# build snapshot
|
42
70
|
snapshot_started = Time.now
|
43
|
-
commands.each do |command|
|
71
|
+
commands.each do |command| # run each command
|
44
72
|
log.info("Running `#{command[:code]}`.")
|
45
73
|
command_started = Time.now
|
46
74
|
reader, writer = IO.pipe
|
75
|
+
# run the command in a child process
|
47
76
|
run = fork do
|
48
77
|
reader.close
|
49
78
|
STDOUT.reopen(writer)
|
@@ -57,6 +86,7 @@ module ScoutAgent
|
|
57
86
|
exit_status = nil
|
58
87
|
output = nil
|
59
88
|
writer.close
|
89
|
+
# make sure the child process stops in a reasonable time
|
60
90
|
begin
|
61
91
|
Timeout.timeout(command[:timeout]) do
|
62
92
|
output = reader.read
|
@@ -68,6 +98,7 @@ module ScoutAgent
|
|
68
98
|
output = "Error: This command took too long to run"
|
69
99
|
end
|
70
100
|
run_time = Time.now - command_started
|
101
|
+
# record results
|
71
102
|
db.complete_run( command,
|
72
103
|
output,
|
73
104
|
exit_status,
|
@@ -76,26 +107,41 @@ module ScoutAgent
|
|
76
107
|
log.debug( "`#{command[:code]}` exited (#{exit_status}) in " +
|
77
108
|
"#{run_time} seconds." )
|
78
109
|
end
|
79
|
-
|
110
|
+
|
111
|
+
# maintain the snapshots database
|
80
112
|
db.maintain
|
81
113
|
|
82
114
|
log.info("Snapshot complete.")
|
83
|
-
this_file.flock(File::LOCK_UN)
|
115
|
+
this_file.flock(File::LOCK_UN) # release our snapshot lock
|
84
116
|
end
|
85
117
|
end
|
86
118
|
|
119
|
+
#######
|
87
120
|
private
|
121
|
+
#######
|
88
122
|
|
123
|
+
#
|
124
|
+
# Abort with an error message to the user that says we cannot load the
|
125
|
+
# snapshot database.
|
126
|
+
#
|
89
127
|
def abort_with_missing_db
|
90
128
|
warn "Snapshots database could not be loaded."
|
91
129
|
exit 1
|
92
130
|
end
|
93
131
|
|
132
|
+
#
|
133
|
+
# Abort with an error message to the user that says a snapshot was
|
134
|
+
# recently taken and enough time has not yet passed to grab another.
|
135
|
+
#
|
94
136
|
def abort_with_too_recent
|
95
137
|
warn "A snapshot was recently taken."
|
96
138
|
exit 2
|
97
139
|
end
|
98
140
|
|
141
|
+
#
|
142
|
+
# Abort with an error message to the user that says we don't have any
|
143
|
+
# commands from the server yet to run as a snapshot.
|
144
|
+
#
|
99
145
|
def abort_with_no_commands
|
100
146
|
warn "No snapshot commands have been received from the server."
|
101
147
|
exit 3
|
@@ -3,25 +3,45 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# sudo scout_agent start
|
10
|
+
#
|
11
|
+
# This command starts the Scout agent, if it is not already running.
|
12
|
+
#
|
13
|
+
# The agent requires super user privileges to prepare its environment, but
|
14
|
+
# it will relinquish these privileges before it begins normal work.
|
15
|
+
#
|
16
|
+
# The agent daemonizes itself as part of the startup process (unless
|
17
|
+
# configured not to) and thus will detach from your terminal. It will still
|
18
|
+
# be running happily in the background. You can check up on it with the
|
19
|
+
# status command, if you like.
|
20
|
+
#
|
6
21
|
class Start < Assignment
|
7
22
|
choose_user true
|
8
23
|
choose_group true
|
9
24
|
|
25
|
+
# Runs the start command.
|
10
26
|
def execute
|
27
|
+
# build the directory for PID file storage, if needed
|
11
28
|
unless Plan.pid_dir.exist?
|
12
29
|
unless Plan.build_pid_dir(group.gid)
|
13
30
|
abort_with_missing_pid_dir
|
14
31
|
end
|
15
32
|
end
|
16
33
|
|
34
|
+
# switch to the selected user and group
|
17
35
|
unless switch_group_and_user
|
18
36
|
abort_with_wrong_group_or_user
|
19
37
|
end
|
20
38
|
|
39
|
+
# test our ability to reach the server
|
21
40
|
unless test_server_connection(:quiet)
|
22
41
|
abort_with_cannot_connect
|
23
42
|
end
|
24
43
|
|
44
|
+
# make a final check to see if everything looks properly prepared
|
25
45
|
unless Plan.valid? and
|
26
46
|
Plan.pid_dir.exist? and
|
27
47
|
Plan.pid_dir.readable? and
|
@@ -29,6 +49,7 @@ module ScoutAgent
|
|
29
49
|
abort_with_missing_resources
|
30
50
|
end
|
31
51
|
|
52
|
+
# make sure we are the only process running and daemonize, if configured
|
32
53
|
running_mode = Plan.run_as_daemon? ? " as a daemon" : ""
|
33
54
|
puts "Starting #{ScoutAgent.proper_agent_name}#{running_mode}..."
|
34
55
|
card = IDCard.new(:lifeline)
|
@@ -41,12 +62,15 @@ module ScoutAgent
|
|
41
62
|
end
|
42
63
|
end
|
43
64
|
|
65
|
+
# prepare the log
|
44
66
|
log = ScoutAgent.prepare_wire_tap(:lifeline)
|
45
67
|
log.info("Loading monitors.")
|
46
68
|
|
69
|
+
# load configured agents
|
47
70
|
agents = %w[master]
|
48
71
|
agents << "communication" if Plan.enable_xmpp?
|
49
72
|
|
73
|
+
# start each agent through a Lifeline monitor
|
50
74
|
lifelines = agents.map { |agent| Lifeline.new(agent, log) }
|
51
75
|
%w[TERM INT].each do |signal|
|
52
76
|
trap(signal) do
|
@@ -56,13 +80,24 @@ module ScoutAgent
|
|
56
80
|
end
|
57
81
|
end
|
58
82
|
end
|
59
|
-
lifelines.each
|
83
|
+
lifelines.each do |line|
|
84
|
+
line.launch_and_monitor
|
85
|
+
end
|
60
86
|
|
61
|
-
|
87
|
+
# wait for all monitors to finish
|
88
|
+
lifelines.each do |line|
|
89
|
+
line.join
|
90
|
+
end
|
62
91
|
end
|
63
92
|
|
93
|
+
#######
|
64
94
|
private
|
95
|
+
#######
|
65
96
|
|
97
|
+
#
|
98
|
+
# A typical Unix daemonization process. Returns +true+ if successful,
|
99
|
+
# +false+ otherwise.
|
100
|
+
#
|
66
101
|
def daemonize
|
67
102
|
exit!(0) if fork
|
68
103
|
Process.setsid
|
@@ -77,6 +112,10 @@ module ScoutAgent
|
|
77
112
|
false
|
78
113
|
end
|
79
114
|
|
115
|
+
#
|
116
|
+
# Switches the actual and effective user and group of this process as
|
117
|
+
# configured.
|
118
|
+
#
|
80
119
|
def switch_group_and_user
|
81
120
|
if Process.euid != user.uid or Process.egid != group.egid
|
82
121
|
Process.initgroups(user.name, group.gid) # prepare groups
|
@@ -88,6 +127,10 @@ module ScoutAgent
|
|
88
127
|
false
|
89
128
|
end
|
90
129
|
|
130
|
+
#
|
131
|
+
# Abort with an error message to the user that says we cannot prepare PID
|
132
|
+
# file storage because we don't have the privileges.
|
133
|
+
#
|
91
134
|
def abort_with_missing_pid_dir
|
92
135
|
abort <<-END_PID_DIR.trim
|
93
136
|
Unable to prepare PID file storage. Please start the daemon
|
@@ -99,6 +142,10 @@ module ScoutAgent
|
|
99
142
|
END_PID_DIR
|
100
143
|
end
|
101
144
|
|
145
|
+
#
|
146
|
+
# Abort with an error message to the user that warns of our inability to
|
147
|
+
# reach the server due to connectivity issues or a bad configuration.
|
148
|
+
#
|
102
149
|
def abort_with_cannot_connect
|
103
150
|
abort <<-END_CANNOT_CONNECT.trim
|
104
151
|
Unable to load a plan from the server at:
|
@@ -114,6 +161,10 @@ module ScoutAgent
|
|
114
161
|
END_CANNOT_CONNECT
|
115
162
|
end
|
116
163
|
|
164
|
+
#
|
165
|
+
# Abort with an error message to the user that says we were unable to
|
166
|
+
# switch to the configured user or group.
|
167
|
+
#
|
117
168
|
def abort_with_wrong_group_or_user
|
118
169
|
abort <<-END_GROUP_USER.trim
|
119
170
|
Unable to switch to the selected user and group. Please
|
@@ -125,6 +176,10 @@ module ScoutAgent
|
|
125
176
|
END_GROUP_USER
|
126
177
|
end
|
127
178
|
|
179
|
+
#
|
180
|
+
# Abort with an error message to the user that says we were unable to load
|
181
|
+
# all expected resources due to a damaged configuration.
|
182
|
+
#
|
128
183
|
def abort_with_missing_resources
|
129
184
|
abort <<-END_RESOURCES.trim
|
130
185
|
Some resources needed to complete the startup process are
|
@@ -141,10 +196,18 @@ module ScoutAgent
|
|
141
196
|
END_RESOURCES
|
142
197
|
end
|
143
198
|
|
199
|
+
#
|
200
|
+
# Abort with an error message to the user that informs of an agent already
|
201
|
+
# running on this system.
|
202
|
+
#
|
144
203
|
def abort_with_other_process_running(pid)
|
145
204
|
abort "The daemon is already running with the process ID of #{pid}."
|
146
205
|
end
|
147
206
|
|
207
|
+
#
|
208
|
+
# Abort with an error message to the user that says we are unable to
|
209
|
+
# daemonize this process.
|
210
|
+
#
|
148
211
|
def abort_with_failure_to_daemonize
|
149
212
|
abort "Unable to daemonize this process."
|
150
213
|
end
|
@@ -3,7 +3,18 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# scout_agent status
|
10
|
+
#
|
11
|
+
# This command dumps the status database to <tt>$stdout</tt>. This data
|
12
|
+
# will show what processes are running, what they are currently working on,
|
13
|
+
# and when that status was last updated. It can be useful in tracking down
|
14
|
+
# issues with the agent to see where it went wrong.
|
15
|
+
#
|
6
16
|
class Status < Assignment
|
17
|
+
# Runs the status command.
|
7
18
|
def execute
|
8
19
|
unless db = status_database
|
9
20
|
abort_with_missing_db
|
@@ -25,7 +36,8 @@ module ScoutAgent
|
|
25
36
|
%w[Last\ Updated last_updated_at]
|
26
37
|
].map { |title, data| [title] + statuses.map { |row| row[data] } }
|
27
38
|
sizes = columns.map { |column|
|
28
|
-
column.map { |field| field.to_s.size }.max
|
39
|
+
column.map { |field| field.to_s.size }.max
|
40
|
+
}
|
29
41
|
format = sizes.map { |size| "%-#{size}s" }.join(" ")
|
30
42
|
puts format % columns.map { |column| column.first }
|
31
43
|
puts format % sizes.map { |size| "-" * size }
|
@@ -36,9 +48,16 @@ module ScoutAgent
|
|
36
48
|
end
|
37
49
|
end
|
38
50
|
|
51
|
+
#######
|
52
|
+
private
|
53
|
+
#######
|
54
|
+
|
55
|
+
#
|
56
|
+
# Abort with an error message to the user that says we cannot load the
|
57
|
+
# status database.
|
58
|
+
#
|
39
59
|
def abort_with_missing_db
|
40
|
-
|
41
|
-
exit 1
|
60
|
+
abort "Statuses database could not be loaded."
|
42
61
|
end
|
43
62
|
end
|
44
63
|
end
|
@@ -3,7 +3,17 @@
|
|
3
3
|
|
4
4
|
module ScoutAgent
|
5
5
|
class Assignment
|
6
|
+
#
|
7
|
+
# Invoke with:
|
8
|
+
#
|
9
|
+
# sudo scout_agent stop
|
10
|
+
#
|
11
|
+
# This command halts the currently running agent. The agent is asked nicely
|
12
|
+
# to stop. If it doesn't respond within a few seconds though, it is forced
|
13
|
+
# to terminate.
|
14
|
+
#
|
6
15
|
class Stop < Assignment
|
16
|
+
# Runs the stop command.
|
7
17
|
def execute
|
8
18
|
@agent = IDCard.new(:lifeline)
|
9
19
|
if @agent.pid_file.exist?
|
@@ -22,24 +32,47 @@ module ScoutAgent
|
|
22
32
|
end
|
23
33
|
end
|
24
34
|
|
35
|
+
#######
|
25
36
|
private
|
37
|
+
#######
|
26
38
|
|
39
|
+
#
|
40
|
+
# Sends +signal_name+ (which is expected to be some kind of stop request)
|
41
|
+
# to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
|
42
|
+
# of +wait_delay+ seconds checking between waits to see if the agent has
|
43
|
+
# complied. It returns when the agent has exited or the total wait period
|
44
|
+
# has expired.
|
45
|
+
#
|
27
46
|
def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
abort_with_no_permission
|
32
|
-
end
|
47
|
+
# signal the main process
|
48
|
+
@agent.signal(signal_name)
|
49
|
+
# wait for it to stop
|
33
50
|
wait_count.times do
|
34
51
|
sleep wait_delay
|
35
52
|
break unless @agent.pid_file.exist?
|
36
53
|
end
|
54
|
+
# signal other stray processes
|
55
|
+
Plan.pid_dir.each_entry do |process|
|
56
|
+
if process.to_s =~ /(\w+)\.pid\z/
|
57
|
+
IDCard.new($1).signal(signal_name)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
rescue Errno::EPERM # we don't have permission
|
61
|
+
abort_with_no_permission
|
37
62
|
end
|
38
63
|
|
64
|
+
#
|
65
|
+
# Print a notice (without aborting) to the user that says that the agent isn't
|
66
|
+
# currently running.
|
67
|
+
#
|
39
68
|
def abort_with_not_running_notice
|
40
69
|
puts "#{ScoutAgent.proper_agent_name} is not currently running."
|
41
70
|
end
|
42
71
|
|
72
|
+
#
|
73
|
+
# Abort with an error message to the user that says we don't have enough
|
74
|
+
# permission to stop the agent due to how it was started.
|
75
|
+
#
|
43
76
|
def abort_with_no_permission
|
44
77
|
abort <<-END_PERMISSION.trim
|
45
78
|
Unable to signal the daemon. Please rerun this command with
|
@@ -50,6 +83,10 @@ module ScoutAgent
|
|
50
83
|
END_PERMISSION
|
51
84
|
end
|
52
85
|
|
86
|
+
#
|
87
|
+
# Abort with an error message to the user that says we were unable to
|
88
|
+
# stop the agent and points them at the PID files.
|
89
|
+
#
|
53
90
|
def abort_with_failed_to_stop
|
54
91
|
abort <<-END_FAILED.trim
|
55
92
|
Unable to stop the daemon. You may need to use the PID files
|