scout_agent 3.0.6 → 3.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,19 +3,43 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # scout_agent snap [force]
10
+ #
11
+ # This command requests that a snapshot be taken of the environment the
12
+ # agent is running on. Snapshots are a collection of commands sent down
13
+ # from the Scout server that can be used to measure the current health of
14
+ # the environment. Their output, exit status, and run time are passed back
15
+ # up to the server in response to this request.
16
+ #
17
+ # Note that this is just a request. It may not be honored if enough time
18
+ # hasn't passed since the last snapshot. This is to protect your server
19
+ # from overmuch busy work, but you can choose to override this limitation
20
+ # with the optional "force" parameter.
21
+ #
6
22
  class Snapshot < Assignment
23
+ # Runs the snapshot command.
7
24
  def execute
25
+ # prepare the log
8
26
  log = ScoutAgent.prepare_wire_tap(:snapshot, :skip_stdout)
9
27
 
28
+ # load the snapshot database
10
29
  unless db = Database.load(:snapshots, log)
11
30
  abort_with_missing_db
12
31
  end
13
32
 
33
+ #
34
+ # lock on this file to ensure only one process can run a snapshot at a
35
+ # time
36
+ #
14
37
  open(__FILE__) do |this_file|
15
38
  unless this_file.flock(File::LOCK_EX | File::LOCK_NB)
16
39
  exit # snapshot in progress
17
40
  end
18
41
 
42
+ # record our status and set removal at_exit()
19
43
  log.info("Building snapshot.")
20
44
  status_database(log)
21
45
  status("Building snapshot", :snapshot)
@@ -23,13 +47,16 @@ module ScoutAgent
23
47
  clear_status(:snapshot)
24
48
  end
25
49
 
50
+ # reset commands, if requested
26
51
  if Array(other_args).shift == "force"
27
52
  log.info("Clearing command run times to force a full snapshot.")
28
53
  db.reset_all_commands
29
54
  end
30
55
 
56
+ # read current commands
31
57
  commands = db.current_commands
32
58
 
59
+ # bail out if there are no commands to run
33
60
  if commands.empty?
34
61
  if db.have_commands?
35
62
  abort_with_too_recent
@@ -38,12 +65,14 @@ module ScoutAgent
38
65
  abort_with_no_commands
39
66
  end
40
67
  end
41
-
68
+
69
+ # build snapshot
42
70
  snapshot_started = Time.now
43
- commands.each do |command|
71
+ commands.each do |command| # run each command
44
72
  log.info("Running `#{command[:code]}`.")
45
73
  command_started = Time.now
46
74
  reader, writer = IO.pipe
75
+ # run the command in a child process
47
76
  run = fork do
48
77
  reader.close
49
78
  STDOUT.reopen(writer)
@@ -57,6 +86,7 @@ module ScoutAgent
57
86
  exit_status = nil
58
87
  output = nil
59
88
  writer.close
89
+ # make sure the child process stops in a reasonable time
60
90
  begin
61
91
  Timeout.timeout(command[:timeout]) do
62
92
  output = reader.read
@@ -68,6 +98,7 @@ module ScoutAgent
68
98
  output = "Error: This command took too long to run"
69
99
  end
70
100
  run_time = Time.now - command_started
101
+ # record results
71
102
  db.complete_run( command,
72
103
  output,
73
104
  exit_status,
@@ -76,26 +107,41 @@ module ScoutAgent
76
107
  log.debug( "`#{command[:code]}` exited (#{exit_status}) in " +
77
108
  "#{run_time} seconds." )
78
109
  end
79
-
110
+
111
+ # maintain the snapshots database
80
112
  db.maintain
81
113
 
82
114
  log.info("Snapshot complete.")
83
- this_file.flock(File::LOCK_UN)
115
+ this_file.flock(File::LOCK_UN) # release our snapshot lock
84
116
  end
85
117
  end
86
118
 
119
+ #######
87
120
  private
121
+ #######
88
122
 
123
+ #
124
+ # Abort with an error message to the user that says we cannot load the
125
+ # snapshot database.
126
+ #
89
127
  def abort_with_missing_db
90
128
  warn "Snapshots database could not be loaded."
91
129
  exit 1
92
130
  end
93
131
 
132
+ #
133
+ # Abort with an error message to the user that says a snapshot was
134
+ # recently taken and enough time has not yet passed to grab another.
135
+ #
94
136
  def abort_with_too_recent
95
137
  warn "A snapshot was recently taken."
96
138
  exit 2
97
139
  end
98
140
 
141
+ #
142
+ # Abort with an error message to the user that says we don't have any
143
+ # commands from the server yet to run as a snapshot.
144
+ #
99
145
  def abort_with_no_commands
100
146
  warn "No snapshot commands have been received from the server."
101
147
  exit 3
@@ -3,25 +3,45 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # sudo scout_agent start
10
+ #
11
+ # This command starts the Scout agent, if it is not already running.
12
+ #
13
+ # The agent requires super user privileges to prepare its environment, but
14
+ # it will relinquish these privileges before it begins normal work.
15
+ #
16
+ # The agent daemonizes itself as part of the startup process (unless
17
+ # configured not to) and thus will detach from your terminal. It will still
18
+ # be running happily in the background. You can check up on it with the
19
+ # status command, if you like.
20
+ #
6
21
  class Start < Assignment
7
22
  choose_user true
8
23
  choose_group true
9
24
 
25
+ # Runs the start command.
10
26
  def execute
27
+ # build the directory for PID file storage, if needed
11
28
  unless Plan.pid_dir.exist?
12
29
  unless Plan.build_pid_dir(group.gid)
13
30
  abort_with_missing_pid_dir
14
31
  end
15
32
  end
16
33
 
34
+ # switch to the selected user and group
17
35
  unless switch_group_and_user
18
36
  abort_with_wrong_group_or_user
19
37
  end
20
38
 
39
+ # test our ability to reach the server
21
40
  unless test_server_connection(:quiet)
22
41
  abort_with_cannot_connect
23
42
  end
24
43
 
44
+ # make a final check to see if everything looks properly prepared
25
45
  unless Plan.valid? and
26
46
  Plan.pid_dir.exist? and
27
47
  Plan.pid_dir.readable? and
@@ -29,6 +49,7 @@ module ScoutAgent
29
49
  abort_with_missing_resources
30
50
  end
31
51
 
52
+ # make sure we are the only process running and daemonize, if configured
32
53
  running_mode = Plan.run_as_daemon? ? " as a daemon" : ""
33
54
  puts "Starting #{ScoutAgent.proper_agent_name}#{running_mode}..."
34
55
  card = IDCard.new(:lifeline)
@@ -41,12 +62,15 @@ module ScoutAgent
41
62
  end
42
63
  end
43
64
 
65
+ # prepare the log
44
66
  log = ScoutAgent.prepare_wire_tap(:lifeline)
45
67
  log.info("Loading monitors.")
46
68
 
69
+ # load configured agents
47
70
  agents = %w[master]
48
71
  agents << "communication" if Plan.enable_xmpp?
49
72
 
73
+ # start each agent through a Lifeline monitor
50
74
  lifelines = agents.map { |agent| Lifeline.new(agent, log) }
51
75
  %w[TERM INT].each do |signal|
52
76
  trap(signal) do
@@ -56,13 +80,24 @@ module ScoutAgent
56
80
  end
57
81
  end
58
82
  end
59
- lifelines.each { |line| line.launch_and_monitor }
83
+ lifelines.each do |line|
84
+ line.launch_and_monitor
85
+ end
60
86
 
61
- lifelines.each { |line| line.join }
87
+ # wait for all monitors to finish
88
+ lifelines.each do |line|
89
+ line.join
90
+ end
62
91
  end
63
92
 
93
+ #######
64
94
  private
95
+ #######
65
96
 
97
+ #
98
+ # A typical Unix daemonization process. Returns +true+ if successful,
99
+ # +false+ otherwise.
100
+ #
66
101
  def daemonize
67
102
  exit!(0) if fork
68
103
  Process.setsid
@@ -77,6 +112,10 @@ module ScoutAgent
77
112
  false
78
113
  end
79
114
 
115
+ #
116
+ # Switches the actual and effective user and group of this process as
117
+ # configured.
118
+ #
80
119
  def switch_group_and_user
81
120
  if Process.euid != user.uid or Process.egid != group.egid
82
121
  Process.initgroups(user.name, group.gid) # prepare groups
@@ -88,6 +127,10 @@ module ScoutAgent
88
127
  false
89
128
  end
90
129
 
130
+ #
131
+ # Abort with an error message to the user that says we cannot prepare PID
132
+ # file storage because we don't have the privileges.
133
+ #
91
134
  def abort_with_missing_pid_dir
92
135
  abort <<-END_PID_DIR.trim
93
136
  Unable to prepare PID file storage. Please start the daemon
@@ -99,6 +142,10 @@ module ScoutAgent
99
142
  END_PID_DIR
100
143
  end
101
144
 
145
+ #
146
+ # Abort with an error message to the user that warns of our inability to
147
+ # reach the server due to connectivity issues or a bad configuration.
148
+ #
102
149
  def abort_with_cannot_connect
103
150
  abort <<-END_CANNOT_CONNECT.trim
104
151
  Unable to load a plan from the server at:
@@ -114,6 +161,10 @@ module ScoutAgent
114
161
  END_CANNOT_CONNECT
115
162
  end
116
163
 
164
+ #
165
+ # Abort with an error message to the user that says we were unable to
166
+ # switch to the configured user or group.
167
+ #
117
168
  def abort_with_wrong_group_or_user
118
169
  abort <<-END_GROUP_USER.trim
119
170
  Unable to switch to the selected user and group. Please
@@ -125,6 +176,10 @@ module ScoutAgent
125
176
  END_GROUP_USER
126
177
  end
127
178
 
179
+ #
180
+ # Abort with an error message to the user that says we were unable to load
181
+ # all expected resources due to a damaged configuration.
182
+ #
128
183
  def abort_with_missing_resources
129
184
  abort <<-END_RESOURCES.trim
130
185
  Some resources needed to complete the startup process are
@@ -141,10 +196,18 @@ module ScoutAgent
141
196
  END_RESOURCES
142
197
  end
143
198
 
199
+ #
200
+ # Abort with an error message to the user that informs of an agent already
201
+ # running on this system.
202
+ #
144
203
  def abort_with_other_process_running(pid)
145
204
  abort "The daemon is already running with the process ID of #{pid}."
146
205
  end
147
206
 
207
+ #
208
+ # Abort with an error message to the user that says we are unable to
209
+ # daemonize this process.
210
+ #
148
211
  def abort_with_failure_to_daemonize
149
212
  abort "Unable to daemonize this process."
150
213
  end
@@ -3,7 +3,18 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # scout_agent status
10
+ #
11
+ # This command dumps the status database to <tt>$stdout</tt>. This data
12
+ # will show what processes are running, what they are currently working on,
13
+ # and when that status was last updated. It can be useful in tracking down
14
+ # issues with the agent to see where it went wrong.
15
+ #
6
16
  class Status < Assignment
17
+ # Runs the status command.
7
18
  def execute
8
19
  unless db = status_database
9
20
  abort_with_missing_db
@@ -25,7 +36,8 @@ module ScoutAgent
25
36
  %w[Last\ Updated last_updated_at]
26
37
  ].map { |title, data| [title] + statuses.map { |row| row[data] } }
27
38
  sizes = columns.map { |column|
28
- column.map { |field| field.to_s.size }.max }
39
+ column.map { |field| field.to_s.size }.max
40
+ }
29
41
  format = sizes.map { |size| "%-#{size}s" }.join(" ")
30
42
  puts format % columns.map { |column| column.first }
31
43
  puts format % sizes.map { |size| "-" * size }
@@ -36,9 +48,16 @@ module ScoutAgent
36
48
  end
37
49
  end
38
50
 
51
+ #######
52
+ private
53
+ #######
54
+
55
+ #
56
+ # Abort with an error message to the user that says we cannot load the
57
+ # status database.
58
+ #
39
59
  def abort_with_missing_db
40
- warn "Statuses database could not be loaded."
41
- exit 1
60
+ abort "Statuses database could not be loaded."
42
61
  end
43
62
  end
44
63
  end
@@ -3,7 +3,17 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # sudo scout_agent stop
10
+ #
11
+ # This command halts the currently running agent. The agent is asked nicely
12
+ # to stop. If it doesn't respond within a few seconds though, it is forced
13
+ # to terminate.
14
+ #
6
15
  class Stop < Assignment
16
+ # Runs the stop command.
7
17
  def execute
8
18
  @agent = IDCard.new(:lifeline)
9
19
  if @agent.pid_file.exist?
@@ -22,24 +32,47 @@ module ScoutAgent
22
32
  end
23
33
  end
24
34
 
35
+ #######
25
36
  private
37
+ #######
26
38
 
39
+ #
40
+ # Sends +signal_name+ (which is expected to be some kind of stop request)
41
+ # to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
42
+ # of +wait_delay+ seconds checking between waits to see if the agent has
43
+ # complied. It returns when the agent has exited or the total wait period
44
+ # has expired.
45
+ #
27
46
  def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
28
- begin
29
- @agent.signal(signal_name)
30
- rescue Errno::EPERM # we don't have permission
31
- abort_with_no_permission
32
- end
47
+ # signal the main process
48
+ @agent.signal(signal_name)
49
+ # wait for it to stop
33
50
  wait_count.times do
34
51
  sleep wait_delay
35
52
  break unless @agent.pid_file.exist?
36
53
  end
54
+ # signal other stray processes
55
+ Plan.pid_dir.each_entry do |process|
56
+ if process.to_s =~ /(\w+)\.pid\z/
57
+ IDCard.new($1).signal(signal_name)
58
+ end
59
+ end
60
+ rescue Errno::EPERM # we don't have permission
61
+ abort_with_no_permission
37
62
  end
38
63
 
64
+ #
65
+ # Prints a notice to the user that says that the agent isn't currently
66
+ # running. This method itself only prints; it does not call abort().
67
+ #
39
68
  def abort_with_not_running_notice
40
69
  puts "#{ScoutAgent.proper_agent_name} is not currently running."
41
70
  end
42
71
 
72
+ #
73
+ # Abort with an error message to the user that says we don't have enough
74
+ # permission to stop the agent due to how it was started.
75
+ #
43
76
  def abort_with_no_permission
44
77
  abort <<-END_PERMISSION.trim
45
78
  Unable to signal the daemon. Please rerun this command with
@@ -50,6 +83,10 @@ module ScoutAgent
50
83
  END_PERMISSION
51
84
  end
52
85
 
86
+ #
87
+ # Abort with an error message to the user that says we were unable to
88
+ # stop the daemon and that the PID files may need to be used.
89
+ #
53
90
  def abort_with_failed_to_stop
54
91
  abort <<-END_FAILED.trim
55
92
  Unable to stop the daemon. You may need to use the PID files