scout_agent 3.0.6 → 3.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,19 +3,43 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # scout_agent snap [force]
10
+ #
11
+ # This command requests that a snapshot be taken of the environment the
12
+ # agent is running on. Snapshots are a collection of commands sent down
13
+ # from the Scout server that can be used to measure the current health of
14
+ # the environment. Their output, exit status, and run time are passed back
15
+ # up to the server in response to this request.
16
+ #
17
+ # Note that this is just a request. It may not be honored if enough time
18
+ # hasn't passed since the last snapshot. This is to protect your server
19
+ # from overmuch busy work, but you can choose to override this limitation
20
+ # with the optional "force" parameter.
21
+ #
6
22
  class Snapshot < Assignment
23
+ # Runs the snapshot command.
7
24
  def execute
25
+ # prepare the log
8
26
  log = ScoutAgent.prepare_wire_tap(:snapshot, :skip_stdout)
9
27
 
28
+ # load the snapshot database
10
29
  unless db = Database.load(:snapshots, log)
11
30
  abort_with_missing_db
12
31
  end
13
32
 
33
+ #
34
+ # lock on this file to ensure only one process can run a snapshot at a
35
+ # time
36
+ #
14
37
  open(__FILE__) do |this_file|
15
38
  unless this_file.flock(File::LOCK_EX | File::LOCK_NB)
16
39
  exit # snapshot in progress
17
40
  end
18
41
 
42
+ # record our status and set removal at_exit()
19
43
  log.info("Building snapshot.")
20
44
  status_database(log)
21
45
  status("Building snapshot", :snapshot)
@@ -23,13 +47,16 @@ module ScoutAgent
23
47
  clear_status(:snapshot)
24
48
  end
25
49
 
50
+ # reset commands, if requested
26
51
  if Array(other_args).shift == "force"
27
52
  log.info("Clearing command run times to force a full snapshot.")
28
53
  db.reset_all_commands
29
54
  end
30
55
 
56
+ # read current commands
31
57
  commands = db.current_commands
32
58
 
59
+ # bail out if there are no commands to run
33
60
  if commands.empty?
34
61
  if db.have_commands?
35
62
  abort_with_too_recent
@@ -38,12 +65,14 @@ module ScoutAgent
38
65
  abort_with_no_commands
39
66
  end
40
67
  end
41
-
68
+
69
+ # build snapshot
42
70
  snapshot_started = Time.now
43
- commands.each do |command|
71
+ commands.each do |command| # run each command
44
72
  log.info("Running `#{command[:code]}`.")
45
73
  command_started = Time.now
46
74
  reader, writer = IO.pipe
75
+ # run the command in a child process
47
76
  run = fork do
48
77
  reader.close
49
78
  STDOUT.reopen(writer)
@@ -57,6 +86,7 @@ module ScoutAgent
57
86
  exit_status = nil
58
87
  output = nil
59
88
  writer.close
89
+ # make sure the child process stops in a reasonable time
60
90
  begin
61
91
  Timeout.timeout(command[:timeout]) do
62
92
  output = reader.read
@@ -68,6 +98,7 @@ module ScoutAgent
68
98
  output = "Error: This command took too long to run"
69
99
  end
70
100
  run_time = Time.now - command_started
101
+ # record results
71
102
  db.complete_run( command,
72
103
  output,
73
104
  exit_status,
@@ -76,26 +107,41 @@ module ScoutAgent
76
107
  log.debug( "`#{command[:code]}` exited (#{exit_status}) in " +
77
108
  "#{run_time} seconds." )
78
109
  end
79
-
110
+
111
+ # maintain the snapshots database
80
112
  db.maintain
81
113
 
82
114
  log.info("Snapshot complete.")
83
- this_file.flock(File::LOCK_UN)
115
+ this_file.flock(File::LOCK_UN) # release our snapshot lock
84
116
  end
85
117
  end
86
118
 
119
+ #######
87
120
  private
121
+ #######
88
122
 
123
+ #
124
+ # Abort with an error message to the user that says we cannot load the
125
+ # snapshot database.
126
+ #
89
127
  def abort_with_missing_db
90
128
  warn "Snapshots database could not be loaded."
91
129
  exit 1
92
130
  end
93
131
 
132
+ #
133
+ # Abort with an error message to the user that says a snapshot was
134
+ # recently taken and enough time has not yet passed to grab another.
135
+ #
94
136
  def abort_with_too_recent
95
137
  warn "A snapshot was recently taken."
96
138
  exit 2
97
139
  end
98
140
 
141
+ #
142
+ # Abort with an error message to the user that says we don't have any
143
+ # commands from the server yet to run as a snapshot.
144
+ #
99
145
  def abort_with_no_commands
100
146
  warn "No snapshot commands have been received from the server."
101
147
  exit 3
@@ -3,25 +3,45 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # sudo scout_agent start
10
+ #
11
+ # This command starts the Scout agent, if it is not already running.
12
+ #
13
+ # The agent requires super user privileges to prepare its environment, but
14
+ # it will relinquish these privileges before it begins normal work.
15
+ #
16
+ # The agent daemonizes itself as part of the startup process (unless
17
+ # configured not to) and thus will detach from your terminal. It will still
18
+ # be running happily in the background. You can check up on it with the
19
+ # status command, if you like.
20
+ #
6
21
  class Start < Assignment
7
22
  choose_user true
8
23
  choose_group true
9
24
 
25
+ # Runs the start command.
10
26
  def execute
27
+ # build the directory for PID file storage, if needed
11
28
  unless Plan.pid_dir.exist?
12
29
  unless Plan.build_pid_dir(group.gid)
13
30
  abort_with_missing_pid_dir
14
31
  end
15
32
  end
16
33
 
34
+ # switch to the selected user and group
17
35
  unless switch_group_and_user
18
36
  abort_with_wrong_group_or_user
19
37
  end
20
38
 
39
+ # test our ability to reach the server
21
40
  unless test_server_connection(:quiet)
22
41
  abort_with_cannot_connect
23
42
  end
24
43
 
44
+ # make a final check to see if everything looks properly prepared
25
45
  unless Plan.valid? and
26
46
  Plan.pid_dir.exist? and
27
47
  Plan.pid_dir.readable? and
@@ -29,6 +49,7 @@ module ScoutAgent
29
49
  abort_with_missing_resources
30
50
  end
31
51
 
52
+ # make sure we are the only process running and daemonize, if configured
32
53
  running_mode = Plan.run_as_daemon? ? " as a daemon" : ""
33
54
  puts "Starting #{ScoutAgent.proper_agent_name}#{running_mode}..."
34
55
  card = IDCard.new(:lifeline)
@@ -41,12 +62,15 @@ module ScoutAgent
41
62
  end
42
63
  end
43
64
 
65
+ # prepare the log
44
66
  log = ScoutAgent.prepare_wire_tap(:lifeline)
45
67
  log.info("Loading monitors.")
46
68
 
69
+ # load configured agents
47
70
  agents = %w[master]
48
71
  agents << "communication" if Plan.enable_xmpp?
49
72
 
73
+ # start each agent through a Lifeline monitor
50
74
  lifelines = agents.map { |agent| Lifeline.new(agent, log) }
51
75
  %w[TERM INT].each do |signal|
52
76
  trap(signal) do
@@ -56,13 +80,24 @@ module ScoutAgent
56
80
  end
57
81
  end
58
82
  end
59
- lifelines.each { |line| line.launch_and_monitor }
83
+ lifelines.each do |line|
84
+ line.launch_and_monitor
85
+ end
60
86
 
61
- lifelines.each { |line| line.join }
87
+ # wait for all monitors to finish
88
+ lifelines.each do |line|
89
+ line.join
90
+ end
62
91
  end
63
92
 
93
+ #######
64
94
  private
95
+ #######
65
96
 
97
+ #
98
+ # A typical Unix daemonization process. Returns +true+ if successful,
99
+ # +false+ otherwise.
100
+ #
66
101
  def daemonize
67
102
  exit!(0) if fork
68
103
  Process.setsid
@@ -77,6 +112,10 @@ module ScoutAgent
77
112
  false
78
113
  end
79
114
 
115
+ #
116
+ # Switches the actual and effective user and group of this process as
117
+ # configured.
118
+ #
80
119
  def switch_group_and_user
81
120
  if Process.euid != user.uid or Process.egid != group.egid
82
121
  Process.initgroups(user.name, group.gid) # prepare groups
@@ -88,6 +127,10 @@ module ScoutAgent
88
127
  false
89
128
  end
90
129
 
130
+ #
131
+ # Abort with an error message to the user that says we cannot prepare PID
132
+ # file storage because we don't have the privileges.
133
+ #
91
134
  def abort_with_missing_pid_dir
92
135
  abort <<-END_PID_DIR.trim
93
136
  Unable to prepare PID file storage. Please start the daemon
@@ -99,6 +142,10 @@ module ScoutAgent
99
142
  END_PID_DIR
100
143
  end
101
144
 
145
+ #
146
+ # Abort with an error message to the user that warns of our inability to
147
+ # reach the server due to connectivity issues or a bad configuration.
148
+ #
102
149
  def abort_with_cannot_connect
103
150
  abort <<-END_CANNOT_CONNECT.trim
104
151
  Unable to load a plan from the server at:
@@ -114,6 +161,10 @@ module ScoutAgent
114
161
  END_CANNOT_CONNECT
115
162
  end
116
163
 
164
+ #
165
+ # Abort with an error message to the user that says we were unable to
166
+ # switch to the configured user or group.
167
+ #
117
168
  def abort_with_wrong_group_or_user
118
169
  abort <<-END_GROUP_USER.trim
119
170
  Unable to switch to the selected user and group. Please
@@ -125,6 +176,10 @@ module ScoutAgent
125
176
  END_GROUP_USER
126
177
  end
127
178
 
179
+ #
180
+ # Abort with an error message to the user that says we were unable to load
181
+ # all expected resources due to a damaged configuration.
182
+ #
128
183
  def abort_with_missing_resources
129
184
  abort <<-END_RESOURCES.trim
130
185
  Some resources needed to complete the startup process are
@@ -141,10 +196,18 @@ module ScoutAgent
141
196
  END_RESOURCES
142
197
  end
143
198
 
199
+ #
200
+ # Abort with an error message to the user that informs of an agent already
201
+ # running on this system.
202
+ #
144
203
  def abort_with_other_process_running(pid)
145
204
  abort "The daemon is already running with the process ID of #{pid}."
146
205
  end
147
206
 
207
+ #
208
+ # Abort with an error message to the user that says we are unable to
209
+ # daemonize this process.
210
+ #
148
211
  def abort_with_failure_to_daemonize
149
212
  abort "Unable to daemonize this process."
150
213
  end
@@ -3,7 +3,18 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # scout_agent status
10
+ #
11
+ # This command dumps the status database to <tt>$stdout</tt>. This data
12
+ # will show what processes are running, what they are currently working on,
13
+ # and when that status was last updated. It can be useful in tracking down
14
+ # issues with the agent to see where it went wrong.
15
+ #
6
16
  class Status < Assignment
17
+ # Runs the status command.
7
18
  def execute
8
19
  unless db = status_database
9
20
  abort_with_missing_db
@@ -25,7 +36,8 @@ module ScoutAgent
25
36
  %w[Last\ Updated last_updated_at]
26
37
  ].map { |title, data| [title] + statuses.map { |row| row[data] } }
27
38
  sizes = columns.map { |column|
28
- column.map { |field| field.to_s.size }.max }
39
+ column.map { |field| field.to_s.size }.max
40
+ }
29
41
  format = sizes.map { |size| "%-#{size}s" }.join(" ")
30
42
  puts format % columns.map { |column| column.first }
31
43
  puts format % sizes.map { |size| "-" * size }
@@ -36,9 +48,16 @@ module ScoutAgent
36
48
  end
37
49
  end
38
50
 
51
+ #######
52
+ private
53
+ #######
54
+
55
+ #
56
+ # Abort with an error message to the user that says we cannot load the
57
+ # status database.
58
+ #
39
59
  def abort_with_missing_db
40
- warn "Statuses database could not be loaded."
41
- exit 1
60
+ abort "Statuses database could not be loaded."
42
61
  end
43
62
  end
44
63
  end
@@ -3,7 +3,17 @@
3
3
 
4
4
  module ScoutAgent
5
5
  class Assignment
6
+ #
7
+ # Invoke with:
8
+ #
9
+ # sudo scout_agent stop
10
+ #
11
+ # This command halts the currently running agent. The agent is asked nicely
12
+ # to stop. If it doesn't respond within a few seconds though, it is forced
13
+ # to terminate.
14
+ #
6
15
  class Stop < Assignment
16
+ # Runs the stop command.
7
17
  def execute
8
18
  @agent = IDCard.new(:lifeline)
9
19
  if @agent.pid_file.exist?
@@ -22,24 +32,47 @@ module ScoutAgent
22
32
  end
23
33
  end
24
34
 
35
+ #######
25
36
  private
37
+ #######
26
38
 
39
+ #
40
+ # Sends +signal_name+ (which is expected to be some kind of stop request)
41
+ # to the <tt>@agent</tt>. This method will then wait +wait_count+ periods
42
+ # of +wait_delay+ seconds checking between waits to see if the agent has
43
+ # complied. It returns when the agent has exited or the total wait period
44
+ # has expired.
45
+ #
27
46
  def signal_and_wait(signal_name, wait_count = 10, wait_delay = 0.5)
28
- begin
29
- @agent.signal(signal_name)
30
- rescue Errno::EPERM # we don't have permission
31
- abort_with_no_permission
32
- end
47
+ # signal the main process
48
+ @agent.signal(signal_name)
49
+ # wait for it to stop
33
50
  wait_count.times do
34
51
  sleep wait_delay
35
52
  break unless @agent.pid_file.exist?
36
53
  end
54
+ # signal other stray processes
55
+ Plan.pid_dir.each_entry do |process|
56
+ if process.to_s =~ /(\w+)\.pid\z/
57
+ IDCard.new($1).signal(signal_name)
58
+ end
59
+ end
60
+ rescue Errno::EPERM # we don't have permission
61
+ abort_with_no_permission
37
62
  end
38
63
 
64
+ #
65
+ # Abort with an error message to the user that says that the agent isn't
66
+ # currently running.
67
+ #
39
68
  def abort_with_not_running_notice
40
69
  puts "#{ScoutAgent.proper_agent_name} is not currently running."
41
70
  end
42
71
 
72
+ #
73
+ # Abort with an error message to the user that says we don't have enough
74
+ # permission to stop the agent due to how it was started.
75
+ #
43
76
  def abort_with_no_permission
44
77
  abort <<-END_PERMISSION.trim
45
78
  Unable to signal the daemon. Please rerun this command with
@@ -50,6 +83,10 @@ module ScoutAgent
50
83
  END_PERMISSION
51
84
  end
52
85
 
86
+ #
87
+ # Abort with an error message to the user that says we were unable to
88
+ # stop the agent.
89
+ #
53
90
  def abort_with_failed_to_stop
54
91
  abort <<-END_FAILED.trim
55
92
  Unable to stop the daemon. You may need to use the PID files