bosh_agent 1.5.0.pre.1113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +0 -0
- data/bin/bosh_agent +102 -0
- data/lib/bosh_agent/alert.rb +191 -0
- data/lib/bosh_agent/alert_processor.rb +96 -0
- data/lib/bosh_agent/apply_plan/helpers.rb +30 -0
- data/lib/bosh_agent/apply_plan/job.rb +235 -0
- data/lib/bosh_agent/apply_plan/package.rb +58 -0
- data/lib/bosh_agent/apply_plan/plan.rb +96 -0
- data/lib/bosh_agent/bootstrap.rb +341 -0
- data/lib/bosh_agent/config.rb +5 -0
- data/lib/bosh_agent/configuration.rb +102 -0
- data/lib/bosh_agent/disk_util.rb +103 -0
- data/lib/bosh_agent/errors.rb +25 -0
- data/lib/bosh_agent/ext.rb +48 -0
- data/lib/bosh_agent/file_aggregator.rb +78 -0
- data/lib/bosh_agent/file_matcher.rb +45 -0
- data/lib/bosh_agent/handler.rb +440 -0
- data/lib/bosh_agent/heartbeat.rb +74 -0
- data/lib/bosh_agent/heartbeat_processor.rb +45 -0
- data/lib/bosh_agent/http_handler.rb +135 -0
- data/lib/bosh_agent/infrastructure/aws/registry.rb +177 -0
- data/lib/bosh_agent/infrastructure/aws/settings.rb +59 -0
- data/lib/bosh_agent/infrastructure/aws.rb +17 -0
- data/lib/bosh_agent/infrastructure/dummy.rb +24 -0
- data/lib/bosh_agent/infrastructure/openstack/registry.rb +220 -0
- data/lib/bosh_agent/infrastructure/openstack/settings.rb +76 -0
- data/lib/bosh_agent/infrastructure/openstack.rb +17 -0
- data/lib/bosh_agent/infrastructure/vsphere/settings.rb +135 -0
- data/lib/bosh_agent/infrastructure/vsphere.rb +16 -0
- data/lib/bosh_agent/infrastructure.rb +25 -0
- data/lib/bosh_agent/message/apply.rb +184 -0
- data/lib/bosh_agent/message/base.rb +38 -0
- data/lib/bosh_agent/message/compile_package.rb +250 -0
- data/lib/bosh_agent/message/drain.rb +195 -0
- data/lib/bosh_agent/message/list_disk.rb +25 -0
- data/lib/bosh_agent/message/logs.rb +108 -0
- data/lib/bosh_agent/message/migrate_disk.rb +55 -0
- data/lib/bosh_agent/message/mount_disk.rb +102 -0
- data/lib/bosh_agent/message/ssh.rb +109 -0
- data/lib/bosh_agent/message/state.rb +47 -0
- data/lib/bosh_agent/message/unmount_disk.rb +29 -0
- data/lib/bosh_agent/monit.rb +354 -0
- data/lib/bosh_agent/monit_client.rb +158 -0
- data/lib/bosh_agent/mounter.rb +42 -0
- data/lib/bosh_agent/ntp.rb +32 -0
- data/lib/bosh_agent/platform/centos/disk.rb +27 -0
- data/lib/bosh_agent/platform/centos/network.rb +39 -0
- data/lib/bosh_agent/platform/centos/templates/centos-ifcfg.erb +9 -0
- data/lib/bosh_agent/platform/centos/templates/dhclient_conf.erb +56 -0
- data/lib/bosh_agent/platform/centos/templates/logrotate.erb +8 -0
- data/lib/bosh_agent/platform/centos.rb +4 -0
- data/lib/bosh_agent/platform/dummy/templates/dummy_template.erb +1 -0
- data/lib/bosh_agent/platform/linux/adapter.rb +36 -0
- data/lib/bosh_agent/platform/linux/disk.rb +121 -0
- data/lib/bosh_agent/platform/linux/logrotate.rb +32 -0
- data/lib/bosh_agent/platform/linux/network.rb +124 -0
- data/lib/bosh_agent/platform/linux/password.rb +22 -0
- data/lib/bosh_agent/platform/linux.rb +4 -0
- data/lib/bosh_agent/platform/ubuntu/network.rb +59 -0
- data/lib/bosh_agent/platform/ubuntu/templates/dhclient_conf.erb +56 -0
- data/lib/bosh_agent/platform/ubuntu/templates/interfaces.erb +14 -0
- data/lib/bosh_agent/platform/ubuntu/templates/logrotate.erb +8 -0
- data/lib/bosh_agent/platform/ubuntu.rb +4 -0
- data/lib/bosh_agent/platform.rb +26 -0
- data/lib/bosh_agent/remote_exception.rb +62 -0
- data/lib/bosh_agent/runner.rb +36 -0
- data/lib/bosh_agent/settings.rb +61 -0
- data/lib/bosh_agent/sigar_box.rb +26 -0
- data/lib/bosh_agent/smtp_server.rb +96 -0
- data/lib/bosh_agent/state.rb +100 -0
- data/lib/bosh_agent/syslog_monitor.rb +53 -0
- data/lib/bosh_agent/template.rb +50 -0
- data/lib/bosh_agent/util.rb +190 -0
- data/lib/bosh_agent/version.rb +8 -0
- data/lib/bosh_agent.rb +92 -0
- metadata +332 -0
@@ -0,0 +1,102 @@
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
2
|
+
require 'bosh_agent/mounter'
|
3
|
+
|
4
|
+
module Bosh::Agent
|
5
|
+
module Message
|
6
|
+
# Message handler that prepares and mounts the persistent disk identified by
# a CID. `logger`, `settings` and `base_dir` are not defined in this class;
# presumably they come from Base (confirm there).
class MountDisk < Base
  # Flags this message as long-running for the message handler.
  def self.long_running?
    true
  end

  # Builds a handler for the given arguments and runs #mount.
  #
  # @param args [Array] message arguments; the first element is the disk CID
  def self.process(args)
    new(args).mount
  end

  # @param args [Array] message arguments; only the first element is used
  def initialize(args)
    @cid = args.first
  end

  # Reloads the agent settings and sets up the disk, but only when
  # Bosh::Agent::Config.configure is truthy; otherwise returns nil.
  def mount
    return unless Bosh::Agent::Config.configure

    update_settings
    logger.info("MountDisk: #{@cid} - #{settings['disks'].inspect}")

    setup_disk
  end

  # Replaces the configured settings with a freshly loaded copy.
  def update_settings
    fresh_settings = Bosh::Agent::Settings.load
    Bosh::Agent::Config.settings = fresh_settings
  end

  # Waits until the disk can be read, partitions and formats it when no
  # partition exists yet, then mounts the first partition.
  #
  # @return [Hash] always an empty hash on success
  # @raise [Bosh::Agent::MessageHandlerError] when the disk never becomes
  #   readable, mke2fs fails, or the disk can neither be formatted nor has
  #   an existing partition
  def setup_disk
    disk = Bosh::Agent::Config.platform.lookup_disk_by_cid(@cid)
    partition = "#{disk}1"

    logger.info("setup disk settings: #{settings.inspect}")

    # One initial attempt plus up to 300 retries, one second apart.
    retries_left = 300
    loop do
      readable =
        begin
          first_sector = File.read(disk, 512)

          # An all-zero first sector means the partition table is blank.
          if first_sector == "\x00"*512
            logger.info("Found blank disk #{disk}")
          else
            logger.info("Disk has partition table")
            logger.info(`sfdisk -Llq #{disk} 2> /dev/null`)
          end
          true
        rescue
          # Any failure here is treated as "disk not ready yet".
          logger.info("Re-trying reading from #{disk}")
          false
        end
      break if readable

      if retries_left == 0
        raise Bosh::Agent::MessageHandlerError, "Unable to read from new disk"
      end
      retries_left -= 1
      sleep 1
    end

    if File.blockdev?(disk) && DiskUtil.ensure_no_partition?(disk, partition)
      # sfdisk input describing a single partition spanning the whole disk.
      full_disk = ",,L\n"
      logger.info("Partitioning #{disk}")

      Bosh::Agent::Util.partition_disk(disk, full_disk)

      mkfs_flags = ["-t ext4", "-j"]
      mkfs_flags.push("-E lazy_itable_init=1") if Bosh::Agent::Util.lazy_itable_init_enabled?
      `/sbin/mke2fs #{mkfs_flags.join(" ")} #{partition}`
      if $?.exitstatus != 0
        raise Bosh::Agent::MessageHandlerError, "Failed create file system (#{$?.exitstatus})"
      end
    elsif File.blockdev?(partition)
      # The partition already exists; nothing to format.
      logger.info("Found existing partition on #{disk}")
    else
      raise Bosh::Agent::MessageHandlerError, "Unable to format #{disk}"
    end

    mount_persistent_disk(partition)
    {}
  end

  # Mounts the partition on <base_dir>/store, or on the migration target
  # directory when the store is already a mountpoint.
  #
  # @param partition [String] device path of the partition to mount
  def mount_persistent_disk(partition)
    store_mountpoint = File.join(base_dir, 'store')

    mountpoint =
      if Pathname.new(store_mountpoint).mountpoint?
        logger.info("Mounting persistent disk store migration target")
        File.join(base_dir, 'store_migraton_target')
      else
        logger.info("Mounting persistent disk store")
        store_mountpoint
      end

    FileUtils.mkdir_p(mountpoint)
    FileUtils.chmod(0700, mountpoint)

    Mounter.new(logger).mount(partition, mountpoint)
  end
end
|
100
|
+
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
2
|
+
|
3
|
+
module Bosh::Agent
|
4
|
+
module Message
|
5
|
+
require 'shellwords'

# Message handler that creates ("setup") and removes ("cleanup") temporary
# SSH users. `logger` is not defined in this class; presumably it comes from
# Base (confirm there).
class Ssh < Base
  SSH_USER_PREFIX = "bosh_"

  attr_reader :command

  # Dispatches on the first message argument.
  #
  # @param args [Array] [command, params]
  # @return [Hash, nil] result hash of the command, or nil when the command
  #   is neither "setup" nor "cleanup"
  def self.process(args)
    ssh = self.new(args)
    case ssh.command
    when "setup"
      ssh.setup
    when "cleanup"
      ssh.cleanup
    end
  end

  # @param args [Array] [command, params]; params is indexed with string keys
  def initialize(args)
    @command, @params = args
  end

  def base_dir
    Bosh::Agent::Config.base_dir
  end

  # Directory under which the home directories of SSH users are created.
  def ssh_base_dir
    File.join(base_dir, "bosh_ssh")
  end

  # Runs a shell command with stderr folded into stdout.
  #
  # @param cmd [String] complete command line; callers must escape arguments
  # @raise [RuntimeError] when the command exits with a non-zero status
  def shell_cmd(cmd)
    shell_output = %x[#{cmd} 2>&1]
    raise "'#{cmd}' failed, error: #{shell_output}" if $?.exitstatus != 0
  end

  # Creates the user, adds it to the admin and vcap groups and installs the
  # supplied public key.
  #
  # @return [Hash] "status" is "success" (with "ip") or "failure" (with "error")
  def setup
    user = @params["user"]
    password = @params["password"]
    logger.info("Setting up ssh for user #{user}")

    # Every interpolated value is shell-escaped: the parameters arrive in a
    # remote message and must not be able to alter the command line.
    home_base = Shellwords.escape(ssh_base_dir)
    login = Shellwords.escape(user.to_s)

    shell_cmd("mkdir -p #{home_base}")

    useradd = "useradd -m -b #{home_base} -s /bin/bash"
    useradd << " -p #{Shellwords.escape(password.to_s)}" if password
    shell_cmd("#{useradd} #{login}")

    # Add user to admin and vcap group
    shell_cmd("usermod -G admin,vcap #{login}")

    # Add public key to authorized keys
    ssh_dir = File.join(ssh_base_dir, user, ".ssh")
    FileUtils.mkdir_p(ssh_dir)

    File.open(File.join(ssh_dir, "authorized_keys"), "w+") do |f|
      f.write(@params["public_key"])
    end
    FileUtils.chown_R(user, user, ssh_dir)

    {"command" => @command, "status" => "success", "ip" => Bosh::Agent::Config.default_ip}
  rescue => e
    {"command" => @command, "status" => "failure", "error" => e.message}
  end

  # Deletes every user in /etc/passwd that matches the "user_regex"
  # parameter and also starts with SSH_USER_PREFIX.
  #
  # The CLI calls this in two scenarios:
  # 1. To clean up a single user after an interactive session or a remote
  #    command; "user_regex" then matches one user, i.e. "^user_name$".
  # 2. For the "cleanup all ssh users" option; "user_regex" is then more
  #    generic, like "^user_name_prefix".
  #
  # Irrespective of the scenario, "user_regex" is not fully trusted, so the
  # list is culled to users that start with SSH_USER_PREFIX.
  #
  # @return [Hash] "status" is "bad_user" (no regex given), "success" or
  #   "failure" (with "error")
  def cleanup
    return {"command" => @command, "status" => "bad_user"} if @params["user_regex"].nil?

    requested = Regexp.new(@params["user_regex"].to_s)

    # The account name is everything before the first colon of each entry.
    accounts = File.foreach("/etc/passwd").map { |entry| entry[/\A(.*?):/, 1] }.compact

    accounts.each do |account|
      next unless account =~ requested
      # can't trust the user_regex completely, so skip unexpected users
      next unless account.start_with?(SSH_USER_PREFIX)

      logger.info("deleting user #{account}")
      shell_cmd("userdel -r #{Shellwords.escape(account)}")
    end

    {"command" => @command, "status" => "success"}
  rescue => e
    {"command" => @command, "status" => "failure", "error" => e.message}
  end
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
2
|
+
|
3
|
+
module Bosh::Agent
|
4
|
+
module Message
|
5
|
+
# Message handler that reports the agent state. `logger` and `settings` are
# not defined in this class; presumably they come from Base (confirm there).
class State < Base
  # @param args [Array, nil] message arguments
  # @return [Hash] the state response
  def self.process(args)
    self.new(args).state
  end

  # @param args [Array, nil] when it is an Array containing "full", the
  #   response also carries vitals
  def initialize(args = nil)
    @full_format = true if args.is_a?(Array) && args.include?("full")
  end

  # Builds the response from the stored agent state plus job state, protocol
  # version and NTP offset; vitals are added in full format.
  #
  # @return [Hash]
  # @raise [Bosh::Agent::MessageHandlerError] wrapping any StateError
  def state
    response = Bosh::Agent::Config.state.to_hash

    logger.info("Agent state: #{response.inspect}")

    if settings
      %w[agent_id vm].each { |key| response[key] = settings[key] }
    end

    response["job_state"] = job_state
    response["bosh_protocol"] = Bosh::Agent::BOSH_PROTOCOL
    response["ntp"] = Bosh::Agent::NTP.offset

    if @full_format
      vitals = response["vitals"] = Bosh::Agent::Monit.get_vitals
      vitals["disk"] = Bosh::Agent::DiskUtil.get_usage
    end

    response
  rescue Bosh::Agent::StateError => e
    raise Bosh::Agent::MessageHandlerError, e
  end

  # @return [Object] the service group state reported by Monit
  def job_state
    Bosh::Agent::Monit.service_group_state
  end
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'bosh_agent/disk_util'
|
2
|
+
|
3
|
+
module Bosh::Agent
|
4
|
+
module Message
|
5
|
+
# Message handler that unmounts the first partition of the disk identified
# by a CID. `logger` is not defined in this class; presumably it comes from
# Base (confirm there).
class UnmountDisk < Base

  def self.long_running?; true; end

  # @param args [Array] message arguments; the first element is the disk CID
  # @return [Hash] see #unmount
  def self.process(args)
    self.new.unmount(args)
  end

  # Unmounts the partition if it has a mount entry.
  #
  # @param args [Array] message arguments; the first element is the disk CID
  # @return [Hash] a single :message key describing what happened
  def unmount(args)
    cid = args.first
    disk = Bosh::Agent::Config.platform.lookup_disk_by_cid(cid)
    partition = "#{disk}1"

    # Look the entry up once: a second lookup could return nil if the mount
    # disappeared in between, and splitting nil would raise.
    entry = DiskUtil.mount_entry(partition)
    return {:message => "Unknown mount for partition: #{partition}"} unless entry

    @block, @mountpoint = entry.split
    DiskUtil.umount_guard(@mountpoint)

    summary = "Unmounted #{@block} on #{@mountpoint}"
    logger.info(summary)
    {:message => summary}
  end
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,354 @@
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
2
|
+
require 'timeout'

require 'bosh_agent/monit_client'
|
3
|
+
|
4
|
+
module Bosh::Agent
|
5
|
+
# A good chunk of this code is lifted from the implementation of POSIX::Spawn::Child
|
6
|
+
class Monit
|
7
|
+
BUFSIZE = (32 * 1024)
|
8
|
+
NUM_RETRY_MONIT_INCARNATION = 60
|
9
|
+
NUM_RETRY_MONIT_WAIT_INCARNATION = 15
|
10
|
+
|
11
|
+
class << self
|
12
|
+
attr_accessor :enabled
|
13
|
+
|
14
|
+
# enable is supposed to be called at the very beginning as it creates
|
15
|
+
# sync primitives. Ideally this class should be refactored to minimize
|
16
|
+
# the number of singleton methods having to keep track of the state.
|
17
|
+
# Switches Monit integration on. The status helpers in this class return
# fixed placeholder values while the flag is unset.
#
# @return [true]
def enable
  @enabled = true
end
|
20
|
+
|
21
|
+
# Creates a Monit instance and runs it.
#
# @return [Object] whatever the instance-level #run returns
def start
  runner = new
  runner.run
end
|
24
|
+
|
25
|
+
# @return [Object] the base directory held by Bosh::Agent::Config
def base_dir
  agent_config = Bosh::Agent::Config
  agent_config.base_dir
end
|
28
|
+
|
29
|
+
# @return [Object] the logger held by Bosh::Agent::Config
def logger
  agent_config = Bosh::Agent::Config
  agent_config.logger
end
|
32
|
+
|
33
|
+
# @return [String] the "monit" directory below the base directory
def monit_dir
  root = base_dir
  File.join(root, 'monit')
end
|
36
|
+
|
37
|
+
# @return [String] the "events" directory below the Monit directory
def monit_events_dir
  parent = monit_dir
  File.join(parent, 'events')
end
|
40
|
+
|
41
|
+
# @return [String] path of the "monit.user" credentials file
def monit_user_file
  parent = monit_dir
  File.join(parent, 'monit.user')
end
|
44
|
+
|
45
|
+
# @return [String] path of the "alerts.monitrc" file
def monit_alerts_file
  parent = monit_dir
  File.join(parent, 'alerts.monitrc')
end
|
48
|
+
|
49
|
+
# @return [Object] the SMTP port held by Bosh::Agent::Config
def smtp_port
  agent_config = Bosh::Agent::Config
  agent_config.smtp_port
end
|
52
|
+
|
53
|
+
# Reads the credentials file and picks the first line that starts with
# BOSH_APP_GROUP.
#
# @return [Array<String>] [user, credential] with the credential stripped
# @raise [NoMethodError] when no line matches or the line has no colon
def monit_credentials
  group_prefix = /\A#{BOSH_APP_GROUP}/
  matching_line = File.read(monit_user_file).lines.find { |line| line =~ group_prefix }
  name, secret = matching_line.split(':')
  [name, secret.strip]
end
|
58
|
+
|
59
|
+
# Builds a MonitClient for the local Monit HTTP endpoint using the stored
# credentials.
#
# @return [MonitClient]
def monit_api_client
  # Primarily for CI - normally done during configure
  setup_monit_user unless Bosh::Agent::Config.configure

  name, secret = monit_credentials
  endpoint = "https://#{name}:#{secret}@127.0.0.1:2822"
  MonitClient.new(endpoint, :logger => logger)
end
|
68
|
+
|
69
|
+
# @return [String] 8 random bytes rendered as 16 hexadecimal characters
def random_credential
  raw_bytes = OpenSSL::Random.random_bytes(8)
  raw_bytes.unpack("H*").first
end
|
72
|
+
|
73
|
+
# Creates the Monit directory (with parents) and restricts it to mode 0700.
def setup_monit_dir
  target = monit_dir
  FileUtils.mkdir_p(target)
  FileUtils.chmod(0700, target)
end
|
77
|
+
|
78
|
+
# Writes a "vcap:<random credential>" line to the credentials file unless
# the file already exists.
#
# @return [nil]
def setup_monit_user
  return if File.exist?(monit_user_file)

  setup_monit_dir
  File.open(monit_user_file, 'w') { |f| f.puts("vcap:#{random_credential}") }
end
|
86
|
+
|
87
|
+
# This and other methods could probably be refactored into a separate management class to avoid keeping
|
88
|
+
# all this state in a metaclass (as it's weird to test)
|
89
|
+
# Writes the Monit alert configuration (alert recipient, local mail server,
# event queue and mail format) to the alerts file. Does nothing unless
# Config.process_alerts is truthy.
#
# @return [nil]
def setup_alerts
  return unless Config.process_alerts

  # NOTE(review): the heredoc body is reproduced flush-left, exactly as it
  # appears in this view; confirm against upstream if leading whitespace
  # in the generated file matters.
  alert_rules = <<-CONFIG
set alert bosh@localhost
set mailserver 127.0.0.1 port #{Config.smtp_port}
username "#{Config.smtp_user}" password "#{Config.smtp_password}"

set eventqueue
basedir #{monit_events_dir}
slots 5000

set mail-format {
from: monit@localhost
subject: Monit Alert
message: Service: $SERVICE
Event: $EVENT
Action: $ACTION
Date: $DATE
Description: $DESCRIPTION
}
  CONFIG

  setup_monit_dir
  FileUtils.mkdir_p(monit_events_dir)

  File.open(monit_alerts_file, 'w') { |f| f.puts(alert_rules) }
end
|
119
|
+
|
120
|
+
# @return [String] path of the monit executable below the base directory
def monit_bin
  File.join(base_dir, *%w[bosh bin monit])
end
|
123
|
+
|
124
|
+
# @return [String] path of the monitrc file below the base directory
def monitrc
  File.join(base_dir, *%w[bosh etc monitrc])
end
|
127
|
+
|
128
|
+
# Asks Monit to reload and polls until its incarnation number increases.
#
# @return [nil] once a higher incarnation is observed
# @raise [StateError] when no higher incarnation shows up within
#   reload_timeout seconds
def reload
  previous = incarnation
  logger.info("Monit: old incarnation #{previous}")

  monit_reload_cmd
  logger.info("Monit: reload")

  started_at = Time.now.to_i
  loop do
    current = incarnation
    if current > previous
      logger.info("Monit: updated incarnation #{current}")
      return
    end
    sleep reload_incarnation_sleep
    break if Time.now.to_i - started_at > reload_timeout
  end

  # If we ever get here we have failed to get the new incarnation
  raise StateError, "Failed to get updated incarnation from Monit"
end
|
149
|
+
|
150
|
+
# @return [Integer] seconds #reload keeps polling for a new incarnation
def reload_timeout
  5 * 60
end
|
153
|
+
|
154
|
+
# Runs "<monit_bin> reload" in a subshell.
#
# @return [String] the command's stdout
def monit_reload_cmd
  # Neither the exit code nor the output is usable, so neither is checked.
  %x[#{monit_bin} reload]
end
|
158
|
+
|
159
|
+
# @return [Integer] seconds #reload sleeps between incarnation checks
def reload_incarnation_sleep
  5
end
|
162
|
+
|
163
|
+
# Tells Monit to stop monitoring every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] retry budget passed to retry_monit_request
def unmonitor_services(attempts=10)
  retry_monit_request(attempts) { |client| client.unmonitor(:group => BOSH_APP_GROUP) }
end
|
168
|
+
|
169
|
+
# Tells Monit to monitor every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] retry budget passed to retry_monit_request
def monitor_services(attempts=10)
  retry_monit_request(attempts) { |client| client.monitor(:group => BOSH_APP_GROUP) }
end
|
174
|
+
|
175
|
+
# Tells Monit to start every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] retry budget passed to retry_monit_request
def start_services(attempts=20)
  retry_monit_request(attempts) { |client| client.start(:group => BOSH_APP_GROUP) }
end
|
180
|
+
|
181
|
+
# Tells Monit to stop every service in BOSH_APP_GROUP.
#
# @param attempts [Integer] retry budget passed to retry_monit_request
def stop_services(attempts=20)
  retry_monit_request(attempts) { |client| client.stop(:group => BOSH_APP_GROUP) }
end
|
186
|
+
|
187
|
+
# Yields a Monit API client to the block, retrying while Monit is
# temporarily unreachable (it becomes unresponsive after a reload).
#
# @param attempts [Integer] maximum number of tries
# @yieldparam client [MonitClient] a freshly built API client
# @return [Object, nil] the block's value; nil when no block is given or
#   when every attempt ended in a refused connection or timeout (callers
#   such as #incarnation rely on that nil)
# @raise [StandardError] any other error; "Connection reset by peer" and
#   "Service Unavailable" are re-raised only after the attempts run out
def retry_monit_request(attempts=10)
  # HACK: Monit becomes unresponsive after reload
  begin
    yield monit_api_client if block_given?
  rescue Errno::ECONNREFUSED, Timeout::Error
    # Timeout::Error replaces the top-level TimeoutError alias, which no
    # longer exists on current Rubies and would raise NameError here.
    sleep 1
    logger.info("Monit Service Connection Refused: retrying")
    retry if (attempts -= 1) > 0
  rescue => e
    transient_messages = [
      "Connection reset by peer",
      "Service Unavailable"
    ]
    if transient_messages.include?(e.message)
      logger.info("Monit Service Unavailable (#{e.message}): retrying")
      sleep 1
      retry if (attempts -= 1) > 0
    end
    raise e
  end
end
|
208
|
+
|
209
|
+
# Polls Monit for its incarnation number, up to
# NUM_RETRY_MONIT_INCARNATION times with one second between tries.
#
# @return [Integer] the incarnation converted with to_i
# @raise [StateError] when no try yields an incarnation
def incarnation
  tries_left = NUM_RETRY_MONIT_INCARNATION
  while tries_left > 0
    info = monit_info
    return info[:incarnation].to_i if info && info[:incarnation]

    sleep 1
    tries_left -= 1
  end

  # If we ever get here we have failed to get incarnation
  raise StateError, "Failed to get incarnation from Monit"
end
|
221
|
+
|
222
|
+
# @return [Object, nil] the client's monit_info result, as passed back by
#   retry_monit_request
def monit_info
  retry_monit_request(&:monit_info)
end
|
225
|
+
|
226
|
+
# Fetches the status of the services in BOSH_APP_GROUP.
#
# @param num_retries [Integer] retry budget passed to retry_monit_request
# @return [Object] {} while Monit is not enabled, otherwise whatever
#   retry_monit_request returns
def get_status(num_retries=10)
  if @enabled
    retry_monit_request(num_retries) { |client| client.status(:group => BOSH_APP_GROUP) }
  else
    {}
  end
end
|
232
|
+
|
233
|
+
# Fetches the status entry of the system-type service.
#
# @param num_retries [Integer] retry budget passed to retry_monit_request
# @return [Object] {} while Monit is not enabled or when the client does
#   not answer with a Hash; otherwise the first value of that Hash (or
#   whatever retry_monit_request returns when it gives up)
def get_system_status(num_retries=10)
  return {} unless @enabled

  retry_monit_request(num_retries) do |client|
    report = client.status(:type => :system)
    # Leaves the whole method, not just the block.
    return {} unless Hash === report
    report.values.first
  end
end
|
241
|
+
|
242
|
+
# Extracts load, CPU, memory and swap figures from the system status.
# Missing sections yield nil values instead of raising.
#
# @param num_retries [Integer] retry budget passed to get_system_status
# @return [Hash] {} while Monit is not enabled or when the system status is
#   not a Hash; otherwise a Hash with "load", "cpu", "mem" and "swap" keys
def get_vitals(num_retries=10)
  return {} unless @enabled

  status = get_system_status(num_retries)
  return {} unless status.is_a?(Hash)

  system = (status[:raw] || {})["system"] || {}
  load_avg, cpu, memory, swap = %w[load cpu memory swap].map { |section| system[section] || {} }

  {
    "load" => %w[avg01 avg05 avg15].map { |window| load_avg[window] },
    "cpu" => { "user" => cpu["user"], "sys" => cpu["system"], "wait" => cpu["wait"] },
    "mem" => { "percent" => memory["percent"], "kb" => memory["kilobyte"] },
    "swap" => { "percent" => swap["percent"], "kb" => swap["kilobyte"] }
  }
end
|
261
|
+
|
262
|
+
# Summarises the state of the service group.
#
# @param num_retries [Integer] retry budget passed to get_status
# @return [String] "running", "starting", "failing", or "unknown" when the
#   status cannot be evaluated
def service_group_state(num_retries=10)
  # FIXME: state should be unknown if monit is disabled
  # However right now that would break director interaction
  # (at least in integration tests)
  return "running" unless @enabled

  status = get_status(num_retries)

  return "starting" if status.any? { |_, job_status| starting?(job_status) }

  # A service counts as healthy only when it is monitored AND reports
  # "running"; with monit_api a stopped service still reports "running".
  unhealthy = status.count do |_, data|
    data[:monitor] != :yes || data[:status][:message] != "running"
  end

  unhealthy.zero? ? "running" : "failing"
rescue => e
  logger.info("Unable to determine job state: #{e}")
  "unknown"
end
|
282
|
+
|
283
|
+
# @param status [Hash] status entry of a single service
# @return [Boolean] true when the entry's :monitor value is :init
def starting?(status)
  monitor_flag = status[:monitor]
  monitor_flag == :init
end
|
286
|
+
|
287
|
+
end
|
288
|
+
|
289
|
+
# Captures the agent logger for use by the instance methods.
def initialize
  agent_config = Bosh::Agent::Config
  @logger = agent_config.logger
end
|
292
|
+
|
293
|
+
# Runs #exec_monit on a new thread.
#
# @return [Thread] the thread executing Monit
def run
  Thread.new do
    exec_monit
  end
end
|
296
|
+
|
297
|
+
# Spawns Monit in the foreground, forwards its output to the logger until
# it exits, then reaps the process.
#
# @return [Integer, nil] the pid returned by Process.waitpid, or nil when
#   waiting failed
# @raise [StandardError] re-raises any failure after logging it and
#   terminating the child
def exec_monit
  status = nil
  pid = nil
  stdout_rd = stdout_wr = stderr_rd = stderr_wr = nil

  stdout_rd, stdout_wr = IO.pipe
  stderr_rd, stderr_wr = IO.pipe
  pid = Process.spawn("#{Monit.monit_bin} -I -c #{Monit.monitrc}", :in => :close, :out => stdout_wr, :err => stderr_wr)

  # The child holds its own copies of the write ends. Ours must be closed,
  # otherwise the readers never see EOF after Monit exits.
  stdout_wr.close
  stderr_wr.close

  at_exit {
    # Once the child has been reaped its pid may belong to another process.
    next unless status.nil?
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid) rescue nil
  }

  log_monit_output(stdout_rd, stderr_rd)

  status = Process.waitpid(pid) rescue nil
rescue => e
  @logger.error("Failed to run Monit: #{e.inspect} #{e.backtrace}")

  if status.nil?
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid) rescue nil
  end

  raise
ensure
  # Close whichever pipe ends exist and are still open; this also covers
  # the error path above.
  [stdout_rd, stdout_wr, stderr_rd, stderr_wr].each do |fd|
    next if fd.nil? || fd.closed?
    fd.close rescue nil
  end
end
|
326
|
+
|
327
|
+
# Forwards everything readable on the two streams to the logger, one
# "Monit: ..." entry per read, until both streams reach EOF. Each stream is
# closed when it reaches EOF.
#
# @param stdout [IO] read end of Monit's stdout pipe
# @param stderr [IO] read end of Monit's stderr pipe
# @return [nil]
def log_monit_output(stdout, stderr)
  pending = [stdout, stderr]

  until pending.empty?
    # Blocks (no timeout) until at least one stream is readable.
    readable, = IO.select(pending, [], pending, nil)

    readable.each do |io|
      chunk = ''
      begin
        chunk << io.readpartial(BUFSIZE)
      rescue Errno::EAGAIN, Errno::EINTR
        # Nothing to read after all; the stream stays in the pending set.
      rescue EOFError
        pending.delete(io)
        io.close
      end
      chunk.gsub!(/\n\Z/,'')
      @logger.info("Monit: #{chunk}")
    end
  end
end
|
351
|
+
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|