bosh_agent 1.5.0.pre.1113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/CHANGELOG +0 -0
  2. data/bin/bosh_agent +102 -0
  3. data/lib/bosh_agent/alert.rb +191 -0
  4. data/lib/bosh_agent/alert_processor.rb +96 -0
  5. data/lib/bosh_agent/apply_plan/helpers.rb +30 -0
  6. data/lib/bosh_agent/apply_plan/job.rb +235 -0
  7. data/lib/bosh_agent/apply_plan/package.rb +58 -0
  8. data/lib/bosh_agent/apply_plan/plan.rb +96 -0
  9. data/lib/bosh_agent/bootstrap.rb +341 -0
  10. data/lib/bosh_agent/config.rb +5 -0
  11. data/lib/bosh_agent/configuration.rb +102 -0
  12. data/lib/bosh_agent/disk_util.rb +103 -0
  13. data/lib/bosh_agent/errors.rb +25 -0
  14. data/lib/bosh_agent/ext.rb +48 -0
  15. data/lib/bosh_agent/file_aggregator.rb +78 -0
  16. data/lib/bosh_agent/file_matcher.rb +45 -0
  17. data/lib/bosh_agent/handler.rb +440 -0
  18. data/lib/bosh_agent/heartbeat.rb +74 -0
  19. data/lib/bosh_agent/heartbeat_processor.rb +45 -0
  20. data/lib/bosh_agent/http_handler.rb +135 -0
  21. data/lib/bosh_agent/infrastructure/aws/registry.rb +177 -0
  22. data/lib/bosh_agent/infrastructure/aws/settings.rb +59 -0
  23. data/lib/bosh_agent/infrastructure/aws.rb +17 -0
  24. data/lib/bosh_agent/infrastructure/dummy.rb +24 -0
  25. data/lib/bosh_agent/infrastructure/openstack/registry.rb +220 -0
  26. data/lib/bosh_agent/infrastructure/openstack/settings.rb +76 -0
  27. data/lib/bosh_agent/infrastructure/openstack.rb +17 -0
  28. data/lib/bosh_agent/infrastructure/vsphere/settings.rb +135 -0
  29. data/lib/bosh_agent/infrastructure/vsphere.rb +16 -0
  30. data/lib/bosh_agent/infrastructure.rb +25 -0
  31. data/lib/bosh_agent/message/apply.rb +184 -0
  32. data/lib/bosh_agent/message/base.rb +38 -0
  33. data/lib/bosh_agent/message/compile_package.rb +250 -0
  34. data/lib/bosh_agent/message/drain.rb +195 -0
  35. data/lib/bosh_agent/message/list_disk.rb +25 -0
  36. data/lib/bosh_agent/message/logs.rb +108 -0
  37. data/lib/bosh_agent/message/migrate_disk.rb +55 -0
  38. data/lib/bosh_agent/message/mount_disk.rb +102 -0
  39. data/lib/bosh_agent/message/ssh.rb +109 -0
  40. data/lib/bosh_agent/message/state.rb +47 -0
  41. data/lib/bosh_agent/message/unmount_disk.rb +29 -0
  42. data/lib/bosh_agent/monit.rb +354 -0
  43. data/lib/bosh_agent/monit_client.rb +158 -0
  44. data/lib/bosh_agent/mounter.rb +42 -0
  45. data/lib/bosh_agent/ntp.rb +32 -0
  46. data/lib/bosh_agent/platform/centos/disk.rb +27 -0
  47. data/lib/bosh_agent/platform/centos/network.rb +39 -0
  48. data/lib/bosh_agent/platform/centos/templates/centos-ifcfg.erb +9 -0
  49. data/lib/bosh_agent/platform/centos/templates/dhclient_conf.erb +56 -0
  50. data/lib/bosh_agent/platform/centos/templates/logrotate.erb +8 -0
  51. data/lib/bosh_agent/platform/centos.rb +4 -0
  52. data/lib/bosh_agent/platform/dummy/templates/dummy_template.erb +1 -0
  53. data/lib/bosh_agent/platform/linux/adapter.rb +36 -0
  54. data/lib/bosh_agent/platform/linux/disk.rb +121 -0
  55. data/lib/bosh_agent/platform/linux/logrotate.rb +32 -0
  56. data/lib/bosh_agent/platform/linux/network.rb +124 -0
  57. data/lib/bosh_agent/platform/linux/password.rb +22 -0
  58. data/lib/bosh_agent/platform/linux.rb +4 -0
  59. data/lib/bosh_agent/platform/ubuntu/network.rb +59 -0
  60. data/lib/bosh_agent/platform/ubuntu/templates/dhclient_conf.erb +56 -0
  61. data/lib/bosh_agent/platform/ubuntu/templates/interfaces.erb +14 -0
  62. data/lib/bosh_agent/platform/ubuntu/templates/logrotate.erb +8 -0
  63. data/lib/bosh_agent/platform/ubuntu.rb +4 -0
  64. data/lib/bosh_agent/platform.rb +26 -0
  65. data/lib/bosh_agent/remote_exception.rb +62 -0
  66. data/lib/bosh_agent/runner.rb +36 -0
  67. data/lib/bosh_agent/settings.rb +61 -0
  68. data/lib/bosh_agent/sigar_box.rb +26 -0
  69. data/lib/bosh_agent/smtp_server.rb +96 -0
  70. data/lib/bosh_agent/state.rb +100 -0
  71. data/lib/bosh_agent/syslog_monitor.rb +53 -0
  72. data/lib/bosh_agent/template.rb +50 -0
  73. data/lib/bosh_agent/util.rb +190 -0
  74. data/lib/bosh_agent/version.rb +8 -0
  75. data/lib/bosh_agent.rb +92 -0
  76. metadata +332 -0
@@ -0,0 +1,102 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+ require 'bosh_agent/mounter'
3
+
4
module Bosh::Agent
  module Message
    # Handler for the "mount_disk" agent message.
    #
    # Looks up the device for a disk CID, waits until the device can be read,
    # partitions and formats it when it has no first partition yet, and then
    # mounts that partition under the agent base directory.
    class MountDisk < Base
      # Number of retries after the first failed read. Reads are one second
      # apart, so a disk gets READ_DISK_ATTEMPTS + 1 tries in total.
      READ_DISK_ATTEMPTS = 300

      def self.long_running?; true; end

      # @param args [Array] message arguments; the first element is the disk CID
      # @return [Hash, nil] see #mount
      def self.process(args)
        new(args).mount
      end

      # @param args [Array] message arguments; the first element is the disk CID
      def initialize(args)
        @cid = args.first
      end

      # Reloads the agent settings and sets the disk up.
      #
      # @return [Hash, nil] an empty hash once the disk is mounted, nil when
      #   Bosh::Agent::Config.configure is falsy (nothing is done in that case)
      def mount
        return unless Bosh::Agent::Config.configure

        update_settings
        logger.info("MountDisk: #{@cid} - #{settings['disks'].inspect}")

        setup_disk
      end

      # Replaces the cached agent settings with a freshly loaded copy.
      def update_settings
        Bosh::Agent::Config.settings = Bosh::Agent::Settings.load
      end

      # Waits for the disk, makes sure its first partition exists and carries a
      # file system, then mounts it.
      #
      # @return [Hash] always an empty hash
      # @raise [Bosh::Agent::MessageHandlerError] when the disk never becomes
      #   readable, cannot be formatted, or mke2fs fails
      def setup_disk
        disk = Bosh::Agent::Config.platform.lookup_disk_by_cid(@cid)
        partition = "#{disk}1"

        logger.info("setup disk settings: #{settings.inspect}")

        wait_for_disk(disk)

        if File.blockdev?(disk) && DiskUtil.ensure_no_partition?(disk, partition)
          partition_and_format(disk, partition)
        elsif File.blockdev?(partition)
          # Already partitioned: reuse it as is
          logger.info("Found existing partition on #{disk}")
        else
          raise Bosh::Agent::MessageHandlerError, "Unable to format #{disk}"
        end

        mount_persistent_disk(partition)
        {}
      end

      # Mounts the partition on <base_dir>/store, or on the migration target
      # directory when <base_dir>/store is already a mount point.
      #
      # @param partition [String] device path of the partition to mount
      def mount_persistent_disk(partition)
        store_mountpoint = File.join(base_dir, 'store')

        if Pathname.new(store_mountpoint).mountpoint?
          logger.info("Mounting persistent disk store migration target")
          # NOTE(review): the directory name is spelled "migraton"; it is kept
          # byte-for-byte because other code may rely on the same path - verify
          # before renaming.
          mountpoint = File.join(base_dir, 'store_migraton_target')
        else
          logger.info("Mounting persistent disk store")
          mountpoint = store_mountpoint
        end

        FileUtils.mkdir_p(mountpoint)
        FileUtils.chmod(0700, mountpoint)

        Mounter.new(logger).mount(partition, mountpoint)
      end

      private

      # Blocks until the first 512 bytes of the disk can be read, retrying once
      # per second. Every failure is logged together with its cause so that a
      # disk which never shows up can be diagnosed from the agent log.
      #
      # @raise [Bosh::Agent::MessageHandlerError] when all attempts fail
      def wait_for_disk(disk)
        READ_DISK_ATTEMPTS.downto(0) do |attempts_left|
          begin
            disk_data = File.read(disk, 512)

            # An all-zero first sector means the partition table is blank
            if disk_data == "\x00"*512
              logger.info("Found blank disk #{disk}")
            else
              logger.info("Disk has partition table")
              logger.info(`sfdisk -Llq #{disk} 2> /dev/null`)
            end
            return
          rescue => e
            logger.info("Re-trying reading from #{disk} (#{e.class}: #{e.message})")
          end

          if attempts_left == 0
            raise Bosh::Agent::MessageHandlerError, "Unable to read from new disk"
          end
          sleep 1
        end
      end

      # Creates one partition spanning the whole disk and an ext4 file system
      # on it.
      #
      # @raise [Bosh::Agent::MessageHandlerError] when mke2fs does not succeed
      def partition_and_format(disk, partition)
        full_disk = ",,L\n"
        logger.info("Partitioning #{disk}")

        Bosh::Agent::Util.partition_disk(disk, full_disk)

        mke2fs_options = ["-t ext4", "-j"]
        mke2fs_options << "-E lazy_itable_init=1" if Bosh::Agent::Util.lazy_itable_init_enabled?
        `/sbin/mke2fs #{mke2fs_options.join(" ")} #{partition}`
        unless $?.success?
          raise Bosh::Agent::MessageHandlerError, "Failed create file system (#{$?.exitstatus})"
        end
      end
    end

  end
end
@@ -0,0 +1,109 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+
3
module Bosh::Agent
  module Message
    # Handler for the "ssh" agent message. Supports two commands:
    # "setup" creates a temporary login user with an authorized public key,
    # "cleanup" removes such users again.
    class Ssh < Base
      SSH_USER_PREFIX = "bosh_"

      # User names are interpolated into shell command lines, so only names
      # made of these characters (and not starting with "-" or ".") are
      # accepted.
      VALID_USER_NAME = /\A[A-Za-z0-9_][A-Za-z0-9._-]*\z/

      # @param args [Array] two elements: the command name and its params hash
      # @return [Hash, nil] the command result; nil for an unknown command
      def self.process(args)
        ssh = new(args)
        case ssh.command
        when "setup" then ssh.setup
        when "cleanup" then ssh.cleanup
        end
      end

      def base_dir
        Bosh::Agent::Config.base_dir
      end

      # Directory under which home directories of ssh users are created.
      def ssh_base_dir
        File.join(base_dir, "bosh_ssh")
      end

      attr_reader :command

      # @param args [Array] two elements: the command name and its params hash
      def initialize(args)
        @command, @params = args
      end

      # Runs a command line through the shell with stderr merged into stdout.
      #
      # @param cmd [String] complete shell command line
      # @return [nil]
      # @raise [RuntimeError] when the command does not exit with status 0
      def shell_cmd(cmd)
        shell_output = %x[#{cmd} 2>&1]
        raise "'#{cmd}' failed, error: #{shell_output}" unless $?.success?
      end

      # Creates the user named in the params, adds it to the admin and vcap
      # groups and installs the given public key.
      #
      # The user name and password come from the request and end up in shell
      # command lines, so both are validated before any command is built.
      #
      # @return [Hash] "status" is "success" (with the agent "ip") or
      #   "failure" (with an "error" message); errors are never raised
      def setup
        user = @params["user"]
        password = @params["password"]
        logger.info("Setting up ssh for user #{user}")

        validate_user_name(user)
        validate_password(password)

        shell_cmd(%Q[mkdir -p #{ssh_base_dir}])

        if password
          shell_cmd(%Q[useradd -m -b #{ssh_base_dir} -s /bin/bash -p '#{password}' #{user}])
        else
          shell_cmd(%Q[useradd -m -b #{ssh_base_dir} -s /bin/bash #{user}])
        end

        # Add user to admin and vcap group
        shell_cmd(%Q[usermod -G admin,vcap #{user}])

        # Add public key to authorized keys
        ssh_dir = File.join(ssh_base_dir, user, ".ssh")
        FileUtils.mkdir_p(ssh_dir)

        File.open(File.join(ssh_dir, "authorized_keys"), "w+") do |f|
          f.write(@params["public_key"])
        end
        FileUtils.chown_R(user, user, ssh_dir)

        {"command" => @command, "status" => "success", "ip" => Bosh::Agent::Config.default_ip}
      rescue => e
        {"command" => @command, "status" => "failure", "error" => e.message}
      end

      # Deletes every user in /etc/passwd whose name matches the "user_regex"
      # param and starts with SSH_USER_PREFIX.
      #
      # CLI calls this function under the following 2 scenarios
      # 1. When it wants to cleanup a single user after an interactive
      #    session or after executing a remote command. In this case
      #    the "user_regex" would match a single user i.e. "^user_name$"
      # 2. CLI has a special option called "cleanup all ssh users", in this
      #    case the "user_regex" is more generic like "^user_name_prefix"
      #
      # Irrespective of the scenarios above, we don't fully trust the
      # "user_regex" and will cull the list of users to those that match
      # SSH_USER_PREFIX
      #
      # @return [Hash] "status" is "success", "bad_user" (no regex given) or
      #   "failure" (with an "error" message); errors are never raised
      def cleanup
        return {"command" => @command, "status" => "bad_user"} if @params["user_regex"].nil?

        # Compiled once; an invalid pattern raises here and is reported as a failure
        user_regex = /#{@params["user_regex"]}/

        # The user name is everything before the first ":" of a passwd entry
        names = File.readlines("/etc/passwd").map { |entry| entry[/(^.*?):/, 1] }.compact
        users = names.select { |name| user_regex =~ name }

        users.each do |user|
          # can't trust the user_regex completely, so skip unexpected users
          next unless user.start_with?(SSH_USER_PREFIX)
          logger.info("deleting user #{user}")
          shell_cmd(%Q[userdel -r #{user}])
        end

        {"command" => @command, "status" => "success"}
      rescue => e
        {"command" => @command, "status" => "failure", "error" => e.message}
      end

      private

      # @raise [ArgumentError] when the name could alter the shell command it
      #   is interpolated into
      def validate_user_name(user)
        return if user.is_a?(String) && user =~ VALID_USER_NAME
        raise ArgumentError, "invalid ssh user name: #{user.inspect}"
      end

      # The password is passed to useradd inside single quotes, so a single
      # quote is the only character that could break out of the argument.
      #
      # @raise [ArgumentError] when the password contains a single quote
      def validate_password(password)
        return unless password.to_s.include?("'")
        raise ArgumentError, "invalid ssh password: must not contain a single quote"
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+
3
module Bosh::Agent
  module Message
    # Handler for the "state" agent message: reports the stored agent state
    # enriched with job state, protocol version, NTP offset and, on request,
    # system vitals.
    class State < Base

      # @param args [Array, nil] message arguments; including "full" asks for vitals
      # @return [Hash] see #state
      def self.process(args)
        new(args).state
      end

      # @param args [Array, nil] message arguments; anything but an Array is ignored
      def initialize(args = nil)
        @full_format = true if args.is_a?(Array) && args.include?("full")
      end

      # Builds the state response.
      #
      # @return [Hash] the stored state plus "agent_id" and "vm" (when settings
      #   are available), "job_state", "bosh_protocol", "ntp" and, in full
      #   format, "vitals"
      # @raise [Bosh::Agent::MessageHandlerError] wrapping any StateError
      def state
        response = Bosh::Agent::Config.state.to_hash

        logger.info("Agent state: #{response.inspect}")

        %w[agent_id vm].each { |key| response[key] = settings[key] } if settings

        response.merge!(
          "job_state" => job_state,
          "bosh_protocol" => Bosh::Agent::BOSH_PROTOCOL,
          "ntp" => Bosh::Agent::NTP.offset
        )

        if @full_format
          vitals = Bosh::Agent::Monit.get_vitals
          response["vitals"] = vitals
          vitals["disk"] = Bosh::Agent::DiskUtil.get_usage
        end

        response

      rescue Bosh::Agent::StateError => e
        raise Bosh::Agent::MessageHandlerError, e
      end

      # @return the aggregated state Monit reports for the job's services
      def job_state
        Bosh::Agent::Monit.service_group_state
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'bosh_agent/disk_util'
2
+
3
module Bosh::Agent
  module Message
    # Handler for the "unmount_disk" agent message: unmounts the first
    # partition of the disk identified by a CID.
    class UnmountDisk < Base

      def self.long_running?; true; end

      # @param args [Array] message arguments; the first element is the disk CID
      # @return [Hash] see #unmount
      def self.process(args)
        new.unmount(args)
      end

      # Unmounts the disk's first partition if it is currently mounted.
      #
      # The mount table is consulted exactly once, so the entry that was
      # checked is the entry that gets unmounted.
      #
      # @param args [Array] message arguments; the first element is the disk CID
      # @return [Hash] a :message describing what was unmounted, or stating
      #   that no mount was found for the partition
      def unmount(args)
        cid = args.first
        disk = Bosh::Agent::Config.platform.lookup_disk_by_cid(cid)
        partition = "#{disk}1"

        mount_entry = DiskUtil.mount_entry(partition)
        unless mount_entry
          return {:message => "Unknown mount for partition: #{partition}"}
        end

        # The entry starts with the block device followed by its mount point
        @block, @mountpoint = mount_entry.split
        DiskUtil.umount_guard(@mountpoint)

        message = "Unmounted #{@block} on #{@mountpoint}"
        logger.info(message)
        {:message => message}
      end
    end
  end
end
@@ -0,0 +1,354 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
require 'timeout'
require 'bosh_agent/monit_client'
3
+
4
+ module Bosh::Agent
5
+ # A good chunk of this code is lifted from the implementation of POSIX::Spawn::Child
6
+ class Monit
7
+ BUFSIZE = (32 * 1024)
8
+ NUM_RETRY_MONIT_INCARNATION = 60
9
+ NUM_RETRY_MONIT_WAIT_INCARNATION = 15
10
+
11
class << self
  # Whether Monit integration is switched on; the status queries below return
  # fixed defaults while it is not.
  attr_accessor :enabled

  # Switches Monit integration on. Meant to be called at the very beginning
  # of agent startup. Ideally this class should be refactored to minimize
  # the number of singleton methods having to keep track of the state.
  def enable
    @enabled = true
  end

  # Creates an instance and runs Monit in a background thread (see #run).
  def start
    new.run
  end

  def base_dir
    Bosh::Agent::Config.base_dir
  end

  def logger
    Bosh::Agent::Config.logger
  end

  def monit_dir
    File.join(base_dir, 'monit')
  end

  def monit_events_dir
    File.join(monit_dir, 'events')
  end

  def monit_user_file
    File.join(monit_dir, 'monit.user')
  end

  def monit_alerts_file
    File.join(monit_dir, 'alerts.monitrc')
  end

  def smtp_port
    Bosh::Agent::Config.smtp_port
  end

  # Reads the Monit API credentials for the BOSH_APP_GROUP user from the
  # monit user file, which holds "user:credential" lines.
  #
  # Only a line for exactly that user matches, and only the first ":" is
  # treated as the separator, so a credential may itself contain ":".
  #
  # @return [Array<String>] user name and credential
  # @raise [StateError] when the file has no line for that user
  def monit_credentials
    user_prefix = "#{BOSH_APP_GROUP}:"
    entry = File.read(monit_user_file).lines.find { |line| line.start_with?(user_prefix) }
    if entry.nil?
      raise StateError, "No Monit credentials for '#{BOSH_APP_GROUP}' in #{monit_user_file}"
    end

    user, cred = entry.split(':', 2)
    [user, cred.strip]
  end

  # Builds a client for the local Monit HTTP API.
  def monit_api_client
    # Primarily for CI - normally done during configure
    unless Bosh::Agent::Config.configure
      setup_monit_user
    end

    user, cred = monit_credentials
    MonitClient.new("https://#{user}:#{cred}@127.0.0.1:2822", :logger => logger)
  end

  # @return [String] 16 hexadecimal characters from 8 random bytes
  def random_credential
    OpenSSL::Random.random_bytes(8).unpack("H*")[0]
  end

  # Creates the monit directory and restricts it to its owner.
  def setup_monit_dir
    FileUtils.mkdir_p(monit_dir)
    FileUtils.chmod(0700, monit_dir)
  end

  # Writes the monit user file with a random credential unless it exists.
  def setup_monit_user
    unless File.exist?(monit_user_file)
      setup_monit_dir
      File.open(monit_user_file, 'w') do |f|
        f.puts("vcap:#{random_credential}")
      end
    end
  end

  # Writes the Monit alert configuration (mail server, event queue and mail
  # format). Does nothing unless Config.process_alerts is set.
  #
  # This and other methods could probably be refactored into a separate management class to avoid keeping
  # all this state in a metaclass (as it's weird to test)
  def setup_alerts
    return unless Config.process_alerts

    alerts_config = <<-CONFIG
    set alert bosh@localhost
    set mailserver 127.0.0.1 port #{Config.smtp_port}
        username "#{Config.smtp_user}" password "#{Config.smtp_password}"

    set eventqueue
        basedir #{monit_events_dir}
        slots 5000

    set mail-format {
      from: monit@localhost
      subject: Monit Alert
      message: Service: $SERVICE
      Event: $EVENT
      Action: $ACTION
      Date: $DATE
      Description: $DESCRIPTION
    }
    CONFIG

    setup_monit_dir
    FileUtils.mkdir_p(monit_events_dir)

    File.open(monit_alerts_file, 'w') do |f|
      f.puts(alerts_config)
    end
  end

  def monit_bin
    File.join(base_dir, 'bosh', 'bin', 'monit')
  end

  def monitrc
    File.join(base_dir, 'bosh', 'etc', 'monitrc')
  end

  # Asks Monit to reload and waits until it reports a higher incarnation
  # than before the reload.
  #
  # @raise [StateError] when no newer incarnation shows up within
  #   reload_timeout seconds
  def reload
    old_incarnation = incarnation
    logger.info("Monit: old incarnation #{old_incarnation}")

    monit_reload_cmd
    logger.info("Monit: reload")

    reload_start = Time.now.to_i
    loop do
      check_incarnation = incarnation
      if old_incarnation < check_incarnation
        logger.info("Monit: updated incarnation #{check_incarnation}")
        return
      end
      sleep reload_incarnation_sleep
      break if Time.now.to_i > (reload_start + reload_timeout)
    end

    # If we ever get here we have failed to get the new incarnation
    raise StateError, "Failed to get updated incarnation from Monit"
  end

  # Seconds #reload waits for a new incarnation.
  def reload_timeout
    300
  end

  def monit_reload_cmd
    # Neither the exit code nor the output is usable
    `#{monit_bin} reload`
  end

  # Seconds #reload sleeps between incarnation checks.
  def reload_incarnation_sleep
    5
  end

  def unmonitor_services(attempts=10)
    retry_monit_request(attempts) do |client|
      client.unmonitor(:group => BOSH_APP_GROUP)
    end
  end

  def monitor_services(attempts=10)
    retry_monit_request(attempts) do |client|
      client.monitor(:group => BOSH_APP_GROUP)
    end
  end

  def start_services(attempts=20)
    retry_monit_request(attempts) do |client|
      client.start(:group => BOSH_APP_GROUP)
    end
  end

  def stop_services(attempts=20)
    retry_monit_request(attempts) do |client|
      client.stop(:group => BOSH_APP_GROUP)
    end
  end

  # Yields a Monit API client to the block, retrying once per second while
  # Monit refuses connections, times out, or answers with one of the known
  # transient errors.
  #
  # @param attempts [Integer] maximum number of tries
  # @return the block's value; nil when every attempt ended in a refused
  #   connection or timeout (callers such as #incarnation rely on that nil)
  # @raise the last error when a transient error outlasts all attempts, and
  #   any other error immediately
  def retry_monit_request(attempts=10)
    # HACK: Monit becomes unresponsive after reload
    begin
      yield monit_api_client if block_given?
    rescue Errno::ECONNREFUSED, Timeout::Error
      sleep 1
      logger.info("Monit Service Connection Refused: retrying")
      retry if (attempts -= 1) > 0
    rescue => e
      messages = [
        "Connection reset by peer",
        "Service Unavailable"
      ]
      if messages.include?(e.message)
        logger.info("Monit Service Unavailable (#{e.message}): retrying")
        sleep 1
        retry if (attempts -= 1) > 0
      end
      raise
    end
  end

  # @return [Integer] Monit's current incarnation
  # @raise [StateError] when Monit does not report one after
  #   NUM_RETRY_MONIT_INCARNATION tries, one second apart
  def incarnation
    NUM_RETRY_MONIT_INCARNATION.times do
      info = monit_info
      if info && info[:incarnation]
        return info[:incarnation].to_i
      end
      sleep 1
    end

    # If we ever get here we have failed to get incarnation
    raise StateError, "Failed to get incarnation from Monit"
  end

  def monit_info
    retry_monit_request { |client| client.monit_info }
  end

  # @return the status of all services in BOSH_APP_GROUP as reported by the
  #   client; an empty hash while Monit is not enabled
  def get_status(num_retries=10)
    return {} unless @enabled
    retry_monit_request(num_retries) do |client|
      client.status(:group => BOSH_APP_GROUP)
    end
  end

  # @return the first system-type status entry; an empty hash while Monit is
  #   not enabled or when the client does not return a Hash
  def get_system_status(num_retries=10)
    return {} unless @enabled
    retry_monit_request(num_retries) do |client|
      system_status = client.status(:type => :system)
      return {} unless system_status.is_a?(Hash)
      system_status.values.first
    end
  end

  # @return [Hash] "load", "cpu", "mem" and "swap" figures taken from the
  #   system status; an empty hash while Monit is not enabled or when no
  #   system status is available. Missing figures are nil.
  def get_vitals(num_retries=10)
    return {} unless @enabled
    status = get_system_status(num_retries)
    return {} unless status.is_a?(Hash)

    raw_data = status[:raw] || {}
    sys_data = raw_data["system"] || {}
    loadavg = sys_data["load"] || {}
    cpu = sys_data["cpu"] || {}
    mem = sys_data["memory"] || {}
    swap = sys_data["swap"] || {}

    {
      "load" => [ loadavg["avg01"], loadavg["avg05"], loadavg["avg15"] ],
      "cpu" => { "user" => cpu["user"], "sys" => cpu["system"], "wait" => cpu["wait"] },
      "mem" => { "percent" => mem["percent"], "kb" => mem["kilobyte"] },
      "swap" => { "percent" => swap["percent"], "kb" => swap["kilobyte"] }
    }
  end

  # Summarizes the state of all services in BOSH_APP_GROUP.
  #
  # @return [String] "starting" when any service is still initializing,
  #   "running" when every service is monitored and running, "failing"
  #   otherwise, "unknown" when the state cannot be determined
  def service_group_state(num_retries=10)
    # FIXME: state should be unknown if monit is disabled
    # However right now that would break director interaction
    # (at least in integration tests)
    return "running" unless @enabled
    status = get_status(num_retries)

    return "starting" if status.any? { |_, job_status| starting?(job_status) }

    not_running = status.reject do |_name, data|
      # A service only counts as running while it is also monitored:
      # at least with monit_api a stopped service is still reported as running
      (data[:monitor] == :yes && data[:status][:message] == "running")
    end

    not_running.empty? ? "running" : "failing"
  rescue => e
    logger.info("Unable to determine job state: #{e}")
    "unknown"
  end

  # @return [Boolean] whether the service status says monitoring is initializing
  def starting?(status)
    status[:monitor] == :init
  end

end
288
+
289
# Remembers the agent-wide logger for the Monit runner.
def initialize
  agent_config = Bosh::Agent::Config
  @logger = agent_config.logger
end
292
+
293
# Runs Monit in a new background thread.
#
# @return [Thread] the thread executing #exec_monit
def run
  Thread.new do
    exec_monit
  end
end
296
+
297
# Spawns Monit in the foreground ("-I") with the agent's monitrc, forwards
# its output to the logger until it exits, and reaps the process.
#
# @return [Integer, nil] the pid returned by waitpid, nil when reaping failed
# @raise any error raised while spawning or reading; it is logged first and
#   a Monit process that is still running is terminated
def exec_monit
  status = nil
  pid = nil
  stdout_rd = stdout_wr = stderr_rd = stderr_wr = nil

  stdout_rd, stdout_wr = IO.pipe
  stderr_rd, stderr_wr = IO.pipe
  pid = Process.spawn("#{Monit.monit_bin} -I -c #{Monit.monitrc}", :in => :close, :out => stdout_wr, :err => stderr_wr)

  # The child holds its own copies of the write ends. Ours must be closed,
  # otherwise the read ends never reach EOF once Monit exits.
  stdout_wr.close
  stderr_wr.close

  at_exit do
    # Only signal a process that has not been reaped yet; afterwards the pid
    # may belong to something else. Best effort: failures are ignored.
    if status.nil?
      Process.kill('TERM', pid) rescue nil
      Process.waitpid(pid) rescue nil
    end
  end

  log_monit_output(stdout_rd, stderr_rd)

  status = Process.waitpid(pid) rescue nil
rescue => e
  @logger.error("Failed to run Monit: #{e.inspect} #{e.backtrace}")

  # Best effort cleanup of a Monit that was started but not reaped
  if status.nil? && pid
    Process.kill('TERM', pid) rescue nil
    Process.waitpid(pid) rescue nil
  end

  raise
ensure
  [stdout_rd, stdout_wr, stderr_rd, stderr_wr].each do |fd|
    fd.close if fd && !fd.closed?
  end
end
326
+
327
# Forwards everything readable from the two streams to the logger, one log
# entry per chunk read, until both streams have reached EOF. Each stream is
# closed when it reaches EOF. Nothing is logged when a read yields no data.
#
# @param stdout [IO] read end of Monit's standard output
# @param stderr [IO] read end of Monit's standard error
def log_monit_output(stdout, stderr)
  readers = [stdout, stderr]

  until readers.empty?
    readable, = IO.select(readers, [], readers)

    readable.each do |fd|
      begin
        chunk = fd.readpartial(BUFSIZE)
      rescue Errno::EAGAIN, Errno::EINTR
        # Nothing to read after all; select again
        next
      rescue EOFError
        readers.delete(fd)
        fd.close
        next
      end

      @logger.info("Monit: #{chunk.gsub(/\n\Z/, '')}")
    end
  end
end
351
+
352
+ end
353
+ end
354
+