bosh-director 1.5.0.pre.1113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +34 -0
- data/bin/bosh-director +36 -0
- data/bin/bosh-director-console +84 -0
- data/bin/bosh-director-drain-workers +42 -0
- data/bin/bosh-director-migrate +58 -0
- data/bin/bosh-director-scheduler +27 -0
- data/bin/bosh-director-worker +76 -0
- data/db/migrations/README +1 -0
- data/db/migrations/director/20110209010747_initial.rb +118 -0
- data/db/migrations/director/20110406055800_add_task_user.rb +9 -0
- data/db/migrations/director/20110518225809_remove_cid_constrain.rb +13 -0
- data/db/migrations/director/20110617211923_add_deployments_release_versions.rb +32 -0
- data/db/migrations/director/20110622212607_add_task_checkpoint_timestamp.rb +9 -0
- data/db/migrations/director/20110628023039_add_state_to_instances.rb +21 -0
- data/db/migrations/director/20110709012332_add_disk_size_to_instances.rb +9 -0
- data/db/migrations/director/20110906183441_add_log_bundles.rb +11 -0
- data/db/migrations/director/20110907194830_add_logs_json_to_templates.rb +9 -0
- data/db/migrations/director/20110915205610_add_persistent_disks.rb +51 -0
- data/db/migrations/director/20111005180929_add_properties.rb +14 -0
- data/db/migrations/director/20111110024617_add_deployment_problems.rb +24 -0
- data/db/migrations/director/20111216214145_recreate_support_for_vms.rb +9 -0
- data/db/migrations/director/20120102084027_add_credentials_to_vms.rb +7 -0
- data/db/migrations/director/20120427235217_allow_multiple_releases_per_deployment.rb +36 -0
- data/db/migrations/director/20120524175805_add_task_type.rb +44 -0
- data/db/migrations/director/20120614001930_delete_redundant_deployment_release_relation.rb +34 -0
- data/db/migrations/director/20120822004528_add_fingerprint_to_templates_and_packages.rb +17 -0
- data/db/migrations/director/20120830191244_add_properties_to_templates.rb +9 -0
- data/db/migrations/director/20121106190739_persist_vm_env.rb +9 -0
- data/db/migrations/director/20130222232131_add_sha1_to_stemcells.rb +9 -0
- data/db/migrations/director/20130312211407_add_commit_hash_to_release_versions.rb +19 -0
- data/db/migrations/director/20130409235338_snapshot.rb +15 -0
- data/db/migrations/director/20130530164918_add_paused_flag_to_instance.rb +14 -0
- data/db/migrations/director/20130531172604_add_director_attributes.rb +13 -0
- data/db/migrations/dns/20120123234908_initial.rb +27 -0
- data/lib/bosh/director.rb +133 -0
- data/lib/bosh/director/agent_client.rb +78 -0
- data/lib/bosh/director/api.rb +29 -0
- data/lib/bosh/director/api/api_helper.rb +81 -0
- data/lib/bosh/director/api/backup_manager.rb +15 -0
- data/lib/bosh/director/api/controller.rb +639 -0
- data/lib/bosh/director/api/controller_helpers.rb +34 -0
- data/lib/bosh/director/api/deployment_lookup.rb +13 -0
- data/lib/bosh/director/api/deployment_manager.rb +60 -0
- data/lib/bosh/director/api/http_constants.rb +16 -0
- data/lib/bosh/director/api/instance_lookup.rb +44 -0
- data/lib/bosh/director/api/instance_manager.rb +63 -0
- data/lib/bosh/director/api/problem_manager.rb +40 -0
- data/lib/bosh/director/api/property_manager.rb +69 -0
- data/lib/bosh/director/api/release_manager.rb +59 -0
- data/lib/bosh/director/api/resource_manager.rb +69 -0
- data/lib/bosh/director/api/resurrector_manager.rb +15 -0
- data/lib/bosh/director/api/snapshot_manager.rb +94 -0
- data/lib/bosh/director/api/stemcell_manager.rb +50 -0
- data/lib/bosh/director/api/task_helper.rb +46 -0
- data/lib/bosh/director/api/task_manager.rb +64 -0
- data/lib/bosh/director/api/user_manager.rb +72 -0
- data/lib/bosh/director/api/vm_state_manager.rb +11 -0
- data/lib/bosh/director/app.rb +35 -0
- data/lib/bosh/director/blob_util.rb +87 -0
- data/lib/bosh/director/blobstores.rb +29 -0
- data/lib/bosh/director/client.rb +156 -0
- data/lib/bosh/director/cloudcheck_helper.rb +204 -0
- data/lib/bosh/director/compile_task.rb +157 -0
- data/lib/bosh/director/config.rb +370 -0
- data/lib/bosh/director/configuration_hasher.rb +114 -0
- data/lib/bosh/director/cycle_helper.rb +36 -0
- data/lib/bosh/director/db_backup.rb +22 -0
- data/lib/bosh/director/db_backup/adapter.rb +3 -0
- data/lib/bosh/director/db_backup/adapter/mysql2.rb +27 -0
- data/lib/bosh/director/db_backup/adapter/postgres.rb +36 -0
- data/lib/bosh/director/db_backup/adapter/sqlite.rb +17 -0
- data/lib/bosh/director/db_backup/error.rb +10 -0
- data/lib/bosh/director/deployment_plan.rb +26 -0
- data/lib/bosh/director/deployment_plan/assembler.rb +430 -0
- data/lib/bosh/director/deployment_plan/compilation_config.rb +54 -0
- data/lib/bosh/director/deployment_plan/compiled_package.rb +35 -0
- data/lib/bosh/director/deployment_plan/dynamic_network.rb +91 -0
- data/lib/bosh/director/deployment_plan/idle_vm.rb +109 -0
- data/lib/bosh/director/deployment_plan/instance.rb +413 -0
- data/lib/bosh/director/deployment_plan/job.rb +470 -0
- data/lib/bosh/director/deployment_plan/manual_network.rb +137 -0
- data/lib/bosh/director/deployment_plan/network.rb +74 -0
- data/lib/bosh/director/deployment_plan/network_subnet.rb +167 -0
- data/lib/bosh/director/deployment_plan/planner.rb +288 -0
- data/lib/bosh/director/deployment_plan/preparer.rb +52 -0
- data/lib/bosh/director/deployment_plan/release.rb +126 -0
- data/lib/bosh/director/deployment_plan/resource_pool.rb +143 -0
- data/lib/bosh/director/deployment_plan/resource_pools.rb +68 -0
- data/lib/bosh/director/deployment_plan/stemcell.rb +56 -0
- data/lib/bosh/director/deployment_plan/template.rb +94 -0
- data/lib/bosh/director/deployment_plan/update_config.rb +80 -0
- data/lib/bosh/director/deployment_plan/updater.rb +55 -0
- data/lib/bosh/director/deployment_plan/vip_network.rb +79 -0
- data/lib/bosh/director/dns_helper.rb +204 -0
- data/lib/bosh/director/download_helper.rb +44 -0
- data/lib/bosh/director/duration.rb +36 -0
- data/lib/bosh/director/encryption_helper.rb +10 -0
- data/lib/bosh/director/errors.rb +198 -0
- data/lib/bosh/director/event_log.rb +136 -0
- data/lib/bosh/director/ext.rb +64 -0
- data/lib/bosh/director/hash_string_vals.rb +13 -0
- data/lib/bosh/director/instance_deleter.rb +109 -0
- data/lib/bosh/director/instance_updater.rb +506 -0
- data/lib/bosh/director/ip_util.rb +67 -0
- data/lib/bosh/director/job_queue.rb +16 -0
- data/lib/bosh/director/job_runner.rb +162 -0
- data/lib/bosh/director/job_updater.rb +121 -0
- data/lib/bosh/director/jobs/backup.rb +86 -0
- data/lib/bosh/director/jobs/base_job.rb +66 -0
- data/lib/bosh/director/jobs/cloud_check/apply_resolutions.rb +46 -0
- data/lib/bosh/director/jobs/cloud_check/scan.rb +38 -0
- data/lib/bosh/director/jobs/cloud_check/scan_and_fix.rb +73 -0
- data/lib/bosh/director/jobs/create_snapshot.rb +23 -0
- data/lib/bosh/director/jobs/delete_deployment.rb +183 -0
- data/lib/bosh/director/jobs/delete_deployment_snapshots.rb +34 -0
- data/lib/bosh/director/jobs/delete_release.rb +219 -0
- data/lib/bosh/director/jobs/delete_snapshots.rb +23 -0
- data/lib/bosh/director/jobs/delete_stemcell.rb +102 -0
- data/lib/bosh/director/jobs/fetch_logs.rb +99 -0
- data/lib/bosh/director/jobs/scheduled_backup.rb +38 -0
- data/lib/bosh/director/jobs/snapshot_deployment.rb +61 -0
- data/lib/bosh/director/jobs/snapshot_deployments.rb +23 -0
- data/lib/bosh/director/jobs/snapshot_self.rb +43 -0
- data/lib/bosh/director/jobs/ssh.rb +59 -0
- data/lib/bosh/director/jobs/update_deployment.rb +110 -0
- data/lib/bosh/director/jobs/update_release.rb +672 -0
- data/lib/bosh/director/jobs/update_stemcell.rb +109 -0
- data/lib/bosh/director/jobs/vm_state.rb +89 -0
- data/lib/bosh/director/lock.rb +133 -0
- data/lib/bosh/director/lock_helper.rb +92 -0
- data/lib/bosh/director/models.rb +29 -0
- data/lib/bosh/director/models/compiled_package.rb +33 -0
- data/lib/bosh/director/models/deployment.rb +22 -0
- data/lib/bosh/director/models/deployment_problem.rb +49 -0
- data/lib/bosh/director/models/deployment_property.rb +21 -0
- data/lib/bosh/director/models/director_attribute.rb +9 -0
- data/lib/bosh/director/models/dns.rb +9 -0
- data/lib/bosh/director/models/dns/domain.rb +9 -0
- data/lib/bosh/director/models/dns/record.rb +7 -0
- data/lib/bosh/director/models/helpers/model_helper.rb +7 -0
- data/lib/bosh/director/models/instance.rb +28 -0
- data/lib/bosh/director/models/log_bundle.rb +10 -0
- data/lib/bosh/director/models/package.rb +30 -0
- data/lib/bosh/director/models/persistent_disk.rb +13 -0
- data/lib/bosh/director/models/release.rb +17 -0
- data/lib/bosh/director/models/release_version.rb +16 -0
- data/lib/bosh/director/models/snapshot.rb +13 -0
- data/lib/bosh/director/models/stemcell.rb +18 -0
- data/lib/bosh/director/models/task.rb +10 -0
- data/lib/bosh/director/models/template.rb +44 -0
- data/lib/bosh/director/models/user.rb +11 -0
- data/lib/bosh/director/models/vm.rb +42 -0
- data/lib/bosh/director/nats_rpc.rb +54 -0
- data/lib/bosh/director/network_reservation.rb +121 -0
- data/lib/bosh/director/next_rebase_version.rb +20 -0
- data/lib/bosh/director/package_compiler.rb +423 -0
- data/lib/bosh/director/problem_handlers/base.rb +153 -0
- data/lib/bosh/director/problem_handlers/inactive_disk.rb +112 -0
- data/lib/bosh/director/problem_handlers/invalid_problem.rb +28 -0
- data/lib/bosh/director/problem_handlers/missing_vm.rb +34 -0
- data/lib/bosh/director/problem_handlers/mount_info_mismatch.rb +62 -0
- data/lib/bosh/director/problem_handlers/out_of_sync_vm.rb +64 -0
- data/lib/bosh/director/problem_handlers/unbound_instance_vm.rb +85 -0
- data/lib/bosh/director/problem_handlers/unresponsive_agent.rb +78 -0
- data/lib/bosh/director/problem_resolver.rb +103 -0
- data/lib/bosh/director/problem_scanner.rb +268 -0
- data/lib/bosh/director/resource_pool_updater.rb +216 -0
- data/lib/bosh/director/scheduler.rb +57 -0
- data/lib/bosh/director/sequel.rb +13 -0
- data/lib/bosh/director/tar_gzipper.rb +47 -0
- data/lib/bosh/director/task_result_file.rb +19 -0
- data/lib/bosh/director/thread_pool.rb +8 -0
- data/lib/bosh/director/validation_helper.rb +55 -0
- data/lib/bosh/director/version.rb +7 -0
- data/lib/bosh/director/vm_creator.rb +80 -0
- data/lib/bosh/director/vm_data.rb +63 -0
- data/lib/bosh/director/vm_metadata_updater.rb +29 -0
- data/lib/bosh/director/vm_reuser.rb +63 -0
- data/lib/cloud/dummy.rb +149 -0
- metadata +664 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
|
2
|
+
|
|
3
|
+
module Bosh::Director
|
|
4
|
+
module ProblemHandlers
|
|
5
|
+
class UnboundInstanceVm < Base
|
|
6
|
+
|
|
7
|
+
register_as :unbound_instance_vm
|
|
8
|
+
auto_resolution :reassociate_vm
|
|
9
|
+
|
|
10
|
+
# Loads the VM referenced by the problem and checks that it is usable.
#
# @param vm_id id passed to Models::Vm[] to look up the VM
# @param data problem payload; indexed with "job" and "index" by
#   #description and #reassociate_vm
def initialize(vm_id, data)
  super

  @vm = Models::Vm[vm_id]
  @data = data

  # NOTE(review): handler_error is not defined in this class; presumably it
  # is inherited from Base and raises, so the later checks never run against
  # a nil @vm -- confirm.
  handler_error("VM `#{vm_id}' is no longer in the database") if @vm.nil?
  handler_error("VM `#{vm_id}' doesn't have an agent id") if @vm.agent_id.nil?
  handler_error("VM `#{vm_id}' doesn't have a cloud id") if @vm.cid.nil?
end
|
|
29
|
+
|
|
30
|
+
# Human-readable summary of the problem.
#
# Falls back to placeholder text when the payload carries no "job" or
# "index" entry.
#
# @return [String]
def description
  reported_job = @data["job"] || "unknown job"
  reported_index = @data["index"] || "unknown index"

  "VM `#{@vm.cid}' reports itself as `#{reported_job}/#{reported_index}' " \
    "but does not have a bound instance"
end
|
|
35
|
+
|
|
36
|
+
# Leaves the problem untouched.
resolution :ignore do
  plan { "Ignore problem" }
  action { }
end

# Re-checks the problem, then deletes the VM.
resolution :delete_vm do
  plan { "Delete VM (unless it has persistent disk)" }
  action do
    validate
    delete_vm(@vm)
  end
end

# Re-checks the problem, then binds the VM to its matching instance.
resolution :reassociate_vm do
  plan { "Reassociate VM with corresponding instance" }
  action do
    validate
    reassociate_vm
  end
end
|
|
50
|
+
|
|
51
|
+
# Confirms the problem still exists before a resolution is applied:
# the VM must still have no instance and its agent must still report a job.
def validate
  handler_error("Instance is now bound to VM") unless @vm.instance.nil?

  agent_state = agent_timeout_guard(@vm) { |agent| agent.get_state }
  handler_error("VM now properly reports no job") if agent_state["job"].nil?
end
|
|
61
|
+
|
|
62
|
+
# Binds @vm to the single instance in the VM's deployment whose job and
# index match the values stored in the problem payload.
#
# Reports through handler_error when there is no such instance, more than
# one, or the instance already has a VM.
def reassociate_vm
  criteria = {
    :deployment_id => @vm.deployment_id,
    :job => @data["job"],
    :index => @data["index"]
  }
  candidates = Models::Instance.filter(criteria).all

  handler_error("More than one instance in DB matches this VM") if candidates.size > 1
  handler_error("No instances in DB match this VM") if candidates.empty?

  target = candidates.first
  handler_error("The corresponding instance is associated with another VM") if target.vm

  target.update(:vm => @vm)
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
|
2
|
+
|
|
3
|
+
module Bosh::Director
|
|
4
|
+
module ProblemHandlers
|
|
5
|
+
class UnresponsiveAgent < Base
|
|
6
|
+
|
|
7
|
+
register_as :unresponsive_agent
|
|
8
|
+
auto_resolution :ignore
|
|
9
|
+
|
|
10
|
+
# Loads the VM whose agent stopped responding.
#
# @param vm_id id passed to Models::Vm[] to look up the VM
# @param data problem payload, forwarded to the superclass
def initialize(vm_id, data)
  super
  @vm = Models::Vm[vm_id]

  # NOTE(review): handler_error is not defined in this class; presumably it
  # is inherited from Base and raises -- confirm.
  handler_error("VM `#{vm_id}' is no longer in the database") if @vm.nil?
  handler_error("VM `#{vm_id}' doesn't have an agent id") if @vm.agent_id.nil?
end
|
|
22
|
+
|
|
23
|
+
# Human-readable summary of the problem.
#
# @return [String]
def description
  vm_name = instance_name(@vm)
  "#{vm_name} (#{@vm.cid}) is not responding"
end
|
|
26
|
+
|
|
27
|
+
# Leaves the problem untouched.
resolution :ignore do
  plan { "Ignore problem" }
  action { }
end

# Requires a still-silent agent and a cloud id, then reboots the VM.
resolution :reboot_vm do
  plan { "Reboot VM" }
  action do
    validate
    ensure_cid
    reboot_vm(@vm)
  end
end

# Requires a still-silent agent and a cloud id, then recreates the VM.
resolution :recreate_vm do
  plan { "Recreate VM using last known apply spec" }
  action do
    validate
    ensure_cid
    recreate_vm(@vm)
  end
end

# Only allowed when the VM has no cloud id.
resolution :delete_vm_reference do
  plan { "Delete VM reference (DANGEROUS!)" }
  action do
    validate
    ensure_no_cid
    delete_vm_reference(@vm)
  end
end
|
|
46
|
+
|
|
47
|
+
# Pings the VM's agent.
#
# @return [Boolean] true when the ping returns, false when it raises
#   Bosh::Director::RpcTimeout
def agent_alive?
  client = agent_client(@vm)
  client.ping
  true
rescue Bosh::Director::RpcTimeout
  false
end
|
|
53
|
+
|
|
54
|
+
# Reports a handler error when the VM has no cloud id.
def ensure_cid
  return unless @vm.cid.nil?

  handler_error("VM `#{@vm.id}' doesn't have a cloud id, " \
                "only resolution is to delete the VM reference.")
end
|
|
60
|
+
|
|
61
|
+
# Reports a handler error when the VM does have a cloud id.
def ensure_no_cid
  return unless @vm.cid

  handler_error("VM `#{@vm.id}' has a cloud id, " \
                "please use a different resolution.")
end
|
|
67
|
+
|
|
68
|
+
# Confirms the agent is still silent before a resolution is applied.
def validate
  handler_error("Agent is responding now, skipping resolution") if agent_alive?
end
|
|
73
|
+
|
|
74
|
+
# No-op: takes no arguments and does nothing.
# NOTE(review): the resolutions declared in this class call reboot_vm,
# recreate_vm and delete_vm_reference, never this method -- confirm whether
# it is still needed.
def delete_vm
  nil
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
module Bosh::Director
|
|
2
|
+
class ProblemResolver
|
|
3
|
+
|
|
4
|
+
attr_reader :event_log, :logger
|
|
5
|
+
|
|
6
|
+
# @param deployment deployment whose open problems will be resolved
def initialize(deployment)
  # Marked "temp" by the original author: the event log and logger are
  # taken from the global Config.
  @event_log = Config.event_log
  @logger = Config.logger

  @deployment = deployment
  @resolved_count = 0
end
|
|
14
|
+
|
|
15
|
+
# Opens a new event-log stage and writes its name to the logger.
#
# @param stage_name stage title
# @param n_steps number of steps forwarded to the event log
def begin_stage(stage_name, n_steps)
  event_log.begin_stage(stage_name, n_steps)
  logger.info(stage_name)
end
|
|
19
|
+
|
|
20
|
+
# Tracks a task in the event log, optionally logging it, and yields the
# ticker supplied by the event log to the caller's block (if one is given).
#
# @param task task title
# @param log whether the title is also written to the logger
def track_and_log(task, log = true)
  event_log.track(task) do |ticker|
    if log
      logger.info(task)
    end

    yield(ticker) if block_given?
  end
end
|
|
26
|
+
|
|
27
|
+
# Applies the supplied resolutions to every open problem of the deployment.
#
# Resolution keys that do not correspond to an open problem of this
# deployment are reported in the event log as ignored, with a reason.
#
# @param resolutions hash keyed by problem id (as a String)
# @return [Integer] number of problems marked as resolved so far
# @raise [CloudcheckResolutionNotProvided] when an open problem has no
#   entry in +resolutions+
def apply_resolutions(resolutions)
  @resolutions = resolutions
  problems = Models::DeploymentProblem.filter(deployment: @deployment, state: 'open').all
  problem_ids = Set.new

  problems.each do |problem|
    problem_ids << problem.id.to_s
    unless @resolutions.key?(problem.id.to_s)
      raise CloudcheckResolutionNotProvided,
            "Resolution for problem #{problem.id} (#{problem.type}) is not provided"
    end
  end

  # We might have some resolutions for problems that are no longer open
  # or just some bogus problem ids, in that case we still need to mention
  # them in event log so end user understands what actually happened.
  missing_problem_ids = @resolutions.keys.to_set - problem_ids

  begin_stage("Applying problem resolutions", problems.size + missing_problem_ids.size)
  problems.each do |problem|
    apply_resolution(problem)
  end

  missing_problem_ids.each do |problem_id|
    # \A and \z anchor the whole string; ^ and $ only anchor a line, which
    # would accept a multi-line key such as "12\njunk" as the id 12.
    if problem_id !~ /\A\d+\z/
      reason = "malformed id"
    else
      problem = Models::DeploymentProblem[problem_id.to_i]
      if problem.nil?
        reason = "not found"
      elsif problem.state != "open"
        reason = "state is '#{problem.state}'"
      elsif problem.deployment_id != @deployment.id
        reason = "not a part of this deployment"
      else
        reason = "reason unknown"
      end
    end

    track_and_log("Ignoring problem #{problem_id} (#{reason})") { }
  end
  @resolved_count
end
|
|
70
|
+
|
|
71
|
+
# Applies one resolution to one problem and marks the problem resolved.
#
# A ProblemHandlerError raised while the handler runs is logged and the
# problem is still marked resolved. Any other StandardError is logged and
# the rest of the method (state update, counter) is skipped.
#
# @param problem deployment problem record to resolve
def apply_resolution(problem)
  handler = ProblemHandlers::Base.create_from_model(problem)
  handler.job = self

  # Fall back to the handler's automatic resolution when none was supplied.
  chosen = @resolutions[problem.id.to_s] || handler.auto_resolution
  subject = "#{problem.type} #{problem.resource_id}"
  plan = handler.resolution_plan(chosen) || "no resolution"

  begin
    track_and_log("#{subject}: #{plan}") { handler.apply_resolution(chosen) }
  rescue Bosh::Director::ProblemHandlerError => handler_failure
    log_resolution_error(problem, handler_failure)
  end

  problem.state = "resolved"
  problem.save
  @resolved_count += 1

rescue => failure
  log_resolution_error(problem, failure)
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
# Writes a resolution failure and its backtrace to the logger at error level.
#
# @param problem problem record that was being resolved
# @param error exception that was rescued
def log_resolution_error(problem, error)
  logger.error("Error resolving problem `#{problem.id}': #{error}")

  trace_lines = error.backtrace
  logger.error(trace_lines.join("\n"))
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Copyright (c) 2009-2012 VMware, Inc.
|
|
2
|
+
|
|
3
|
+
module Bosh::Director
|
|
4
|
+
class ProblemScanner
|
|
5
|
+
|
|
6
|
+
AGENT_TIMEOUT = 10 # seconds
|
|
7
|
+
|
|
8
|
+
attr_reader :event_log, :logger
|
|
9
|
+
|
|
10
|
+
@queue = :normal
|
|
11
|
+
|
|
12
|
+
# @param deployment Deployment record (the scanner reads its #id and #name)
|
|
13
|
+
def initialize(deployment)
  @deployment = deployment
  @instance_manager = Api::InstanceManager.new

  # disk cid => list of VM cids collected while scanning VMs
  @agent_disks = {}
  # Held while problem records are read and written (see #problem_found
  # and #scan_vm).
  @problem_lock = Mutex.new

  # Marked "temp" by the original author: the event log and logger are
  # taken from the global Config.
  @event_log = Config.event_log
  @logger = Config.logger
end
|
|
24
|
+
|
|
25
|
+
# Opens a new event-log stage and writes its name to the logger.
#
# @param stage_name stage title
# @param n_steps number of steps forwarded to the event log
def begin_stage(stage_name, n_steps)
  event_log.begin_stage(stage_name, n_steps)
  logger.info(stage_name)
end
|
|
29
|
+
|
|
30
|
+
# Tracks a task in the event log, optionally logging it, and yields the
# ticker supplied by the event log to the caller's block (if one is given).
#
# @param task task title
# @param log whether the title is also written to the logger
def track_and_log(task, log = true)
  event_log.track(task) do |ticker|
    if log
      logger.info(task)
    end

    yield(ticker) if block_given?
  end
end
|
|
36
|
+
|
|
37
|
+
# Closes open problems so that a fresh scan starts from a clean slate.
#
# @param vms optional list of [job, index] pairs; when given, only open
#   problems whose resource id equals the id of the VM bound to each named
#   instance are closed. When omitted, every open problem of the deployment
#   is closed.
def reset(vms=nil)
  if vms
    vms.each do |job, index|
      instance = @instance_manager.find_by_name(@deployment.name, job, index)
      vm = instance.vm

      # An instance without a VM has no VM id to match problems against;
      # skip it instead of failing with NoMethodError on nil.
      if vm.nil?
        logger.warn("Instance `#{job}/#{index}' has no VM, no problems to reset")
        next
      end

      Models::DeploymentProblem.where(:deployment => deployment,
                                      :resource_id => vm.id,
                                      :state => "open").update(:state => "closed")
    end
  else
    Models::DeploymentProblem.where(:state => "open", :deployment => deployment).update(:state => "closed")
  end
end
|
|
49
|
+
|
|
50
|
+
# Scans every persistent disk whose instance belongs to this deployment and
# reports the per-result counts through the event log.
def scan_disks
  deployment_disks = Models::PersistentDisk.eager(:instance).all.select do |disk|
    owner = disk.instance
    owner && owner.deployment_id == deployment.id
  end
  tally = Hash.new(0)

  begin_stage("Scanning #{deployment_disks.size} persistent disks", 2)

  track_and_log("Looking for inactive disks") do
    deployment_disks.each { |disk| tally[scan_disk(disk)] += 1 }
  end

  summary = "#{tally[:ok]} OK, " \
            "#{tally[:inactive]} inactive, " \
            "#{tally[:mount_info_mismatch]} mount-info mismatch"
  track_and_log(summary)
end
|
|
69
|
+
|
|
70
|
+
# Scans VMs on a thread pool and reports the per-result counts through the
# event log.
#
# @param vms optional list of [job, index] pairs; when given, only the VMs
#   bound to those instances are scanned. When omitted, every VM of the
#   deployment is scanned.
def scan_vms(vms=nil)
  if vms
    vm_list = []
    vms.each do |job, index|
      instance = @instance_manager.find_by_name(@deployment.name, job, index)
      vm = instance.vm

      # An instance without a VM would put nil on the list and crash
      # scan_vm inside a pool thread; skip it and leave a trace instead.
      if vm.nil?
        logger.warn("Instance `#{job}/#{index}' has no VM, skipping scan")
        next
      end

      vm_list << vm
    end
    vms = vm_list
  else
    vms = Models::Vm.eager(:instance).filter(deployment: deployment).all
  end

  begin_stage("Scanning #{vms.size} VMs", 2)
  results = Hash.new(0)
  # Guards the shared results counter, which pool threads update.
  lock = Mutex.new

  track_and_log("Checking VM states") do
    ThreadPool.new(:max_threads => Config.max_threads).wrap do |pool|
      vms.each do |vm|
        pool.process do
          scan_result = scan_vm(vm)
          lock.synchronize { results[scan_result] += 1 }
        end
      end
    end
  end

  track_and_log("#{results[:ok]} OK, " +
                "#{results[:unresponsive]} unresponsive, " +
                "#{results[:missing]} missing, " +
                "#{results[:unbound]} unbound, " +
                "#{results[:out_of_sync]} out of sync")
end
|
|
103
|
+
|
|
104
|
+
# Checks one persistent disk and records a problem when one is found.
#
# @param disk persistent disk record
# @return [Symbol] :inactive, :mount_info_mismatch or :ok
def scan_disk(disk)
  # Inactive disks are a problem of their own.
  unless disk.active
    logger.info("Found inactive disk: #{disk.id}")
    problem_found(:inactive_disk, disk)
    return :inactive
  end

  disk_cid = disk.disk_cid
  instance = disk.instance
  vm = instance && instance.vm
  vm_cid = vm ? vm.cid : nil

  if vm_cid.nil?
    # With the db dependencies this should not happen.
    logger.warn("Disk #{disk_cid} is not associated to any VM. " \
                "Skipping scan")
    return :ok
  end

  owners = get_disk_owners(disk_cid) || []

  # Healthy case: the disk is mounted exactly once, on the VM the record
  # points to. Anything else (not mounted, mounted more than once, or
  # mounted on a VM different from the record) is a mismatch.
  return :ok if owners.size == 1 && owners.first == vm_cid

  logger.info("Found problem in mount info: " \
              "active disk #{disk_cid} mounted on " \
              "#{owners.join(', ')}")
  problem_found(:mount_info_mismatch, disk, :owner_vms => owners)
  :mount_info_mismatch
end
|
|
138
|
+
|
|
139
|
+
# Queries one VM's agent and records any problem found.
#
# @param vm VM record to scan
# @return [Symbol] :ok, :out_of_sync, :unbound, :missing or :unresponsive
def scan_vm(vm)
  agent_options = {
    :timeout => AGENT_TIMEOUT,
    :retry_methods => { :get_state => 0 }
  }

  # Read the bound instance and its recorded disk under the problem lock.
  instance, mounted_disk_cid = @problem_lock.synchronize do
    bound = vm.instance
    [bound, bound ? bound.persistent_disk_cid : nil]
  end

  agent = AgentClient.new(vm.agent_id, agent_options)
  begin
    state = agent.get_state

    # Gather mounted disk info (used by scan_disk).
    begin
      mounted_disk_cid = agent.list_disk.first
    rescue Bosh::Director::RpcTimeout
      mounted_disk_cid = nil
    rescue RuntimeError
      # For old agents that don't implement list_disk we assume the disk
      # recorded for the instance is mounted.
      logger.info("agent.list_disk failed on agent #{vm.agent_id}")
    end
    add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

    if is_out_of_sync_vm?(vm, instance, state)
      :out_of_sync
    elsif is_unbound_instance_vm?(vm, instance, state)
      :unbound
    else
      :ok
    end
  rescue Bosh::Director::RpcTimeout
    # Register the disk anyway so that a timeout while fetching agent status
    # does not produce a duplicate problem (unresponsive_agent plus
    # mount_info_mismatch).
    add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

    begin
      unless cloud.has_vm?(vm.cid)
        logger.info("Missing VM #{vm.cid}")
        problem_found(:missing_vm, vm)
        return :missing
      end
    rescue Bosh::Clouds::NotImplemented
      # The cloud cannot tell whether the VM exists; treat it as unresponsive.
    end

    logger.info("Found unresponsive agent #{vm.agent_id}")
    problem_found(:unresponsive_agent, vm)
    :unresponsive
  end
end
|
|
190
|
+
|
|
191
|
+
# Records a problem: creates a new open problem, or refreshes the single
# matching open problem (data, last-seen time, counter).
#
# @param type problem type (stored as a string)
# @param resource record whose id identifies the affected resource
# @param data payload stored with the problem
# @raise [CloudcheckTooManySimilarProblems] when more than one open problem
#   of this type already exists for the resource
def problem_found(type, resource, data = {})
  @problem_lock.synchronize do
    lookup = {
      :deployment_id => deployment.id, :type => type.to_s,
      :resource_id => resource.id, :state => "open"
    }
    existing = Models::DeploymentProblem.filter(lookup).all

    if existing.size > 1
      raise CloudcheckTooManySimilarProblems,
            "More than one problem of type `#{type}' " \
            "exists for resource #{type} #{resource.id}"
    end

    known = existing.first
    if known
      # This assumes we are running with deployment lock acquired,
      # so there is no possible update conflict
      known.data = data
      known.last_seen_at = Time.now
      known.counter += 1
      known.save
      logger.info("Updated problem #{known.id} (#{known.type}), " \
                  "count is now #{known.counter}")
    else
      created = Models::DeploymentProblem.
        create(:type => type.to_s, :resource_id => resource.id,
               :state => "open", :deployment_id => deployment.id,
               :data => data, :counter => 1)

      logger.info("Created problem #{created.id} (#{created.type})")
    end
  end
end
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
private
|
|
226
|
+
attr_reader :deployment
|
|
227
|
+
|
|
228
|
+
# Records an :out_of_sync_vm problem when the agent reports a different
# deployment than this one, or a job/index that differs from the bound
# instance.
#
# @param vm VM record being scanned
# @param instance instance bound to the VM, or nil
# @param state agent state, indexed with "job", "index" and "deployment"
# @return [Boolean] true when a problem was recorded
def is_out_of_sync_vm?(vm, instance, state)
  job_spec = state["job"]
  job_name = job_spec ? job_spec["name"] : nil
  index = state["index"]

  out_of_sync = state["deployment"] != deployment.name ||
                (instance && (instance.job != job_name || instance.index != index))
  return false unless out_of_sync

  problem_found(:out_of_sync_vm, vm,
                :deployment => state["deployment"],
                :job => job_name, :index => index)
  true
end
|
|
241
|
+
|
|
242
|
+
# Records an :unbound_instance_vm problem when the agent reports a job but
# the VM has no bound instance.
#
# @param vm VM record being scanned
# @param instance instance bound to the VM, or nil
# @param state agent state, indexed with "job" and "index"
# @return [Boolean] true when a problem was recorded
def is_unbound_instance_vm?(vm, instance, state)
  job_spec = state["job"]
  job_name = job_spec ? job_spec["name"] : nil
  index = state["index"]

  return false unless job_name && !instance

  logger.info("Found unbound VM #{vm.agent_id}")
  problem_found(:unbound_instance_vm, vm,
                :job => job_name, :index => index)
  true
end
|
|
254
|
+
|
|
255
|
+
# Remembers that the given VM reported the given disk.
#
# Called from the thread-pool workers started by #scan_vms, so the
# check-then-append on the shared hash runs under @problem_lock; otherwise
# two threads reporting the same disk could each install a fresh list and
# one owner would be lost.
#
# @param disk_cid cloud id of the disk
# @param vm_cid cloud id of the VM that reported the disk
# @return [Array] owners recorded so far for the disk
def add_disk_owner(disk_cid, vm_cid)
  @problem_lock.synchronize do
    (@agent_disks[disk_cid] ||= []) << vm_cid
  end
end
|
|
259
|
+
|
|
260
|
+
# @param disk_cid cloud id of the disk
# @return [Array, nil] VM cids recorded for the disk, or nil when none
#   were recorded
def get_disk_owners(disk_cid)
  @agent_disks.fetch(disk_cid, nil)
end
|
|
263
|
+
|
|
264
|
+
# @return the cloud object configured in the director's Config
def cloud
  Bosh::Director::Config.cloud
end
|
|
267
|
+
end
|
|
268
|
+
end
|