bosh-director 1.5.0.pre.1113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. data/CHANGELOG +34 -0
  2. data/bin/bosh-director +36 -0
  3. data/bin/bosh-director-console +84 -0
  4. data/bin/bosh-director-drain-workers +42 -0
  5. data/bin/bosh-director-migrate +58 -0
  6. data/bin/bosh-director-scheduler +27 -0
  7. data/bin/bosh-director-worker +76 -0
  8. data/db/migrations/README +1 -0
  9. data/db/migrations/director/20110209010747_initial.rb +118 -0
  10. data/db/migrations/director/20110406055800_add_task_user.rb +9 -0
  11. data/db/migrations/director/20110518225809_remove_cid_constrain.rb +13 -0
  12. data/db/migrations/director/20110617211923_add_deployments_release_versions.rb +32 -0
  13. data/db/migrations/director/20110622212607_add_task_checkpoint_timestamp.rb +9 -0
  14. data/db/migrations/director/20110628023039_add_state_to_instances.rb +21 -0
  15. data/db/migrations/director/20110709012332_add_disk_size_to_instances.rb +9 -0
  16. data/db/migrations/director/20110906183441_add_log_bundles.rb +11 -0
  17. data/db/migrations/director/20110907194830_add_logs_json_to_templates.rb +9 -0
  18. data/db/migrations/director/20110915205610_add_persistent_disks.rb +51 -0
  19. data/db/migrations/director/20111005180929_add_properties.rb +14 -0
  20. data/db/migrations/director/20111110024617_add_deployment_problems.rb +24 -0
  21. data/db/migrations/director/20111216214145_recreate_support_for_vms.rb +9 -0
  22. data/db/migrations/director/20120102084027_add_credentials_to_vms.rb +7 -0
  23. data/db/migrations/director/20120427235217_allow_multiple_releases_per_deployment.rb +36 -0
  24. data/db/migrations/director/20120524175805_add_task_type.rb +44 -0
  25. data/db/migrations/director/20120614001930_delete_redundant_deployment_release_relation.rb +34 -0
  26. data/db/migrations/director/20120822004528_add_fingerprint_to_templates_and_packages.rb +17 -0
  27. data/db/migrations/director/20120830191244_add_properties_to_templates.rb +9 -0
  28. data/db/migrations/director/20121106190739_persist_vm_env.rb +9 -0
  29. data/db/migrations/director/20130222232131_add_sha1_to_stemcells.rb +9 -0
  30. data/db/migrations/director/20130312211407_add_commit_hash_to_release_versions.rb +19 -0
  31. data/db/migrations/director/20130409235338_snapshot.rb +15 -0
  32. data/db/migrations/director/20130530164918_add_paused_flag_to_instance.rb +14 -0
  33. data/db/migrations/director/20130531172604_add_director_attributes.rb +13 -0
  34. data/db/migrations/dns/20120123234908_initial.rb +27 -0
  35. data/lib/bosh/director.rb +133 -0
  36. data/lib/bosh/director/agent_client.rb +78 -0
  37. data/lib/bosh/director/api.rb +29 -0
  38. data/lib/bosh/director/api/api_helper.rb +81 -0
  39. data/lib/bosh/director/api/backup_manager.rb +15 -0
  40. data/lib/bosh/director/api/controller.rb +639 -0
  41. data/lib/bosh/director/api/controller_helpers.rb +34 -0
  42. data/lib/bosh/director/api/deployment_lookup.rb +13 -0
  43. data/lib/bosh/director/api/deployment_manager.rb +60 -0
  44. data/lib/bosh/director/api/http_constants.rb +16 -0
  45. data/lib/bosh/director/api/instance_lookup.rb +44 -0
  46. data/lib/bosh/director/api/instance_manager.rb +63 -0
  47. data/lib/bosh/director/api/problem_manager.rb +40 -0
  48. data/lib/bosh/director/api/property_manager.rb +69 -0
  49. data/lib/bosh/director/api/release_manager.rb +59 -0
  50. data/lib/bosh/director/api/resource_manager.rb +69 -0
  51. data/lib/bosh/director/api/resurrector_manager.rb +15 -0
  52. data/lib/bosh/director/api/snapshot_manager.rb +94 -0
  53. data/lib/bosh/director/api/stemcell_manager.rb +50 -0
  54. data/lib/bosh/director/api/task_helper.rb +46 -0
  55. data/lib/bosh/director/api/task_manager.rb +64 -0
  56. data/lib/bosh/director/api/user_manager.rb +72 -0
  57. data/lib/bosh/director/api/vm_state_manager.rb +11 -0
  58. data/lib/bosh/director/app.rb +35 -0
  59. data/lib/bosh/director/blob_util.rb +87 -0
  60. data/lib/bosh/director/blobstores.rb +29 -0
  61. data/lib/bosh/director/client.rb +156 -0
  62. data/lib/bosh/director/cloudcheck_helper.rb +204 -0
  63. data/lib/bosh/director/compile_task.rb +157 -0
  64. data/lib/bosh/director/config.rb +370 -0
  65. data/lib/bosh/director/configuration_hasher.rb +114 -0
  66. data/lib/bosh/director/cycle_helper.rb +36 -0
  67. data/lib/bosh/director/db_backup.rb +22 -0
  68. data/lib/bosh/director/db_backup/adapter.rb +3 -0
  69. data/lib/bosh/director/db_backup/adapter/mysql2.rb +27 -0
  70. data/lib/bosh/director/db_backup/adapter/postgres.rb +36 -0
  71. data/lib/bosh/director/db_backup/adapter/sqlite.rb +17 -0
  72. data/lib/bosh/director/db_backup/error.rb +10 -0
  73. data/lib/bosh/director/deployment_plan.rb +26 -0
  74. data/lib/bosh/director/deployment_plan/assembler.rb +430 -0
  75. data/lib/bosh/director/deployment_plan/compilation_config.rb +54 -0
  76. data/lib/bosh/director/deployment_plan/compiled_package.rb +35 -0
  77. data/lib/bosh/director/deployment_plan/dynamic_network.rb +91 -0
  78. data/lib/bosh/director/deployment_plan/idle_vm.rb +109 -0
  79. data/lib/bosh/director/deployment_plan/instance.rb +413 -0
  80. data/lib/bosh/director/deployment_plan/job.rb +470 -0
  81. data/lib/bosh/director/deployment_plan/manual_network.rb +137 -0
  82. data/lib/bosh/director/deployment_plan/network.rb +74 -0
  83. data/lib/bosh/director/deployment_plan/network_subnet.rb +167 -0
  84. data/lib/bosh/director/deployment_plan/planner.rb +288 -0
  85. data/lib/bosh/director/deployment_plan/preparer.rb +52 -0
  86. data/lib/bosh/director/deployment_plan/release.rb +126 -0
  87. data/lib/bosh/director/deployment_plan/resource_pool.rb +143 -0
  88. data/lib/bosh/director/deployment_plan/resource_pools.rb +68 -0
  89. data/lib/bosh/director/deployment_plan/stemcell.rb +56 -0
  90. data/lib/bosh/director/deployment_plan/template.rb +94 -0
  91. data/lib/bosh/director/deployment_plan/update_config.rb +80 -0
  92. data/lib/bosh/director/deployment_plan/updater.rb +55 -0
  93. data/lib/bosh/director/deployment_plan/vip_network.rb +79 -0
  94. data/lib/bosh/director/dns_helper.rb +204 -0
  95. data/lib/bosh/director/download_helper.rb +44 -0
  96. data/lib/bosh/director/duration.rb +36 -0
  97. data/lib/bosh/director/encryption_helper.rb +10 -0
  98. data/lib/bosh/director/errors.rb +198 -0
  99. data/lib/bosh/director/event_log.rb +136 -0
  100. data/lib/bosh/director/ext.rb +64 -0
  101. data/lib/bosh/director/hash_string_vals.rb +13 -0
  102. data/lib/bosh/director/instance_deleter.rb +109 -0
  103. data/lib/bosh/director/instance_updater.rb +506 -0
  104. data/lib/bosh/director/ip_util.rb +67 -0
  105. data/lib/bosh/director/job_queue.rb +16 -0
  106. data/lib/bosh/director/job_runner.rb +162 -0
  107. data/lib/bosh/director/job_updater.rb +121 -0
  108. data/lib/bosh/director/jobs/backup.rb +86 -0
  109. data/lib/bosh/director/jobs/base_job.rb +66 -0
  110. data/lib/bosh/director/jobs/cloud_check/apply_resolutions.rb +46 -0
  111. data/lib/bosh/director/jobs/cloud_check/scan.rb +38 -0
  112. data/lib/bosh/director/jobs/cloud_check/scan_and_fix.rb +73 -0
  113. data/lib/bosh/director/jobs/create_snapshot.rb +23 -0
  114. data/lib/bosh/director/jobs/delete_deployment.rb +183 -0
  115. data/lib/bosh/director/jobs/delete_deployment_snapshots.rb +34 -0
  116. data/lib/bosh/director/jobs/delete_release.rb +219 -0
  117. data/lib/bosh/director/jobs/delete_snapshots.rb +23 -0
  118. data/lib/bosh/director/jobs/delete_stemcell.rb +102 -0
  119. data/lib/bosh/director/jobs/fetch_logs.rb +99 -0
  120. data/lib/bosh/director/jobs/scheduled_backup.rb +38 -0
  121. data/lib/bosh/director/jobs/snapshot_deployment.rb +61 -0
  122. data/lib/bosh/director/jobs/snapshot_deployments.rb +23 -0
  123. data/lib/bosh/director/jobs/snapshot_self.rb +43 -0
  124. data/lib/bosh/director/jobs/ssh.rb +59 -0
  125. data/lib/bosh/director/jobs/update_deployment.rb +110 -0
  126. data/lib/bosh/director/jobs/update_release.rb +672 -0
  127. data/lib/bosh/director/jobs/update_stemcell.rb +109 -0
  128. data/lib/bosh/director/jobs/vm_state.rb +89 -0
  129. data/lib/bosh/director/lock.rb +133 -0
  130. data/lib/bosh/director/lock_helper.rb +92 -0
  131. data/lib/bosh/director/models.rb +29 -0
  132. data/lib/bosh/director/models/compiled_package.rb +33 -0
  133. data/lib/bosh/director/models/deployment.rb +22 -0
  134. data/lib/bosh/director/models/deployment_problem.rb +49 -0
  135. data/lib/bosh/director/models/deployment_property.rb +21 -0
  136. data/lib/bosh/director/models/director_attribute.rb +9 -0
  137. data/lib/bosh/director/models/dns.rb +9 -0
  138. data/lib/bosh/director/models/dns/domain.rb +9 -0
  139. data/lib/bosh/director/models/dns/record.rb +7 -0
  140. data/lib/bosh/director/models/helpers/model_helper.rb +7 -0
  141. data/lib/bosh/director/models/instance.rb +28 -0
  142. data/lib/bosh/director/models/log_bundle.rb +10 -0
  143. data/lib/bosh/director/models/package.rb +30 -0
  144. data/lib/bosh/director/models/persistent_disk.rb +13 -0
  145. data/lib/bosh/director/models/release.rb +17 -0
  146. data/lib/bosh/director/models/release_version.rb +16 -0
  147. data/lib/bosh/director/models/snapshot.rb +13 -0
  148. data/lib/bosh/director/models/stemcell.rb +18 -0
  149. data/lib/bosh/director/models/task.rb +10 -0
  150. data/lib/bosh/director/models/template.rb +44 -0
  151. data/lib/bosh/director/models/user.rb +11 -0
  152. data/lib/bosh/director/models/vm.rb +42 -0
  153. data/lib/bosh/director/nats_rpc.rb +54 -0
  154. data/lib/bosh/director/network_reservation.rb +121 -0
  155. data/lib/bosh/director/next_rebase_version.rb +20 -0
  156. data/lib/bosh/director/package_compiler.rb +423 -0
  157. data/lib/bosh/director/problem_handlers/base.rb +153 -0
  158. data/lib/bosh/director/problem_handlers/inactive_disk.rb +112 -0
  159. data/lib/bosh/director/problem_handlers/invalid_problem.rb +28 -0
  160. data/lib/bosh/director/problem_handlers/missing_vm.rb +34 -0
  161. data/lib/bosh/director/problem_handlers/mount_info_mismatch.rb +62 -0
  162. data/lib/bosh/director/problem_handlers/out_of_sync_vm.rb +64 -0
  163. data/lib/bosh/director/problem_handlers/unbound_instance_vm.rb +85 -0
  164. data/lib/bosh/director/problem_handlers/unresponsive_agent.rb +78 -0
  165. data/lib/bosh/director/problem_resolver.rb +103 -0
  166. data/lib/bosh/director/problem_scanner.rb +268 -0
  167. data/lib/bosh/director/resource_pool_updater.rb +216 -0
  168. data/lib/bosh/director/scheduler.rb +57 -0
  169. data/lib/bosh/director/sequel.rb +13 -0
  170. data/lib/bosh/director/tar_gzipper.rb +47 -0
  171. data/lib/bosh/director/task_result_file.rb +19 -0
  172. data/lib/bosh/director/thread_pool.rb +8 -0
  173. data/lib/bosh/director/validation_helper.rb +55 -0
  174. data/lib/bosh/director/version.rb +7 -0
  175. data/lib/bosh/director/vm_creator.rb +80 -0
  176. data/lib/bosh/director/vm_data.rb +63 -0
  177. data/lib/bosh/director/vm_metadata_updater.rb +29 -0
  178. data/lib/bosh/director/vm_reuser.rb +63 -0
  179. data/lib/cloud/dummy.rb +149 -0
  180. metadata +664 -0
@@ -0,0 +1,85 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+
3
module Bosh::Director
  module ProblemHandlers
    # Problem handler for a VM that reports a job through its agent while no
    # instance record is bound to it. Registered as :unbound_instance_vm; the
    # automatic resolution is :reassociate_vm.
    class UnboundInstanceVm < Base

      register_as :unbound_instance_vm
      auto_resolution :reassociate_vm

      # Looks up the VM record and rejects VMs that are missing from the
      # database or lack an agent id or a cloud id.
      #
      # @param vm_id id used to look up the Models::Vm record
      # @param data  problem payload; the "job" and "index" keys are read here
      # NOTE(review): handler_error is defined outside this file; it presumably
      # raises, otherwise the checks after a nil @vm would fail - confirm.
      def initialize(vm_id, data)
        super

        @vm = Models::Vm[vm_id]
        @data = data

        handler_error("VM `#{vm_id}' is no longer in the database") if @vm.nil?
        handler_error("VM `#{vm_id}' doesn't have an agent id") if @vm.agent_id.nil?
        handler_error("VM `#{vm_id}' doesn't have a cloud id") if @vm.cid.nil?
      end

      # @return [String] human readable summary of the problem
      def description
        reported_job = @data["job"] || "unknown job"
        reported_index = @data["index"] || "unknown index"
        "VM `#{@vm.cid}' reports itself as `#{reported_job}/#{reported_index}' but does not have a bound instance"
      end

      resolution :ignore do
        plan { "Ignore problem" }
        action {}
      end

      resolution :delete_vm do
        plan { "Delete VM (unless it has persistent disk)" }
        action do
          validate
          delete_vm(@vm)
        end
      end

      resolution :reassociate_vm do
        plan { "Reassociate VM with corresponding instance" }
        action do
          validate
          reassociate_vm
        end
      end

      # Re-checks that the problem still exists: the VM must still have no
      # instance and its agent must still report a job.
      def validate
        handler_error("Instance is now bound to VM") unless @vm.instance.nil?

        agent_state = agent_timeout_guard(@vm) { |agent| agent.get_state }
        handler_error("VM now properly reports no job") if agent_state["job"].nil?
      end

      # Binds the VM to the single instance in its deployment that matches the
      # job/index stored in the problem data. Reports a handler error when
      # zero or several instances match, or when the match already has a VM.
      def reassociate_vm
        conditions = {
          :deployment_id => @vm.deployment_id,
          :job => @data["job"],
          :index => @data["index"]
        }
        matches = Models::Instance.filter(conditions).all

        handler_error("More than one instance in DB matches this VM") if matches.size > 1
        handler_error("No instances in DB match this VM") if matches.empty?

        instance = matches[0]
        handler_error("The corresponding instance is associated with another VM") if instance.vm

        instance.update(:vm => @vm)
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+
3
module Bosh::Director
  module ProblemHandlers
    # Problem handler for a VM whose agent does not answer. Registered as
    # :unresponsive_agent; the automatic resolution is :ignore.
    class UnresponsiveAgent < Base

      register_as :unresponsive_agent
      auto_resolution :ignore

      # Looks up the VM record and rejects VMs that are missing from the
      # database or have no agent id.
      #
      # @param vm_id id used to look up the Models::Vm record
      # @param data  problem payload, forwarded to the base class via super
      def initialize(vm_id, data)
        super
        @vm = Models::Vm[vm_id]

        handler_error("VM `#{vm_id}' is no longer in the database") if @vm.nil?
        handler_error("VM `#{vm_id}' doesn't have an agent id") if @vm.agent_id.nil?
      end

      # @return [String] human readable summary of the problem
      def description
        "#{instance_name(@vm)} (#{@vm.cid}) is not responding"
      end

      resolution :ignore do
        plan { "Ignore problem" }
        action {}
      end

      resolution :reboot_vm do
        plan { "Reboot VM" }
        action do
          validate
          ensure_cid
          reboot_vm(@vm)
        end
      end

      resolution :recreate_vm do
        plan { "Recreate VM using last known apply spec" }
        action do
          validate
          ensure_cid
          recreate_vm(@vm)
        end
      end

      resolution :delete_vm_reference do
        plan { "Delete VM reference (DANGEROUS!)" }
        action do
          validate
          ensure_no_cid
          delete_vm_reference(@vm)
        end
      end

      # Pings the agent; an RPC timeout is treated as "not alive".
      #
      # @return [Boolean]
      def agent_alive?
        begin
          agent_client(@vm).ping
        rescue Bosh::Director::RpcTimeout
          return false
        end
        true
      end

      # Reports a handler error when the VM has no cloud id.
      def ensure_cid
        return unless @vm.cid.nil?

        handler_error("VM `#{@vm.id}' doesn't have a cloud id, " +
                      "only resolution is to delete the VM reference.")
      end

      # Reports a handler error when the VM does have a cloud id.
      def ensure_no_cid
        return unless @vm.cid

        handler_error("VM `#{@vm.id}' has a cloud id, " +
                      "please use a different resolution.")
      end

      # Re-checks that the problem still exists before a resolution runs.
      def validate
        handler_error("Agent is responding now, skipping resolution") if agent_alive?
      end

      # Intentionally empty.
      # NOTE(review): no resolution in this class calls it, and it takes no
      # arguments, unlike the delete_vm(vm) used by sibling handlers - confirm
      # whether this override is still needed.
      def delete_vm
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
module Bosh::Director
  # Applies resolutions to the open DeploymentProblem records of a single
  # deployment and keeps count of how many problems were marked resolved.
  class ProblemResolver

    attr_reader :event_log, :logger

    # @param deployment deployment whose open problems will be resolved
    #   (its +id+ is read when explaining ignored resolutions)
    def initialize(deployment)
      @deployment = deployment
      @resolved_count = 0

      #temp
      @event_log = Config.event_log
      @logger = Config.logger
    end

    # Starts a new event-log stage and mirrors its name to the logger.
    #
    # @param stage_name [String] stage title
    # @param n_steps [Integer] number of steps in the stage
    def begin_stage(stage_name, n_steps)
      event_log.begin_stage(stage_name, n_steps)
      logger.info(stage_name)
    end

    # Tracks +task+ in the event log, optionally logging it, and yields the
    # ticker to the given block (if any).
    #
    # @param task [String] task description
    # @param log [Boolean] whether to also write the task to the logger
    def track_and_log(task, log = true)
      event_log.track(task) do |ticker|
        logger.info(task) if log
        yield ticker if block_given?
      end
    end

    # Applies the supplied resolutions to every open problem of the
    # deployment. Resolutions whose id does not match an open problem are
    # reported in the event log as ignored, together with the reason.
    #
    # @param resolutions [Hash] problem id (as String) => resolution name
    # @return [Integer] number of problems marked resolved so far
    # @raise [CloudcheckResolutionNotProvided] when an open problem has no
    #   entry in +resolutions+
    def apply_resolutions(resolutions)
      @resolutions = resolutions
      problems = Models::DeploymentProblem.filter(deployment: @deployment, state: 'open').all
      problem_ids = Set.new

      problems.each do |problem|
        problem_ids << problem.id.to_s
        unless @resolutions.has_key?(problem.id.to_s)
          raise CloudcheckResolutionNotProvided,
                "Resolution for problem #{problem.id} (#{problem.type}) is not provided"
        end
      end

      # We might have some resolutions for problems that are no longer open
      # or just some bogus problem ids, in that case we still need to mention
      # them in event log so end user understands what actually happened.
      missing_problem_ids = @resolutions.keys.to_set - problem_ids

      begin_stage("Applying problem resolutions", problems.size + missing_problem_ids.size)
      problems.each { |problem| apply_resolution(problem) }

      missing_problem_ids.each do |problem_id|
        reason = ignore_reason(problem_id)
        track_and_log("Ignoring problem #{problem_id} (#{reason})") { }
      end
      @resolved_count
    end

    # Applies one resolution (the user-supplied one, falling back to the
    # handler's auto resolution) and marks the problem as resolved.
    #
    # A ProblemHandlerError raised while applying is logged and the problem is
    # still marked resolved; any other StandardError is logged and leaves the
    # problem untouched.
    # NOTE(review): presumably a handler error means the problem no longer
    # applies - confirm that marking it resolved is intended.
    #
    # @param problem DeploymentProblem model to resolve
    def apply_resolution(problem)
      handler = ProblemHandlers::Base.create_from_model(problem)
      handler.job = self

      resolution = @resolutions[problem.id.to_s] || handler.auto_resolution
      problem_summary = "#{problem.type} #{problem.resource_id}"
      resolution_summary = handler.resolution_plan(resolution)
      resolution_summary ||= "no resolution"

      begin
        track_and_log("#{problem_summary}: #{resolution_summary}") do
          handler.apply_resolution(resolution)
        end
      rescue Bosh::Director::ProblemHandlerError => e
        log_resolution_error(problem, e)
      end

      problem.state = "resolved"
      problem.save
      @resolved_count += 1

    rescue => e
      log_resolution_error(problem, e)
    end

    private

    # Explains why a supplied resolution does not correspond to an open
    # problem of this deployment.
    #
    # @param problem_id resolution key as supplied by the caller
    # @return [String] short reason used in the event log
    def ignore_reason(problem_id)
      # \A and \z anchor the whole string; ^ and $ only anchor a line, which
      # let ids such as "12\nfoo" through and looked up problem 12 instead.
      return "malformed id" if problem_id !~ /\A\d+\z/

      problem = Models::DeploymentProblem[problem_id.to_i]
      if problem.nil?
        "not found"
      elsif problem.state != "open"
        "state is '#{problem.state}'"
      elsif problem.deployment_id != @deployment.id
        "not a part of this deployment"
      else
        "reason unknown"
      end
    end

    # Writes the error message and its backtrace to the logger.
    def log_resolution_error(problem, error)
      logger.error("Error resolving problem `#{problem.id}': #{error}")
      logger.error(error.backtrace.join("\n"))
    end
  end
end
@@ -0,0 +1,268 @@
1
+ # Copyright (c) 2009-2012 VMware, Inc.
2
+
3
module Bosh::Director
  # Scans the VMs and persistent disks of one deployment and records every
  # inconsistency it finds as an open DeploymentProblem.
  class ProblemScanner

    AGENT_TIMEOUT = 10 # seconds

    attr_reader :event_log, :logger

    # NOTE(review): class-level ivar that nothing in this file reads; looks
    # like a job-queue leftover - confirm before removing.
    @queue = :normal

    # @param deployment deployment to scan (its +name+ and +id+ are read)
    def initialize(deployment)
      @deployment = deployment
      @instance_manager = Api::InstanceManager.new

      @problem_lock = Mutex.new
      @agent_disks = {}
      # scan_vm runs on thread-pool workers and records disk owners, so the
      # shared @agent_disks hash gets its own lock.
      @agent_disks_lock = Mutex.new

      #temp
      @event_log = Config.event_log
      @logger = Config.logger
    end

    # Starts a new event-log stage and mirrors its name to the logger.
    #
    # @param stage_name [String] stage title
    # @param n_steps [Integer] number of steps in the stage
    def begin_stage(stage_name, n_steps)
      event_log.begin_stage(stage_name, n_steps)
      logger.info(stage_name)
    end

    # Tracks +task+ in the event log, optionally logging it, and yields the
    # ticker to the given block (if any).
    #
    # @param task [String] task description
    # @param log [Boolean] whether to also write the task to the logger
    def track_and_log(task, log = true)
      event_log.track(task) do |ticker|
        logger.info(task) if log
        yield ticker if block_given?
      end
    end

    # Closes open problems of the deployment: all of them when +vms+ is nil,
    # otherwise only those whose resource id equals the id of the VM bound to
    # each given job/index pair.
    #
    # @param vms [Array<Array>, nil] list of [job, index] pairs
    def reset(vms=nil)
      if vms
        vms.each do |job, index|
          instance = @instance_manager.find_by_name(@deployment.name, job, index)
          Models::DeploymentProblem.where(deployment: deployment,
                                          :resource_id => instance.vm.id,
                                          :state => "open").update(state: "closed")
        end
      else
        Models::DeploymentProblem.where(state: "open", deployment: deployment).update(state: "closed")
      end
    end

    # Scans every persistent disk whose instance belongs to the deployment and
    # reports a per-result summary to the event log.
    def scan_disks
      disks = Models::PersistentDisk.eager(:instance).all.select do |disk|
        disk.instance && disk.instance.deployment_id == deployment.id
      end
      results = Hash.new(0)

      begin_stage("Scanning #{disks.size} persistent disks", 2)

      track_and_log("Looking for inactive disks") do
        disks.each do |disk|
          scan_result = scan_disk(disk)
          results[scan_result] += 1
        end
      end

      track_and_log("#{results[:ok]} OK, " +
                    "#{results[:inactive]} inactive, " +
                    "#{results[:mount_info_mismatch]} mount-info mismatch")
    end

    # Scans the given VMs (or every VM of the deployment when +vms+ is nil) in
    # a thread pool and reports a per-result summary to the event log.
    #
    # @param vms [Array<Array>, nil] list of [job, index] pairs
    def scan_vms(vms=nil)
      if vms
        vms = vms.map do |job, index|
          @instance_manager.find_by_name(@deployment.name, job, index).vm
        end
      else
        vms = Models::Vm.eager(:instance).filter(deployment: deployment).all
      end

      begin_stage("Scanning #{vms.size} VMs", 2)
      results = Hash.new(0)
      lock = Mutex.new

      track_and_log("Checking VM states") do
        ThreadPool.new(:max_threads => Config.max_threads).wrap do |pool|
          vms.each do |vm|
            pool.process do
              scan_result = scan_vm(vm)
              # results is shared between pool workers
              lock.synchronize { results[scan_result] += 1 }
            end
          end
        end
      end

      track_and_log("#{results[:ok]} OK, " +
                    "#{results[:unresponsive]} unresponsive, " +
                    "#{results[:missing]} missing, " +
                    "#{results[:unbound]} unbound, " +
                    "#{results[:out_of_sync]} out of sync")
    end

    # Checks a single persistent disk against the disk owners recorded by
    # scan_vm.
    #
    # @param disk persistent disk model
    # @return [Symbol] :inactive, :mount_info_mismatch or :ok
    def scan_disk(disk)
      # inactive disks
      unless disk.active
        logger.info("Found inactive disk: #{disk.id}")
        problem_found(:inactive_disk, disk)
        return :inactive
      end

      disk_cid = disk.disk_cid
      vm_cid = nil

      if disk.instance && disk.instance.vm
        vm_cid = disk.instance.vm.cid
      end

      if vm_cid.nil?
        # With the db dependencies this should not happen.
        logger.warn("Disk #{disk_cid} is not associated to any VM. " +
                    "Skipping scan")
        return :ok
      end

      owner_vms = get_disk_owners(disk_cid) || []
      # active disk is not mounted or mounted more than once -or-
      # the disk is mounted on a vm that is different from the record.
      if owner_vms.size != 1 || owner_vms.first != vm_cid
        logger.info("Found problem in mount info: " +
                    "active disk #{disk_cid} mounted on " +
                    "#{owner_vms.join(', ')}")
        problem_found(:mount_info_mismatch, disk, :owner_vms => owner_vms)
        return :mount_info_mismatch
      end
      :ok
    end

    # Queries the agent of a single VM, records which disk it has mounted and
    # registers any problem found.
    #
    # @param vm VM model
    # @return [Symbol] :ok, :out_of_sync, :unbound, :missing or :unresponsive
    def scan_vm(vm)
      agent_options = {
        :timeout => AGENT_TIMEOUT,
        :retry_methods => {:get_state => 0}
      }

      instance = nil
      mounted_disk_cid = nil
      @problem_lock.synchronize do
        instance = vm.instance
        mounted_disk_cid = instance.persistent_disk_cid if instance
      end

      agent = AgentClient.new(vm.agent_id, agent_options)
      begin
        state = agent.get_state

        # gather mounted disk info. (used by scan_disk)
        begin
          disk_list = agent.list_disk
          mounted_disk_cid = disk_list.first
        rescue Bosh::Director::RpcTimeout
          mounted_disk_cid = nil
        rescue RuntimeError
          # For old agents that don't implement list_disk we assume the disk is mounted
          logger.info("agent.list_disk failed on agent #{vm.agent_id}")
        end
        add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

        return :out_of_sync if is_out_of_sync_vm?(vm, instance, state)
        return :unbound if is_unbound_instance_vm?(vm, instance, state)
        :ok
      rescue Bosh::Director::RpcTimeout
        # We add the disk to avoid a duplicate problem when timeouts fetching agent status (unresponsive_agent and
        # mount_info_mismatch)
        add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

        begin
          unless cloud.has_vm?(vm.cid)
            logger.info("Missing VM #{vm.cid}")
            problem_found(:missing_vm, vm)
            return :missing
          end
        rescue Bosh::Clouds::NotImplemented
          # The cloud cannot tell us whether the VM exists; fall through and
          # report the agent as unresponsive.
        end

        logger.info("Found unresponsive agent #{vm.agent_id}")
        problem_found(:unresponsive_agent, vm)
        :unresponsive
      end
    end

    # Creates an open problem for the resource, or updates the existing open
    # one (new data, last-seen time, incremented counter).
    #
    # @param type [Symbol] problem type
    # @param resource model the problem refers to (its +id+ is stored)
    # @param data [Hash] extra problem data
    # @raise [CloudcheckTooManySimilarProblems] when more than one open
    #   problem of this type already exists for the resource
    def problem_found(type, resource, data = {})
      @problem_lock.synchronize do
        similar_open_problems = Models::DeploymentProblem.
          filter(:deployment_id => deployment.id, :type => type.to_s,
                 :resource_id => resource.id, :state => "open").all

        if similar_open_problems.size > 1
          raise CloudcheckTooManySimilarProblems,
                "More than one problem of type `#{type}' " +
                "exists for resource #{type} #{resource.id}"
        end

        if similar_open_problems.empty?
          problem = Models::DeploymentProblem.
            create(:type => type.to_s, :resource_id => resource.id,
                   :state => "open", :deployment_id => deployment.id,
                   :data => data, :counter => 1)

          logger.info("Created problem #{problem.id} (#{problem.type})")
        else
          # This assumes we are running with deployment lock acquired,
          # so there is no possible update conflict
          problem = similar_open_problems[0]
          problem.data = data
          problem.last_seen_at = Time.now
          problem.counter += 1
          problem.save
          logger.info("Updated problem #{problem.id} (#{problem.type}), " +
                      "count is now #{problem.counter}")
        end
      end
    end


    private
    attr_reader :deployment

    # Registers an :out_of_sync_vm problem when the agent reports a different
    # deployment, or a different job/index than the bound instance.
    #
    # @return [Boolean] true when a problem was registered
    def is_out_of_sync_vm?(vm, instance, state)
      job = state["job"] ? state["job"]["name"] : nil
      index = state["index"]
      if state["deployment"] != deployment.name ||
         (instance && (instance.job != job || instance.index != index))
        problem_found(:out_of_sync_vm, vm,
                      :deployment => state["deployment"],
                      :job => job, :index => index)
        true
      else
        false
      end
    end

    # Registers an :unbound_instance_vm problem when the agent reports a job
    # but the VM has no instance.
    #
    # @return [Boolean] true when a problem was registered
    def is_unbound_instance_vm?(vm, instance, state)
      job = state["job"] ? state["job"]["name"] : nil
      index = state["index"]
      if job && !instance
        logger.info("Found unbound VM #{vm.agent_id}")
        problem_found(:unbound_instance_vm, vm,
                      :job => job, :index => index)
        true
      else
        false
      end
    end

    # Records that +vm_cid+ has +disk_cid+ mounted. Called from thread-pool
    # workers, so the check-then-append runs under a lock to avoid losing an
    # owner when two VMs report the same disk.
    def add_disk_owner(disk_cid, vm_cid)
      @agent_disks_lock.synchronize do
        @agent_disks[disk_cid] ||= []
        @agent_disks[disk_cid] << vm_cid
      end
    end

    # @return [Array, nil] VM cids recorded for the disk, nil when none
    def get_disk_owners(disk_cid)
      @agent_disks_lock.synchronize { @agent_disks[disk_cid] }
    end

    def cloud
      Config.cloud
    end
  end
end