bosh-director 1.2682.1.0 → 1.2685.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/bin/bosh-director +55 -2
  3. data/lib/bosh/director.rb +15 -2
  4. data/lib/bosh/director/api/controllers/backups_controller.rb +2 -2
  5. data/lib/bosh/director/api/controllers/base_controller.rb +1 -1
  6. data/lib/bosh/director/api/controllers/compiled_packages_controller.rb +2 -2
  7. data/lib/bosh/director/api/controllers/deployments_controller.rb +40 -26
  8. data/lib/bosh/director/api/controllers/info_controller.rb +1 -1
  9. data/lib/bosh/director/api/controllers/locks_controller.rb +1 -1
  10. data/lib/bosh/director/api/controllers/packages_controller.rb +1 -1
  11. data/lib/bosh/director/api/controllers/releases_controller.rb +5 -5
  12. data/lib/bosh/director/api/controllers/resources_controller.rb +1 -1
  13. data/lib/bosh/director/api/controllers/resurrection_controller.rb +1 -1
  14. data/lib/bosh/director/api/controllers/stemcells_controller.rb +4 -4
  15. data/lib/bosh/director/api/controllers/task_controller.rb +20 -0
  16. data/lib/bosh/director/api/controllers/tasks_controller.rb +3 -18
  17. data/lib/bosh/director/api/controllers/users_controller.rb +3 -3
  18. data/lib/bosh/director/api/deployment_lookup.rb +1 -1
  19. data/lib/bosh/director/api/resource_manager.rb +1 -1
  20. data/lib/bosh/director/instance_updater/vm_updater.rb +2 -1
  21. data/lib/bosh/director/jobs/cloud_check/scan.rb +1 -1
  22. data/lib/bosh/director/jobs/cloud_check/scan_and_fix.rb +1 -1
  23. data/lib/bosh/director/problem_handlers/missing_disk.rb +74 -0
  24. data/lib/bosh/director/problem_scanner/disk_scan_stage.rb +80 -0
  25. data/lib/bosh/director/problem_scanner/problem_register.rb +55 -0
  26. data/lib/bosh/director/problem_scanner/scanner.rb +86 -0
  27. data/lib/bosh/director/problem_scanner/vm_scan_stage.rb +134 -0
  28. data/lib/bosh/director/version.rb +1 -1
  29. data/lib/cloud/dummy.rb +9 -0
  30. metadata +29 -26
  31. data/lib/bosh/director/api/controller.rb +0 -33
  32. data/lib/bosh/director/api/controllers/errands_controller.rb +0 -26
  33. data/lib/bosh/director/problem_scanner.rb +0 -268
@@ -1,268 +0,0 @@
# Copyright (c) 2009-2012 VMware, Inc.

module Bosh::Director
  # Scans the VMs and persistent disks of one deployment and records every
  # anomaly it finds as an open Models::DeploymentProblem row.
  #
  # VM scanning runs on a thread pool, so all state shared between workers
  # (problem bookkeeping and the disk-owner map) is guarded by mutexes.
  class ProblemScanner

    AGENT_TIMEOUT = 10 # seconds

    attr_reader :event_log, :logger

    @queue = :normal

    # @param [Object] deployment Deployment record; this class reads its
    #   `name` and `id` and passes it to model queries.
    def initialize(deployment)
      @deployment = deployment
      @instance_manager = Api::InstanceManager.new

      # Serializes problem creation/update and the per-VM instance lookup.
      @problem_lock = Mutex.new

      # Maps disk CID => list of VM CIDs that reported (or are recorded as)
      # having the disk. Filled in by scan_vm from thread-pool workers, so it
      # needs its own lock.
      @agent_disks = {}
      @agent_disks_lock = Mutex.new

      #temp
      @event_log = Config.event_log
      @logger = Config.logger
    end

    # Starts a new event-log stage and mirrors its name to the logger.
    #
    # @param [String] stage_name Stage title
    # @param [Integer] n_steps Number of steps in the stage
    def begin_stage(stage_name, n_steps)
      event_log.begin_stage(stage_name, n_steps)
      logger.info(stage_name)
    end

    # Tracks a task in the event log, optionally logging its name, and yields
    # the ticker to the given block (if any).
    #
    # @param [String] task Task description
    # @param [Boolean] log Whether to also write the description to the logger
    def track_and_log(task, log = true)
      event_log.track(task) do |ticker|
        logger.info(task) if log
        yield ticker if block_given?
      end
    end

    # Closes open problems of this deployment.
    #
    # @param [Enumerable, nil] vms Optional (job, index) pairs. When given,
    #   only open problems whose resource_id equals the id of the matching
    #   instance's VM are closed; otherwise every open problem is closed.
    def reset(vms=nil)
      if vms
        vms.each do |job, index|
          instance = @instance_manager.find_by_name(@deployment.name, job, index)
          # NOTE(review): the filter matches on resource_id only, not on
          # problem type.
          Models::DeploymentProblem.where(:deployment => deployment,
                                          :resource_id => instance.vm.id,
                                          :state => "open").update(:state => "closed")
        end
      else
        Models::DeploymentProblem.where(:state => "open", :deployment => deployment).update(:state => "closed")
      end
    end

    # Scans every persistent disk whose instance belongs to this deployment
    # and reports a per-result summary through the event log.
    def scan_disks
      disks = Models::PersistentDisk.eager(:instance).all.select do |disk|
        disk.instance && disk.instance.deployment_id == deployment.id
      end
      results = Hash.new(0)

      begin_stage("Scanning #{disks.size} persistent disks", 2)

      track_and_log("Looking for inactive disks") do
        disks.each do |disk|
          scan_result = scan_disk(disk)
          results[scan_result] += 1
        end
      end

      track_and_log("#{results[:ok]} OK, " +
                    "#{results[:inactive]} inactive, " +
                    "#{results[:mount_info_mismatch]} mount-info mismatch")
    end

    # Scans VMs of this deployment on a thread pool and reports a per-result
    # summary through the event log.
    #
    # @param [Enumerable, nil] vms Optional (job, index) pairs restricting the
    #   scan to the VMs of those instances; all deployment VMs when nil.
    def scan_vms(vms=nil)
      if vms
        vms = vms.map do |job, index|
          @instance_manager.find_by_name(@deployment.name, job, index).vm
        end
      else
        vms = Models::Vm.eager(:instance).filter(:deployment => deployment).all
      end

      begin_stage("Scanning #{vms.size} VMs", 2)
      results = Hash.new(0)
      lock = Mutex.new

      track_and_log("Checking VM states") do
        ThreadPool.new(:max_threads => Config.max_threads).wrap do |pool|
          vms.each do |vm|
            pool.process do
              scan_result = scan_vm(vm)
              # results is shared between pool workers.
              lock.synchronize { results[scan_result] += 1 }
            end
          end
        end
      end

      track_and_log("#{results[:ok]} OK, " +
                    "#{results[:unresponsive]} unresponsive, " +
                    "#{results[:missing]} missing, " +
                    "#{results[:unbound]} unbound, " +
                    "#{results[:out_of_sync]} out of sync")
    end

    # Checks a single persistent disk against the disk-owner map.
    #
    # @param [Object] disk Persistent disk record
    # @return [Symbol] :inactive, :mount_info_mismatch or :ok
    def scan_disk(disk)
      # inactive disks
      unless disk.active
        logger.info("Found inactive disk: #{disk.id}")
        problem_found(:inactive_disk, disk)
        return :inactive
      end

      disk_cid = disk.disk_cid
      vm_cid = nil

      if disk.instance && disk.instance.vm
        vm_cid = disk.instance.vm.cid
      end

      if vm_cid.nil?
        # With the db dependencies this should not happen.
        logger.warn("Disk #{disk_cid} is not associated to any VM. " +
                    "Skipping scan")
        return :ok
      end

      owner_vms = get_disk_owners(disk_cid) || []
      # The active disk is not mounted, or is mounted more than once, -or-
      # it is mounted on a VM that is different from the one on record.
      if owner_vms.size != 1 || owner_vms.first != vm_cid
        logger.info("Found problem in mount info: " +
                    "active disk #{disk_cid} mounted on " +
                    "#{owner_vms.join(', ')}")
        problem_found(:mount_info_mismatch, disk, :owner_vms => owner_vms)
        return :mount_info_mismatch
      end
      :ok
    end

    # Queries a VM's agent and classifies the VM.
    #
    # As a side effect records which disk the VM holds (used by scan_disk).
    #
    # @param [Object] vm VM record
    # @return [Symbol] :ok, :out_of_sync, :unbound, :missing or :unresponsive
    def scan_vm(vm)
      agent_options = {
        :timeout => AGENT_TIMEOUT,
        :retry_methods => {:get_state => 0}
      }

      instance = nil
      mounted_disk_cid = nil
      # Presumably serializes model access across pool workers - TODO confirm.
      @problem_lock.synchronize do
        instance = vm.instance
        mounted_disk_cid = instance.persistent_disk_cid if instance
      end

      agent = AgentClient.with_defaults(vm.agent_id, agent_options)
      begin
        state = agent.get_state

        # gather mounted disk info. (used by scan_disk)
        begin
          disk_list = agent.list_disk
          mounted_disk_cid = disk_list.first
        rescue Bosh::Director::RpcTimeout
          mounted_disk_cid = nil
        rescue RuntimeError
          # For old agents that don't implement list_disk we assume the disk
          # is mounted, i.e. we keep the CID taken from the instance record.
          logger.info("agent.list_disk failed on agent #{vm.agent_id}")
        end
        add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

        return :out_of_sync if is_out_of_sync_vm?(vm, instance, state)
        return :unbound if is_unbound_instance_vm?(vm, instance, state)
        :ok
      rescue Bosh::Director::RpcTimeout
        # We add the disk to avoid a duplicate problem when timeouts fetching
        # agent status (unresponsive_agent and mount_info_mismatch)
        add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

        begin
          unless cloud.has_vm?(vm.cid)
            logger.info("Missing VM #{vm.cid}")
            problem_found(:missing_vm, vm)
            return :missing
          end
        rescue Bosh::Clouds::NotImplemented
          # The cloud cannot tell whether the VM exists; fall through and
          # report the agent as unresponsive instead.
        end

        logger.info("Found unresponsive agent #{vm.agent_id}")
        problem_found(:unresponsive_agent, vm)
        :unresponsive
      end
    end

    # Records a problem: creates a new open problem, or refreshes the single
    # matching open problem (data, last_seen_at, counter).
    #
    # @param [Symbol] type Problem type
    # @param [Object] resource Record the problem refers to (its `id` is stored)
    # @param [Hash] data Extra problem data
    # @raise [CloudcheckTooManySimilarProblems] when more than one matching
    #   open problem already exists
    def problem_found(type, resource, data = {})
      @problem_lock.synchronize do
        similar_open_problems = Models::DeploymentProblem.
          filter(:deployment_id => deployment.id, :type => type.to_s,
                 :resource_id => resource.id, :state => "open").all

        if similar_open_problems.size > 1
          raise CloudcheckTooManySimilarProblems,
                "More than one problem of type `#{type}' " +
                "exists for resource #{type} #{resource.id}"
        end

        if similar_open_problems.empty?
          problem = Models::DeploymentProblem.
            create(:type => type.to_s, :resource_id => resource.id,
                   :state => "open", :deployment_id => deployment.id,
                   :data => data, :counter => 1)

          logger.info("Created problem #{problem.id} (#{problem.type})")
        else
          # This assumes we are running with deployment lock acquired,
          # so there is no possible update conflict
          problem = similar_open_problems[0]
          problem.data = data
          problem.last_seen_at = Time.now
          problem.counter += 1
          problem.save
          logger.info("Updated problem #{problem.id} (#{problem.type}), " +
                      "count is now #{problem.counter}")
        end
      end
    end


    private
    attr_reader :deployment

    # Extracts the [job name, index] pair from an agent state hash; the job
    # name is nil when the state carries no "job" entry.
    def job_and_index(state)
      job = state["job"] ? state["job"]["name"] : nil
      [job, state["index"]]
    end

    # Registers an :out_of_sync_vm problem and returns true when the agent
    # reports a different deployment, or a job/index that differs from the
    # instance on record; returns false otherwise.
    def is_out_of_sync_vm?(vm, instance, state)
      job, index = job_and_index(state)
      out_of_sync = state["deployment"] != deployment.name ||
        (instance && (instance.job != job || instance.index != index))
      return false unless out_of_sync

      problem_found(:out_of_sync_vm, vm,
                    :deployment => state["deployment"],
                    :job => job, :index => index)
      true
    end

    # Registers an :unbound_instance_vm problem and returns true when the
    # agent reports a job but the VM has no instance on record; returns false
    # otherwise.
    def is_unbound_instance_vm?(vm, instance, state)
      job, index = job_and_index(state)
      return false unless job && !instance

      logger.info("Found unbound VM #{vm.agent_id}")
      problem_found(:unbound_instance_vm, vm,
                    :job => job, :index => index)
      true
    end

    # Remembers that the VM holds the disk. Called from thread-pool workers,
    # hence the lock around the check-then-append.
    def add_disk_owner(disk_cid, vm_cid)
      @agent_disks_lock.synchronize do
        (@agent_disks[disk_cid] ||= []) << vm_cid
      end
    end

    # @return [Array, nil] VM CIDs recorded for the disk, nil when none
    def get_disk_owners(disk_cid)
      @agent_disks_lock.synchronize { @agent_disks[disk_cid] }
    end

    def cloud
      Config.cloud
    end
  end
end