bosh_aws_cpi 0.7.0 → 1.5.0.pre.1113

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/README.md +22 -19
  2. data/bin/bosh_aws_console +1 -13
  3. data/lib/bosh_aws_cpi.rb +1 -1
  4. data/lib/cloud/aws/aki_picker.rb +7 -7
  5. data/lib/cloud/aws/availability_zone_selector.rb +40 -0
  6. data/lib/cloud/aws/cloud.rb +359 -476
  7. data/lib/cloud/aws/dynamic_network.rb +0 -6
  8. data/lib/cloud/aws/helpers.rb +10 -68
  9. data/lib/cloud/aws/instance_manager.rb +171 -0
  10. data/lib/cloud/aws/manual_network.rb +26 -0
  11. data/lib/cloud/aws/network_configurator.rb +33 -62
  12. data/lib/cloud/aws/resource_wait.rb +189 -0
  13. data/lib/cloud/aws/stemcell.rb +68 -0
  14. data/lib/cloud/aws/stemcell_creator.rb +114 -0
  15. data/lib/cloud/aws/tag_manager.rb +30 -0
  16. data/lib/cloud/aws/version.rb +1 -1
  17. data/lib/cloud/aws/vip_network.rb +9 -7
  18. data/lib/cloud/aws.rb +11 -2
  19. data/scripts/stemcell-copy.sh +37 -0
  20. metadata +45 -81
  21. data/Rakefile +0 -50
  22. data/lib/cloud/aws/registry_client.rb +0 -109
  23. data/spec/assets/stemcell-copy +0 -31
  24. data/spec/integration/cpi_test.rb +0 -78
  25. data/spec/spec_helper.rb +0 -121
  26. data/spec/unit/aki_picker_spec.rb +0 -29
  27. data/spec/unit/attach_disk_spec.rb +0 -143
  28. data/spec/unit/cloud_spec.rb +0 -32
  29. data/spec/unit/configure_networks_spec.rb +0 -113
  30. data/spec/unit/create_disk_spec.rb +0 -73
  31. data/spec/unit/create_stemcell_spec.rb +0 -113
  32. data/spec/unit/create_vm_spec.rb +0 -249
  33. data/spec/unit/delete_disk_spec.rb +0 -34
  34. data/spec/unit/delete_stemcell_spec.rb +0 -29
  35. data/spec/unit/delete_vm_spec.rb +0 -25
  36. data/spec/unit/detach_disk_spec.rb +0 -63
  37. data/spec/unit/helpers_spec.rb +0 -64
  38. data/spec/unit/network_configurator_spec.rb +0 -57
  39. data/spec/unit/reboot_vm_spec.rb +0 -38
  40. data/spec/unit/set_vm_metadata_spec.rb +0 -30
  41. data/spec/unit/validate_deployment_spec.rb +0 -16
@@ -7,16 +7,12 @@ module Bosh::AwsCloud
7
7
 
8
8
  # default maximum number of times to retry an AWS API call
9
9
  DEFAULT_MAX_RETRIES = 2
10
- # default availability zone for instances and disks
11
- DEFAULT_AVAILABILITY_ZONE = "us-east-1a"
12
- DEFAULT_EC2_ENDPOINT = "ec2.amazonaws.com"
13
- METADATA_TIMEOUT = 5 # in seconds
10
+ METADATA_TIMEOUT = 5 # in seconds
14
11
  DEVICE_POLL_TIMEOUT = 60 # in seconds
15
- MAX_TAG_KEY_LENGTH = 127
16
- MAX_TAG_VALUE_LENGTH = 255
17
12
 
18
- attr_reader :ec2
19
- attr_reader :registry
13
+ attr_reader :ec2
14
+ attr_reader :registry
15
+ attr_reader :options
20
16
  attr_accessor :logger
21
17
 
22
18
  ##
@@ -26,48 +22,43 @@ module Bosh::AwsCloud
26
22
  # @option options [Hash] agent agent options
27
23
  # @option options [Hash] registry registry options
28
24
  def initialize(options)
29
- @options = options.dup
30
-
25
+ @options = options.dup.freeze
31
26
  validate_options
32
27
 
33
28
  @logger = Bosh::Clouds::Config.logger
34
29
 
35
- @aws_logger = @logger # TODO make configurable
36
-
37
- @agent_properties = @options["agent"] || {}
38
- @aws_properties = @options["aws"]
39
- @aws_region = @aws_properties.delete("region")
40
- @registry_properties = @options["registry"]
30
+ initialize_aws
31
+ initialize_registry
41
32
 
42
- @default_key_name = @aws_properties["default_key_name"]
43
- @default_security_groups = @aws_properties["default_security_groups"]
33
+ @metadata_lock = Mutex.new
34
+ end
44
35
 
45
- aws_params = {
46
- :access_key_id => @aws_properties["access_key_id"],
47
- :secret_access_key => @aws_properties["secret_access_key"],
48
- :ec2_endpoint => @aws_properties["ec2_endpoint"] || default_ec2_endpoint,
49
- :max_retries => @aws_properties["max_retries"] || DEFAULT_MAX_RETRIES,
50
- :logger => @aws_logger
51
- }
36
+ ##
37
+ # Reads current instance id from EC2 metadata. We are assuming
38
+ # instance id cannot change while current process is running
39
+ # and thus memoizing it.
40
+ def current_vm_id
41
+ @metadata_lock.synchronize do
42
+ return @current_vm_id if @current_vm_id
52
43
 
53
- registry_endpoint = @registry_properties["endpoint"]
54
- registry_user = @registry_properties["user"]
55
- registry_password = @registry_properties["password"]
44
+ client = HTTPClient.new
45
+ client.connect_timeout = METADATA_TIMEOUT
46
+ # Using 169.254.169.254 is an EC2 convention for getting
47
+ # instance metadata
48
+ uri = "http://169.254.169.254/latest/meta-data/instance-id/"
56
49
 
57
- # AWS Ruby SDK is threadsafe but Ruby autoload isn't,
58
- # so we need to trigger eager autoload while constructing CPI
59
- AWS.eager_autoload!
60
- @ec2 = AWS::EC2.new(aws_params)
50
+ response = client.get(uri)
51
+ unless response.status == 200
52
+ cloud_error("Instance metadata endpoint returned " \
53
+ "HTTP #{response.status}")
54
+ end
61
55
 
62
- # Registry updates are not really atomic in relation to
63
- # EC2 API calls, so they might get out of sync. Cloudcheck
64
- # is supposed to fix that.
65
- @registry = RegistryClient.new(registry_endpoint,
66
- registry_user,
67
- registry_password)
56
+ @current_vm_id = response.body
57
+ end
68
58
 
69
- @aki_picker = AKIPicker.new(@ec2)
70
- @metadata_lock = Mutex.new
59
+ rescue HTTPClient::TimeoutError
60
+ cloud_error("Timed out reading instance metadata, " \
61
+ "please make sure CPI is running on EC2 instance")
71
62
  end
72
63
 
73
64
  ##
@@ -86,49 +77,43 @@ module Bosh::AwsCloud
86
77
  # @param [optional, Hash] environment data to be merged into
87
78
  # agent settings
88
79
  # @return [String] EC2 instance id of the new virtual machine
89
- def create_vm(agent_id, stemcell_id, resource_pool,
90
- network_spec, disk_locality = nil, environment = nil)
80
+ def create_vm(agent_id, stemcell_id, resource_pool, network_spec, disk_locality = nil, environment = nil)
91
81
  with_thread_name("create_vm(#{agent_id}, ...)") do
92
- network_configurator = NetworkConfigurator.new(network_spec)
93
-
94
- security_groups =
95
- network_configurator.security_groups(@default_security_groups)
96
- @logger.debug("using security groups: #{security_groups.join(', ')}")
82
+ # do this early to fail fast
83
+ stemcell = Stemcell.find(region, stemcell_id)
97
84
 
98
- response = @ec2.client.describe_images(:image_ids => [stemcell_id])
99
- images_set = response.images_set
100
- if images_set.empty?
101
- cloud_error("no stemcell info for #{stemcell_id}")
85
+ begin
86
+ instance_manager = InstanceManager.new(region, registry, az_selector)
87
+ instance = instance_manager.
88
+ create(agent_id, stemcell_id, resource_pool, network_spec, (disk_locality || []), environment, options)
89
+
90
+ logger.info("Creating new instance '#{instance.id}'")
91
+
92
+ NetworkConfigurator.new(network_spec).configure(region, instance)
93
+
94
+ registry_settings = initial_agent_settings(
95
+ agent_id,
96
+ network_spec,
97
+ environment,
98
+ stemcell.root_device_name,
99
+ )
100
+ registry.update_settings(instance.id, registry_settings)
101
+
102
+ instance.id
103
+ rescue => e # is this rescuing too much?
104
+ logger.error(%Q[Failed to create instance: #{e.message}\n#{e.backtrace.join("\n")}])
105
+ instance_manager.terminate(instance.id, fast_path_delete?) if instance
106
+ raise e
102
107
  end
103
- root_device_name = images_set.first.root_device_name
104
-
105
- instance_params = {
106
- :image_id => stemcell_id,
107
- :count => 1,
108
- :key_name => resource_pool["key_name"] || @default_key_name,
109
- :security_groups => security_groups,
110
- :instance_type => resource_pool["instance_type"],
111
- :user_data => Yajl::Encoder.encode(user_data(network_spec))
112
- }
113
-
114
- instance_params[:availability_zone] =
115
- select_availability_zone(disk_locality,
116
- resource_pool["availability_zone"])
117
-
118
- @logger.info("Creating new instance...")
119
- instance = @ec2.instances.create(instance_params)
120
-
121
- @logger.info("Creating new instance `#{instance.id}'")
122
- wait_resource(instance, :running)
123
-
124
- network_configurator.configure(@ec2, instance)
108
+ end
109
+ end
125
110
 
126
- settings = initial_agent_settings(agent_id, network_spec, environment,
127
- root_device_name)
128
- @registry.update_settings(instance.id, settings)
111
+ def default_ec2_endpoint
112
+ ['ec2', aws_region, 'amazonaws.com'].compact.join('.')
113
+ end
129
114
 
130
- instance.id
131
- end
115
+ def default_elb_endpoint
116
+ ['elasticloadbalancing', aws_region, 'amazonaws.com'].compact.join('.')
132
117
  end
133
118
 
134
119
  ##
@@ -137,20 +122,8 @@ module Bosh::AwsCloud
137
122
  # @param [String] instance_id EC2 instance id
138
123
  def delete_vm(instance_id)
139
124
  with_thread_name("delete_vm(#{instance_id})") do
140
- instance = @ec2.instances[instance_id]
141
-
142
- instance.terminate
143
-
144
- begin
145
- # TODO: should this be done before or after deleting VM?
146
- @logger.info("Deleting instance settings for `#{instance.id}'")
147
- @registry.delete_settings(instance.id)
148
-
149
- @logger.info("Deleting instance `#{instance.id}'")
150
- wait_resource(instance, :terminated)
151
- rescue AWS::EC2::Errors::InvalidInstanceID::NotFound
152
- # It's OK, just means that instance has already been deleted
153
- end
125
+ logger.info("Deleting instance '#{instance_id}'")
126
+ InstanceManager.new(region, registry).terminate(instance_id, fast_path_delete?)
154
127
  end
155
128
  end
156
129
 
@@ -159,8 +132,16 @@ module Bosh::AwsCloud
159
132
  # @param [String] instance_id EC2 instance id
160
133
  def reboot_vm(instance_id)
161
134
  with_thread_name("reboot_vm(#{instance_id})") do
162
- instance = @ec2.instances[instance_id]
163
- soft_reboot(instance)
135
+ InstanceManager.new(region, registry).reboot(instance_id)
136
+ end
137
+ end
138
+
139
+ ##
140
+ # Has EC2 instance
141
+ # @param [String] instance_id EC2 instance id
142
+ def has_vm?(instance_id)
143
+ with_thread_name("has_vm?(#{instance_id})") do
144
+ InstanceManager.new(region, registry).has_instance?(instance_id)
164
145
  end
165
146
  end
166
147
 
@@ -172,40 +153,26 @@ module Bosh::AwsCloud
172
153
  # @return [String] created EBS volume id
173
154
  def create_disk(size, instance_id = nil)
174
155
  with_thread_name("create_disk(#{size}, #{instance_id})") do
175
- unless size.kind_of?(Integer)
176
- raise ArgumentError, "disk size needs to be an integer"
177
- end
156
+ validate_disk_size(size)
178
157
 
179
- if size < 1024
180
- cloud_error("AWS CPI minimum disk size is 1 GiB")
181
- end
158
+ # if the disk is created for an instance, use the same availability zone as they must match
159
+ volume = @ec2.volumes.create(:size => (size / 1024.0).ceil,
160
+ :availability_zone => @az_selector.select_availability_zone(instance_id))
182
161
 
183
- if size > 1024 * 1000
184
- cloud_error("AWS CPI maximum disk size is 1 TiB")
185
- end
186
-
187
- # if the disk is created for an instance, use the same availability
188
- # zone as they must match
189
- if instance_id
190
- instance = @ec2.instances[instance_id]
191
- availability_zone = instance.availability_zone
192
- else
193
- availability_zone = default_availability_zone
194
- end
195
-
196
- volume_params = {
197
- :size => (size / 1024.0).ceil,
198
- :availability_zone => availability_zone
199
- }
200
-
201
- volume = @ec2.volumes.create(volume_params)
202
- @logger.info("Creating volume `#{volume.id}'")
203
- wait_resource(volume, :available)
162
+ logger.info("Creating volume '#{volume.id}'")
163
+ ResourceWait.for_volume(volume: volume, state: :available)
204
164
 
205
165
  volume.id
206
166
  end
207
167
  end
208
168
 
169
+ def validate_disk_size(size)
170
+ raise ArgumentError, "disk size needs to be an integer" unless size.kind_of?(Integer)
171
+
172
+ cloud_error("AWS CPI minimum disk size is 1 GiB") if size < 1024
173
+ cloud_error("AWS CPI maximum disk size is 1 TiB") if size > 1024 * 1000
174
+ end
175
+
209
176
  ##
210
177
  # Delete EBS volume
211
178
  # @param [String] disk_id EBS volume id
@@ -213,22 +180,35 @@ module Bosh::AwsCloud
213
180
  def delete_disk(disk_id)
214
181
  with_thread_name("delete_disk(#{disk_id})") do
215
182
  volume = @ec2.volumes[disk_id]
216
- state = volume.state
217
183
 
218
- if state != :available
219
- cloud_error("Cannot delete volume `#{volume.id}', state is #{state}")
184
+ logger.info("Deleting volume `#{volume.id}'")
185
+
186
+ tries = 10
187
+ sleep_cb = ResourceWait.sleep_callback("Waiting for volume `#{volume.id}' to be deleted", tries)
188
+ ensure_cb = Proc.new do |retries|
189
+ cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries == tries
220
190
  end
191
+ error = AWS::EC2::Errors::Client::VolumeInUse
221
192
 
222
- volume.delete
193
+ Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: error, ensure: ensure_cb) do
194
+ volume.delete
195
+ true # return true to only retry on Exceptions
196
+ end
223
197
 
224
- begin
225
- @logger.info("Deleting volume `#{volume.id}'")
226
- wait_resource(volume, :deleted)
227
- rescue AWS::EC2::Errors::InvalidVolume::NotFound
228
- # It's OK, just means the volume has already been deleted
198
+ if fast_path_delete?
199
+ begin
200
+ TagManager.tag(volume, "Name", "to be deleted")
201
+ logger.info("Volume `#{disk_id}' has been marked for deletion")
202
+ rescue AWS::EC2::Errors::InvalidVolume::NotFound
203
+ # Once in a blue moon AWS is actually fast enough that the volume is already gone
204
+ # when we get here, and if it is, our work here is done!
205
+ end
206
+ return
229
207
  end
230
208
 
231
- @logger.info("Volume `#{disk_id}' has been deleted")
209
+ ResourceWait.for_volume(volume: volume, state: :deleted)
210
+
211
+ logger.info("Volume `#{disk_id}' has been deleted")
232
212
  end
233
213
  end
234
214
 
@@ -247,7 +227,7 @@ module Bosh::AwsCloud
247
227
  settings["disks"]["persistent"] ||= {}
248
228
  settings["disks"]["persistent"][disk_id] = device_name
249
229
  end
250
- @logger.info("Attached `#{disk_id}' to `#{instance_id}'")
230
+ logger.info("Attached `#{disk_id}' to `#{instance_id}'")
251
231
  end
252
232
  end
253
233
 
@@ -267,33 +247,76 @@ module Bosh::AwsCloud
267
247
 
268
248
  detach_ebs_volume(instance, volume)
269
249
 
270
- @logger.info("Detached `#{disk_id}' from `#{instance_id}'")
250
+ logger.info("Detached `#{disk_id}' from `#{instance_id}'")
251
+ end
252
+ end
253
+
254
+ def get_disks(vm_id)
255
+ disks = []
256
+ @ec2.instances[vm_id].block_devices.each do |block_device|
257
+ if block_device[:ebs]
258
+ disks << block_device[:ebs][:volume_id]
259
+ end
260
+ end
261
+ disks
262
+ end
263
+
264
+ # Take snapshot of disk
265
+ # @param [String] disk_id disk id of the disk to take the snapshot of
266
+ # @return [String] snapshot id
267
+ def snapshot_disk(disk_id, metadata)
268
+ with_thread_name("snapshot_disk(#{disk_id})") do
269
+ volume = @ec2.volumes[disk_id]
270
+ devices = []
271
+ volume.attachments.each {|attachment| devices << attachment.device}
272
+
273
+ name = [:deployment, :job, :index].collect { |key| metadata[key] }
274
+ name << devices.first.split('/').last unless devices.empty?
275
+
276
+ snapshot = volume.create_snapshot(name.join('/'))
277
+ logger.info("snapshot '#{snapshot.id}' of volume '#{disk_id}' created")
278
+
279
+ [:agent_id, :instance_id, :director_name, :director_uuid].each do |key|
280
+ TagManager.tag(snapshot, key, metadata[key])
281
+ end
282
+ TagManager.tag(snapshot, :device, devices.first) unless devices.empty?
283
+ TagManager.tag(snapshot, 'Name', name.join('/'))
284
+
285
+ ResourceWait.for_snapshot(snapshot: snapshot, state: :completed)
286
+ snapshot.id
287
+ end
288
+ end
289
+
290
+ # Delete a disk snapshot
291
+ # @param [String] snapshot_id snapshot id to delete
292
+ def delete_snapshot(snapshot_id)
293
+ with_thread_name("delete_snapshot(#{snapshot_id})") do
294
+ snapshot = @ec2.snapshots[snapshot_id]
295
+
296
+ if snapshot.status == :in_use
297
+ raise Bosh::Clouds::CloudError, "snapshot '#{snapshot.id}' can not be deleted as it is in use"
298
+ end
299
+
300
+ snapshot.delete
301
+ logger.info("snapshot '#{snapshot_id}' deleted")
271
302
  end
272
303
  end
273
304
 
274
305
  # Configure network for an EC2 instance
275
306
  # @param [String] instance_id EC2 instance id
276
307
  # @param [Hash] network_spec network properties
277
- # @raise [Bosh::Clouds:NotSupported] if the security groups change
308
+ # @raise [Bosh::Clouds::NotSupported] if there's a network change that requires the recreation of the VM
278
309
  def configure_networks(instance_id, network_spec)
279
310
  with_thread_name("configure_networks(#{instance_id}, ...)") do
280
- @logger.info("Configuring `#{instance_id}' to use the following " \
281
- "network settings: #{network_spec.pretty_inspect}")
311
+ logger.info("Configuring '#{instance_id}' to use new network settings: #{network_spec.pretty_inspect}")
282
312
 
283
- network_configurator = NetworkConfigurator.new(network_spec)
284
313
  instance = @ec2.instances[instance_id]
285
314
 
286
- actual = instance.security_groups.collect {|sg| sg.name }.sort
287
- new = network_configurator.security_groups(@default_security_groups)
315
+ network_configurator = NetworkConfigurator.new(network_spec)
316
+
317
+ compare_security_groups(instance, network_spec)
288
318
 
289
- # If the security groups change, we need to recreate the VM
290
- # as you can't change the security group of a running instance,
291
- # we need to send the InstanceUpdater a request to do it for us
292
- unless actual == new
293
- raise Bosh::Clouds::NotSupported,
294
- "security groups change requires VM recreation: %s to %s" %
295
- [actual.join(", "), new.join(", ")]
296
- end
319
+ compare_private_ip_addresses(instance, network_configurator.private_ip)
297
320
 
298
321
  network_configurator.configure(@ec2, instance)
299
322
 
@@ -303,6 +326,43 @@ module Bosh::AwsCloud
303
326
  end
304
327
  end
305
328
 
329
+ # If the security groups change, we need to recreate the VM
330
+ # as you can't change the security group of a running instance,
331
+ # we need to send the InstanceUpdater a request to do it for us
332
+ def compare_security_groups(instance, network_spec)
333
+ actual_group_names = instance.security_groups.collect { |sg| sg.name }
334
+ specified_group_names = extract_security_group_names(network_spec)
335
+ if specified_group_names.empty?
336
+ new_group_names = Array(aws_properties["default_security_groups"])
337
+ else
338
+ new_group_names = specified_group_names
339
+ end
340
+
341
+ unless actual_group_names.sort == new_group_names.sort
342
+ raise Bosh::Clouds::NotSupported,
343
+ "security groups change requires VM recreation: %s to %s" %
344
+ [actual_group_names.join(", "), new_group_names.join(", ")]
345
+ end
346
+ end
347
+
348
+ ##
349
+ # Compares actual instance private IP addresses with the IP address specified at the network spec
350
+ #
351
+ # @param [AWS::EC2::Instance] instance EC2 instance
352
+ # @param [String] specified_ip_address IP address specified at the network spec (if Manual Network)
353
+ # @return [void]
354
+ # @raise [Bosh::Clouds::NotSupported] If the IP address changes, we need to recreate the VM as you can't
355
+ # change the IP address of a running server, so we need to send the InstanceUpdater a request to do it for us
356
+ def compare_private_ip_addresses(instance, specified_ip_address)
357
+ actual_ip_address = instance.private_ip_address
358
+
359
+ unless specified_ip_address.nil? || actual_ip_address == specified_ip_address
360
+ raise Bosh::Clouds::NotSupported,
361
+ "IP address change requires VM recreation: %s to %s" %
362
+ [actual_ip_address, specified_ip_address]
363
+ end
364
+ end
365
+
306
366
  ##
307
367
  # Creates a new EC2 AMI using stemcell image.
308
368
  # This method can only be run on an EC2 instance, as image creation
@@ -319,43 +379,34 @@ module Bosh::AwsCloud
319
379
  # @option cloud_properties [String] disk (2048)
320
380
  # root disk size
321
381
  # @return [String] EC2 AMI name of the stemcell
322
- def create_stemcell(image_path, cloud_properties)
323
- # TODO: refactor into several smaller methods
382
+ def create_stemcell(image_path, stemcell_properties)
324
383
  with_thread_name("create_stemcell(#{image_path}...)") do
384
+ creator = StemcellCreator.new(region, stemcell_properties)
385
+
386
+ return creator.fake.id if creator.fake?
387
+
325
388
  begin
326
389
  # These three variables are used in 'ensure' clause
327
390
  instance = nil
328
391
  volume = nil
392
+
329
393
  # 1. Create and mount new EBS volume (2GB default)
330
- disk_size = cloud_properties["disk"] || 2048
331
- volume_id = create_disk(disk_size, current_instance_id)
394
+ disk_size = stemcell_properties["disk"] || 2048
395
+ volume_id = create_disk(disk_size, current_vm_id)
332
396
  volume = @ec2.volumes[volume_id]
333
- instance = @ec2.instances[current_instance_id]
397
+ instance = @ec2.instances[current_vm_id]
334
398
 
335
399
  sd_name = attach_ebs_volume(instance, volume)
336
400
  ebs_volume = find_ebs_device(sd_name)
337
401
 
338
- # 2. Copy image to new EBS volume
339
- @logger.info("Copying stemcell disk image to '#{ebs_volume}'")
340
- copy_root_image(image_path, ebs_volume)
341
-
342
- # 3. Create snapshot and then an image using this snapshot
343
- snapshot = volume.create_snapshot
344
- wait_resource(snapshot, :completed)
345
-
346
- params = image_params(cloud_properties, snapshot.id)
347
- image = @ec2.images.create(params)
348
- wait_resource(image, :available, :state)
349
-
350
- tag(image, "Name", params[:description]) if params[:description]
351
-
352
- image.id
402
+ logger.info("Creating stemcell with: '#{volume.id}' and '#{stemcell_properties.inspect}'")
403
+ creator.create(volume, ebs_volume, image_path).id
353
404
  rescue => e
354
- @logger.error(e)
405
+ logger.error(e)
355
406
  raise e
356
407
  ensure
357
408
  if instance && volume
358
- detach_ebs_volume(instance, volume)
409
+ detach_ebs_volume(instance, volume, true)
359
410
  delete_disk(volume.id)
360
411
  end
361
412
  end
@@ -366,24 +417,8 @@ module Bosh::AwsCloud
366
417
  # @param [String] stemcell_id EC2 AMI name of the stemcell to be deleted
367
418
  def delete_stemcell(stemcell_id)
368
419
  with_thread_name("delete_stemcell(#{stemcell_id})") do
369
- snapshots = []
370
- image = @ec2.images[stemcell_id]
371
-
372
- image.block_device_mappings.each do |device, map|
373
- id = map[:snapshot_id]
374
- if id
375
- @logger.debug("queuing snapshot #{id} for deletion")
376
- snapshots << id
377
- end
378
- end
379
-
380
- image.deregister
381
-
382
- snapshots.each do |id|
383
- @logger.info("cleaning up snapshot #{id}")
384
- snapshot = @ec2.snapshots[id]
385
- snapshot.delete
386
- end
420
+ stemcell = Stemcell.find(region, stemcell_id)
421
+ stemcell.delete
387
422
  end
388
423
  end
389
424
 
@@ -395,17 +430,21 @@ module Bosh::AwsCloud
395
430
  def set_vm_metadata(vm, metadata)
396
431
  instance = @ec2.instances[vm]
397
432
 
398
- # TODO should we clear existing tags that don't exist in metadata?
399
433
  metadata.each_pair do |key, value|
400
- tag(instance, key, value)
434
+ TagManager.tag(instance, key, value)
401
435
  end
402
436
 
403
- # should deployment name be included too?
404
437
  job = metadata[:job]
405
438
  index = metadata[:index]
406
- tag(instance, "Name", "#{job}/#{index}") if job && index
439
+
440
+ if job && index
441
+ name = "#{job}/#{index}"
442
+ elsif metadata[:compiling]
443
+ name = "compiling/#{metadata[:compiling]}"
444
+ end
445
+ TagManager.tag(instance, "Name", name) if name
407
446
  rescue AWS::EC2::Errors::TagLimitExceeded => e
408
- @logger.error("could not tag #{instance.id}: #{e.message}")
447
+ logger.error("could not tag #{instance.id}: #{e.message}")
409
448
  end
410
449
 
411
450
  # @note Not implemented in the AWS CPI
@@ -414,132 +453,79 @@ module Bosh::AwsCloud
414
453
  not_implemented(:validate_deployment)
415
454
  end
416
455
 
417
- # Selects the availability zone to use from a list of disk volumes,
418
- # resource pool availability zone (if any) and the default availability
419
- # zone.
420
- # @param [Hash] volumes volume ids to attach to the vm
421
- # @param [String] resource_pool_az availability zone specified in
422
- # the resource pool (may be nil)
423
- # @return [String] availability zone to use
424
- # @note this is a private method that is public to make it easier to test
425
- def select_availability_zone(volumes, resource_pool_az)
426
- if volumes && !volumes.empty?
427
- disks = volumes.map { |vid| @ec2.volumes[vid] }
428
- ensure_same_availability_zone(disks, resource_pool_az)
429
- disks.first.availability_zone
430
- else
431
- resource_pool_az || default_availability_zone
456
+ def find_ebs_device(sd_name)
457
+ xvd_name = sd_name.gsub(/^\/dev\/sd/, "/dev/xvd")
458
+
459
+ DEVICE_POLL_TIMEOUT.times do
460
+ if File.blockdev?(sd_name)
461
+ return sd_name
462
+ elsif File.blockdev?(xvd_name)
463
+ return xvd_name
464
+ end
465
+ sleep(1)
432
466
  end
433
- end
434
467
 
435
- # ensure all supplied availability zones are the same
436
- # @note this is a private method that is public to make it easier to test
437
- def ensure_same_availability_zone(disks, default)
438
- zones = disks.map { |disk| disk.availability_zone }
439
- zones << default if default
440
- zones.uniq!
441
- cloud_error "can't use multiple availability zones: %s" %
442
- zones.join(", ") unless zones.size == 1 || zones.empty?
468
+ cloud_error("Cannot find EBS volume on current instance")
443
469
  end
444
470
 
445
- private
446
-
447
- # add a tag to something
448
- def tag(taggable, key, value)
449
- trimmed_key = key[0..(MAX_TAG_KEY_LENGTH - 1)]
450
- trimmed_value = value[0..(MAX_TAG_VALUE_LENGTH - 1)]
451
- taggable.add_tag(trimmed_key, :value => trimmed_value)
452
- rescue AWS::EC2::Errors::InvalidParameterValue => e
453
- @logger.error("could not tag #{taggable.id}: #{e.message}")
454
- end
455
471
 
456
- # Prepare EC2 user data
457
- # @param [Hash] network_spec network specification
458
- # @return [Hash] EC2 user data
459
- def user_data(network_spec)
460
- data = {}
472
+ private
461
473
 
462
- data["registry"] = { "endpoint" => @registry.endpoint }
474
+ attr_reader :az_selector
475
+ attr_reader :region
463
476
 
464
- with_dns(network_spec) do |servers|
465
- data["dns"] = { "nameserver" => servers }
466
- end
477
+ def agent_properties
478
+ @agent_properties ||= options.fetch('agent', {})
479
+ end
467
480
 
468
- data
481
+ def aws_properties
482
+ @aws_properties ||= options.fetch('aws')
469
483
  end
470
484
 
471
- # extract dns server list from network spec and yield the the list
472
- # @param [Hash] network_spec network specification for instance
473
- # @yield [Array]
474
- def with_dns(network_spec)
475
- network_spec.each_value do |properties|
476
- if properties["dns"]
477
- yield properties["dns"]
478
- return
479
- end
480
- end
485
+ def aws_region
486
+ @aws_region ||= aws_properties.fetch('region', nil)
481
487
  end
482
488
 
483
- def image_params(cloud_properties, snapshot_id)
484
- root_device_name = cloud_properties["root_device_name"]
485
- architecture = cloud_properties["architecture"]
489
+ def fast_path_delete?
490
+ aws_properties.fetch('fast_path_delete', false)
491
+ end
486
492
 
487
- params = {
488
- :name => "BOSH-#{generate_unique_name}",
489
- :architecture => architecture,
490
- :kernel_id => find_aki(architecture, root_device_name),
491
- :root_device_name => root_device_name,
492
- :block_device_mappings => {
493
- "/dev/sda" => { :snapshot_id => snapshot_id },
494
- "/dev/sdb" => "ephemeral0"
495
- }
493
+ def initialize_aws
494
+ aws_logger = logger
495
+ aws_params = {
496
+ access_key_id: aws_properties['access_key_id'],
497
+ secret_access_key: aws_properties['secret_access_key'],
498
+ ec2_endpoint: aws_properties['ec2_endpoint'] || default_ec2_endpoint,
499
+ elb_endpoint: aws_properties['elb_endpoint'] || default_elb_endpoint,
500
+ max_retries: aws_properties['max_retries'] || DEFAULT_MAX_RETRIES ,
501
+ logger: aws_logger
496
502
  }
497
503
 
498
- # old stemcells doesn't have name & version
499
- if cloud_properties["name"] && cloud_properties["version"]
500
- name = "#{cloud_properties['name']} #{cloud_properties['version']}"
501
- params[:description] = name
502
- end
504
+ aws_params[:proxy_uri] = aws_properties['proxy_uri'] if aws_properties['proxy_uri']
503
505
 
504
- params
505
- end
506
+ # AWS Ruby SDK is threadsafe but Ruby autoload isn't,
507
+ # so we need to trigger eager autoload while constructing CPI
508
+ AWS.eager_autoload!
509
+
510
+ AWS.config(aws_params)
506
511
 
507
- def find_aki(architecture, root_device_name)
508
- @aki_picker.pick(architecture, root_device_name)
512
+ @ec2 = AWS::EC2.new
513
+ @region = @ec2.regions[aws_region]
514
+ @az_selector = AvailabilityZoneSelector.new(@region, aws_properties['default_availability_zone'])
509
515
  end
510
516
 
511
- ##
512
- # Generates initial agent settings. These settings will be read by agent
513
- # from AWS registry (also a BOSH component) on a target instance. Disk
514
- # conventions for amazon are:
515
- # system disk: /dev/sda
516
- # ephemeral disk: /dev/sdb
517
- # EBS volumes can be configured to map to other device names later (sdf
518
- # through sdp, also some kernels will remap sd* to xvd*).
519
- #
520
- # @param [String] agent_id Agent id (will be picked up by agent to
521
- # assume its identity
522
- # @param [Hash] network_spec Agent network spec
523
- # @param [Hash] environment
524
- # @param [String] root_device_name root device, e.g. /dev/sda1
525
- # @return [Hash]
526
- def initial_agent_settings(agent_id, network_spec, environment,
527
- root_device_name)
528
- settings = {
529
- "vm" => {
530
- "name" => "vm-#{generate_unique_name}"
531
- },
532
- "agent_id" => agent_id,
533
- "networks" => network_spec,
534
- "disks" => {
535
- "system" => root_device_name,
536
- "ephemeral" => "/dev/sdb",
537
- "persistent" => {}
538
- }
539
- }
517
+ def initialize_registry
518
+ registry_properties = options.fetch('registry')
519
+ registry_endpoint = registry_properties.fetch('endpoint')
520
+ registry_user = registry_properties.fetch('user')
521
+ registry_password = registry_properties.fetch('password')
540
522
 
541
- settings["env"] = environment if environment
542
- settings.merge(@agent_properties)
523
+ # Registry updates are not really atomic in relation to
524
+ # EC2 API calls, so they might get out of sync. Cloudcheck
525
+ # is supposed to fix that.
526
+ @registry = Bosh::Registry::Client.new(registry_endpoint,
527
+ registry_user,
528
+ registry_password)
543
529
  end
544
530
 
545
531
  def update_agent_settings(instance)
@@ -547,80 +533,48 @@ module Bosh::AwsCloud
547
533
  raise ArgumentError, "block is not provided"
548
534
  end
549
535
 
550
- settings = @registry.read_settings(instance.id)
536
+ settings = registry.read_settings(instance.id)
551
537
  yield settings
552
- @registry.update_settings(instance.id, settings)
538
+ registry.update_settings(instance.id, settings)
553
539
  end
554
540
 
555
- def generate_unique_name
556
- UUIDTools::UUID.random_create.to_s
557
- end
558
-
559
- ##
560
- # Reads current instance id from EC2 metadata. We are assuming
561
- # instance id cannot change while current process is running
562
- # and thus memoizing it.
563
- def current_instance_id
564
- @metadata_lock.synchronize do
565
- return @current_instance_id if @current_instance_id
566
-
567
- client = HTTPClient.new
568
- client.connect_timeout = METADATA_TIMEOUT
569
- # Using 169.254.169.254 is an EC2 convention for getting
570
- # instance metadata
571
- uri = "http://169.254.169.254/1.0/meta-data/instance-id/"
541
+ def attach_ebs_volume(instance, volume)
542
+ device_name = select_device_name(instance)
543
+ cloud_error('Instance has too many disks attached') unless device_name
544
+
545
+ # Work around AWS eventual (in)consistency:
546
+ # even though we don't call attach_disk until the disk is ready,
547
+ # AWS might still lie and say that the disk isn't ready yet, so
548
+ # we try again just to be really sure it is telling the truth
549
+ attachment = nil
550
+ Bosh::Common.retryable(tries: 15, on: AWS::EC2::Errors::IncorrectState) do
551
+ attachment = volume.attach_to(instance, device_name)
552
+ end
572
553
 
573
- response = client.get(uri)
574
- unless response.status == 200
575
- cloud_error("Instance metadata endpoint returned " \
576
- "HTTP #{response.status}")
577
- end
554
+ logger.info("Attaching '#{volume.id}' to '#{instance.id}' as '#{device_name}'")
555
+ ResourceWait.for_attachment(attachment: attachment, state: :attached)
578
556
 
579
- @current_instance_id = response.body
580
- end
557
+ device_name = attachment.device
558
+ logger.info("Attached '#{volume.id}' to '#{instance.id}' as '#{device_name}'")
581
559
 
582
- rescue HTTPClient::TimeoutError
583
- cloud_error("Timed out reading instance metadata, " \
584
- "please make sure CPI is running on EC2 instance")
560
+ device_name
585
561
  end
586
562
 
587
- def attach_ebs_volume(instance, volume)
588
- # TODO once we upgrade the aws-sdk gem to > 1.3.9, we need to use:
589
- # instance.block_device_mappings.to_hash.keys
563
+ def select_device_name(instance)
590
564
  device_names = Set.new(instance.block_device_mappings.to_hash.keys)
591
- new_attachment = nil
592
-
593
- ("f".."p").each do |char| # f..p is what console suggests
594
- # Some kernels will remap sdX to xvdX, so agent needs
595
- # to lookup both (sd, then xvd)
596
- dev_name = "/dev/sd#{char}"
597
- if device_names.include?(dev_name)
598
- @logger.warn("`#{dev_name}' on `#{instance.id}' is taken")
599
- next
600
- end
601
- new_attachment = volume.attach_to(instance, dev_name)
602
- break
603
- end
604
565
 
605
- if new_attachment.nil?
606
- # TODO: better messaging?
607
- cloud_error("Instance has too many disks attached")
566
+ ('f'..'p').each do |char| # f..p is what console suggests
567
+ # Some kernels will remap sdX to xvdX, so agent needs
568
+ # to look up both (sd, then xvd)
569
+ device_name = "/dev/sd#{char}"
570
+ return device_name unless device_names.include?(device_name)
571
+ logger.warn("'#{device_name}' on '#{instance.id}' is taken")
608
572
  end
609
573
 
610
- @logger.info("Attaching `#{volume.id}' to `#{instance.id}'")
611
- wait_resource(new_attachment, :attached)
612
-
613
- device_name = new_attachment.device
614
-
615
- @logger.info("Attached `#{volume.id}' to `#{instance.id}', " \
616
- "device name is `#{device_name}'")
617
-
618
- device_name
574
+ nil
619
575
  end
620
576
 
621
- def detach_ebs_volume(instance, volume)
622
- # TODO once we upgrade the aws-sdk gem to > 1.3.9, we need to use:
623
- # instance.block_device_mappings.to_hash.keys
577
+ def detach_ebs_volume(instance, volume, force=false)
624
578
  mappings = instance.block_device_mappings.to_hash
625
579
 
626
580
  device_map = mappings.inject({}) do |hash, (device_name, attachment)|
@@ -629,98 +583,14 @@ module Bosh::AwsCloud
629
583
  end
630
584
 
631
585
  if device_map[volume.id].nil?
632
- cloud_error("Disk `#{volume.id}' is not attached " \
633
- "to instance `#{instance.id}'")
586
+ raise Bosh::Clouds::DiskNotAttached.new(true),
587
+ "Disk `#{volume.id}' is not attached to instance `#{instance.id}'"
634
588
  end
635
589
 
636
- attachment = volume.detach_from(instance, device_map[volume.id])
637
- @logger.info("Detaching `#{volume.id}' from `#{instance.id}'")
590
+ attachment = volume.detach_from(instance, device_map[volume.id], force: force)
591
+ logger.info("Detaching `#{volume.id}' from `#{instance.id}'")
638
592
 
639
- wait_resource(attachment, :detached) do |error|
640
- if error.is_a? AWS::Core::Resource::NotFound
641
- @logger.info("attachment is no longer found, assuming it to be detached")
642
- :detached
643
- end
644
- end
645
- end
646
-
647
- # This method tries to execute the helper script stemcell-copy
648
- # as root using sudo, since it needs to write to the ebs_volume.
649
- # If stemcell-copy isn't available, it falls back to writing directly
650
- # to the device, which is used in the micro bosh deployer.
651
- # The stemcell-copy script must be in the PATH of the user running
652
- # the director, and needs sudo privileges to execute without
653
- # password.
654
- def copy_root_image(image_path, ebs_volume)
655
- path = ENV["PATH"]
656
-
657
- if stemcell_copy = has_stemcell_copy(path)
658
- @logger.debug("copying stemcell using stemcell-copy script")
659
- # note that is is a potentially dangerous operation, but as the
660
- # stemcell-copy script sets PATH to a sane value this is safe
661
- out = `sudo #{stemcell_copy} #{image_path} #{ebs_volume} 2>&1`
662
- else
663
- @logger.info("falling back to using dd to copy stemcell")
664
- out = `tar -xzf #{image_path} -O root.img | dd of=#{ebs_volume} 2>&1`
665
- end
666
-
667
- unless $?.exitstatus == 0
668
- cloud_error("Unable to copy stemcell root image, " \
669
- "exit status #{$?.exitstatus}: #{out}")
670
- end
671
-
672
- @logger.debug("stemcell copy output:\n#{out}")
673
- end
674
-
675
- # checks if the stemcell-copy script can be found in
676
- # the current PATH
677
- def has_stemcell_copy(path)
678
- path.split(":").each do |dir|
679
- stemcell_copy = File.join(dir, "stemcell-copy")
680
- return stemcell_copy if File.exist?(stemcell_copy)
681
- end
682
- nil
683
- end
684
-
685
- def find_ebs_device(sd_name)
686
- xvd_name = sd_name.gsub(/^\/dev\/sd/, "/dev/xvd")
687
-
688
- DEVICE_POLL_TIMEOUT.times do
689
- if File.blockdev?(sd_name)
690
- return sd_name
691
- elsif File.blockdev?(xvd_name)
692
- return xvd_name
693
- end
694
- sleep(1)
695
- end
696
-
697
- cloud_error("Cannot find EBS volume on current instance")
698
- end
699
-
700
- ##
701
- # Soft reboots EC2 instance
702
- # @param [AWS::EC2::Instance] instance EC2 instance
703
- def soft_reboot(instance)
704
- # There is no trackable status change for the instance being
705
- # rebooted, so it's up to CPI client to keep track of agent
706
- # being ready after reboot.
707
- instance.reboot
708
- end
709
-
710
- ##
711
- # Hard reboots EC2 instance
712
- # @param [AWS::EC2::Instance] instance EC2 instance
713
- def hard_reboot(instance)
714
- # N.B. This will only work with ebs-store instances,
715
- # as instance-store instances don't support stop/start.
716
- instance.stop
717
-
718
- @logger.info("Stopping instance `#{instance.id}'")
719
- wait_resource(instance, :stopped)
720
-
721
- instance.start
722
- @logger.info("Starting instance `#{instance.id}'")
723
- wait_resource(instance, :running)
593
+ ResourceWait.for_attachment(attachment: attachment, state: :detached)
724
594
  end
725
595
 
726
596
  ##
@@ -728,41 +598,54 @@ module Bosh::AwsCloud
728
598
  # be used to create all required data structures etc.
729
599
  #
730
600
  def validate_options
731
- unless @options.has_key?("aws") &&
732
- @options["aws"].is_a?(Hash) &&
733
- @options["aws"]["access_key_id"] &&
734
- @options["aws"]["secret_access_key"]
735
- raise ArgumentError, "Invalid AWS configuration parameters"
736
- end
601
+ required_keys = {
602
+ "aws" => ["access_key_id", "secret_access_key", "region", "default_key_name"],
603
+ "registry" => ["endpoint", "user", "password"],
604
+ }
737
605
 
738
- unless @options.has_key?("registry") &&
739
- @options["registry"].is_a?(Hash) &&
740
- @options["registry"]["endpoint"] &&
741
- @options["registry"]["user"] &&
742
- @options["registry"]["password"]
743
- raise ArgumentError, "Invalid registry configuration parameters"
744
- end
745
- end
606
+ missing_keys = []
746
607
 
747
- def default_ec2_endpoint
748
- if @aws_region
749
- "ec2.#{@aws_region}.amazonaws.com"
750
- else
751
- DEFAULT_EC2_ENDPOINT
608
+ required_keys.each_pair do |key, values|
609
+ values.each do |value|
610
+ if (!options.has_key?(key) || !options[key].has_key?(value))
611
+ missing_keys << "#{key}:#{value}"
612
+ end
613
+ end
752
614
  end
753
- end
754
615
 
755
- def default_availability_zone
756
- if @aws_region
757
- "#{@aws_region}b"
758
- else
759
- DEFAULT_AVAILABILITY_ZONE
760
- end
616
+ raise ArgumentError, "missing configuration parameters > #{missing_keys.join(', ')}" unless missing_keys.empty?
761
617
  end
762
618
 
763
- def task_checkpoint
764
- Bosh::Clouds::Config.task_checkpoint
619
+ # Generates initial agent settings. These settings will be read by agent
620
+ # from AWS registry (also a BOSH component) on a target instance. Disk
621
+ # conventions for amazon are:
622
+ # system disk: /dev/sda
623
+ # ephemeral disk: /dev/sdb
624
+ # EBS volumes can be configured to map to other device names later (sdf
625
+ # through sdp, also some kernels will remap sd* to xvd*).
626
+ #
627
+ # @param [String] agent_id Agent id (will be picked up by agent to
628
+ # assume its identity)
629
+ # @param [Hash] network_spec Agent network spec
630
+ # @param [Hash] environment
631
+ # @param [String] root_device_name root device, e.g. /dev/sda1
632
+ # @return [Hash]
633
+ def initial_agent_settings(agent_id, network_spec, environment, root_device_name)
634
+ settings = {
635
+ "vm" => {
636
+ "name" => "vm-#{SecureRandom.uuid}"
637
+ },
638
+ "agent_id" => agent_id,
639
+ "networks" => network_spec,
640
+ "disks" => {
641
+ "system" => root_device_name,
642
+ "ephemeral" => "/dev/sdb",
643
+ "persistent" => {}
644
+ }
645
+ }
646
+
647
+ settings["env"] = environment if environment
648
+ settings.merge(agent_properties)
765
649
  end
766
650
  end
767
-
768
651
  end