RubyGems - bosh_aws_cpi - Versions diffs - 0.7.0 → 1.5.0.pre.1113 - Mend

bosh_aws_cpi 0.7.0 → 1.5.0.pre.1113

Files changed (41) hide show

data/README.md +22 -19
data/bin/bosh_aws_console +1 -13
data/lib/bosh_aws_cpi.rb +1 -1
data/lib/cloud/aws/aki_picker.rb +7 -7
data/lib/cloud/aws/availability_zone_selector.rb +40 -0
data/lib/cloud/aws/cloud.rb +359 -476
data/lib/cloud/aws/dynamic_network.rb +0 -6
data/lib/cloud/aws/helpers.rb +10 -68
data/lib/cloud/aws/instance_manager.rb +171 -0
data/lib/cloud/aws/manual_network.rb +26 -0
data/lib/cloud/aws/network_configurator.rb +33 -62
data/lib/cloud/aws/resource_wait.rb +189 -0
data/lib/cloud/aws/stemcell.rb +68 -0
data/lib/cloud/aws/stemcell_creator.rb +114 -0
data/lib/cloud/aws/tag_manager.rb +30 -0
data/lib/cloud/aws/version.rb +1 -1
data/lib/cloud/aws/vip_network.rb +9 -7
data/lib/cloud/aws.rb +11 -2
data/scripts/stemcell-copy.sh +37 -0
metadata +45 -81
data/Rakefile +0 -50
data/lib/cloud/aws/registry_client.rb +0 -109
data/spec/assets/stemcell-copy +0 -31
data/spec/integration/cpi_test.rb +0 -78
data/spec/spec_helper.rb +0 -121
data/spec/unit/aki_picker_spec.rb +0 -29
data/spec/unit/attach_disk_spec.rb +0 -143
data/spec/unit/cloud_spec.rb +0 -32
data/spec/unit/configure_networks_spec.rb +0 -113
data/spec/unit/create_disk_spec.rb +0 -73
data/spec/unit/create_stemcell_spec.rb +0 -113
data/spec/unit/create_vm_spec.rb +0 -249
data/spec/unit/delete_disk_spec.rb +0 -34
data/spec/unit/delete_stemcell_spec.rb +0 -29
data/spec/unit/delete_vm_spec.rb +0 -25
data/spec/unit/detach_disk_spec.rb +0 -63
data/spec/unit/helpers_spec.rb +0 -64
data/spec/unit/network_configurator_spec.rb +0 -57
data/spec/unit/reboot_vm_spec.rb +0 -38
data/spec/unit/set_vm_metadata_spec.rb +0 -30
data/spec/unit/validate_deployment_spec.rb +0 -16

data/lib/cloud/aws/cloud.rb CHANGED Viewed

@@ -7,16 +7,12 @@ module Bosh::AwsCloud
     # default maximum number of times to retry an AWS API call
     DEFAULT_MAX_RETRIES = 2
-    # default availability zone for instances and disks
-    DEFAULT_AVAILABILITY_ZONE = "us-east-1a"
-    DEFAULT_EC2_ENDPOINT = "ec2.amazonaws.com"
-    METADATA_TIMEOUT = 5 # in seconds
+    METADATA_TIMEOUT    = 5 # in seconds
     DEVICE_POLL_TIMEOUT = 60 # in seconds
-    MAX_TAG_KEY_LENGTH = 127
-    MAX_TAG_VALUE_LENGTH = 255
-    attr_reader :ec2
-    attr_reader :registry
+    attr_reader   :ec2
+    attr_reader   :registry
+    attr_reader   :options
     attr_accessor :logger
     ##
@@ -26,48 +22,43 @@ module Bosh::AwsCloud
     # @option options [Hash] agent agent options
     # @option options [Hash] registry agent options
     def initialize(options)
-      @options = options.dup
+      @options = options.dup.freeze
       validate_options
       @logger = Bosh::Clouds::Config.logger
-      @aws_logger = @logger # TODO make configurable
-      @agent_properties = @options["agent"] || {}
-      @aws_properties = @options["aws"]
-      @aws_region = @aws_properties.delete("region")
-      @registry_properties = @options["registry"]
+      initialize_aws
+      initialize_registry
-      @default_key_name = @aws_properties["default_key_name"]
-      @default_security_groups = @aws_properties["default_security_groups"]
+      @metadata_lock = Mutex.new
+    end
-      aws_params = {
-        :access_key_id => @aws_properties["access_key_id"],
-        :secret_access_key => @aws_properties["secret_access_key"],
-        :ec2_endpoint => @aws_properties["ec2_endpoint"] || default_ec2_endpoint,
-        :max_retries => @aws_properties["max_retries"] || DEFAULT_MAX_RETRIES,
-        :logger => @aws_logger
-      }
+    ##
+    # Reads current instance id from EC2 metadata. We are assuming
+    # instance id cannot change while current process is running
+    # and thus memoizing it.
+    def current_vm_id
+      @metadata_lock.synchronize do
+        return @current_vm_id if @current_vm_id
-      registry_endpoint = @registry_properties["endpoint"]
-      registry_user = @registry_properties["user"]
-      registry_password = @registry_properties["password"]
+        client = HTTPClient.new
+        client.connect_timeout = METADATA_TIMEOUT
+        # Using 169.254.169.254 is an EC2 convention for getting
+        # instance metadata
+        uri = "http://169.254.169.254/latest/meta-data/instance-id/"
-      # AWS Ruby SDK is threadsafe but Ruby autoload isn't,
-      # so we need to trigger eager autoload while constructing CPI
-      AWS.eager_autoload!
-      @ec2 = AWS::EC2.new(aws_params)
+        response = client.get(uri)
+        unless response.status == 200
+          cloud_error("Instance metadata endpoint returned " \
+                      "HTTP #{response.status}")
+        end
-      # Registry updates are not really atomic in relation to
-      # EC2 API calls, so they might get out of sync. Cloudcheck
-      # is supposed to fix that.
-      @registry = RegistryClient.new(registry_endpoint,
-                                     registry_user,
-                                     registry_password)
+        @current_vm_id = response.body
+      end
-      @aki_picker = AKIPicker.new(@ec2)
-      @metadata_lock = Mutex.new
+    rescue HTTPClient::TimeoutError
+      cloud_error("Timed out reading instance metadata, " \
+                  "please make sure CPI is running on EC2 instance")
     end
     ##
@@ -86,49 +77,43 @@ module Bosh::AwsCloud
     # @param [optional, Hash] environment data to be merged into
     #   agent settings
     # @return [String] EC2 instance id of the new virtual machine
-    def create_vm(agent_id, stemcell_id, resource_pool,
-                  network_spec, disk_locality = nil, environment = nil)
+    def create_vm(agent_id, stemcell_id, resource_pool, network_spec, disk_locality = nil, environment = nil)
       with_thread_name("create_vm(#{agent_id}, ...)") do
-        network_configurator = NetworkConfigurator.new(network_spec)
-        security_groups =
-          network_configurator.security_groups(@default_security_groups)
-        @logger.debug("using security groups: #{security_groups.join(', ')}")
+        # do this early to fail fast
+        stemcell = Stemcell.find(region, stemcell_id)
-        response = @ec2.client.describe_images(:image_ids => [stemcell_id])
-        images_set = response.images_set
-        if images_set.empty?
-          cloud_error("no stemcell info for #{stemcell_id}")
+        begin
+          instance_manager = InstanceManager.new(region, registry, az_selector)
+          instance = instance_manager.
+              create(agent_id, stemcell_id, resource_pool, network_spec, (disk_locality || []), environment, options)
+          logger.info("Creating new instance '#{instance.id}'")
+          NetworkConfigurator.new(network_spec).configure(region, instance)
+          registry_settings = initial_agent_settings(
+              agent_id,
+              network_spec,
+              environment,
+              stemcell.root_device_name,
+          )
+          registry.update_settings(instance.id, registry_settings)
+          instance.id
+        rescue => e # is this rescuing too much?
+          logger.error(%Q[Failed to create instance: #{e.message}\n#{e.backtrace.join("\n")}])
+          instance_manager.terminate(instance.id, fast_path_delete?) if instance
+          raise e
         end
-        root_device_name = images_set.first.root_device_name
-        instance_params = {
-          :image_id => stemcell_id,
-          :count => 1,
-          :key_name => resource_pool["key_name"] || @default_key_name,
-          :security_groups => security_groups,
-          :instance_type => resource_pool["instance_type"],
-          :user_data => Yajl::Encoder.encode(user_data(network_spec))
-        }
-        instance_params[:availability_zone] =
-          select_availability_zone(disk_locality,
-          resource_pool["availability_zone"])
-        @logger.info("Creating new instance...")
-        instance = @ec2.instances.create(instance_params)
-        @logger.info("Creating new instance `#{instance.id}'")
-        wait_resource(instance, :running)
-        network_configurator.configure(@ec2, instance)
+      end
+    end
-        settings = initial_agent_settings(agent_id, network_spec, environment,
-                                          root_device_name)
-        @registry.update_settings(instance.id, settings)
+    def default_ec2_endpoint
+      ['ec2', aws_region, 'amazonaws.com'].compact.join('.')
+    end
-        instance.id
-      end
+    def default_elb_endpoint
+      ['elasticloadbalancing', aws_region, 'amazonaws.com'].compact.join('.')
     end
     ##
@@ -137,20 +122,8 @@ module Bosh::AwsCloud
     # @param [String] instance_id EC2 instance id
     def delete_vm(instance_id)
       with_thread_name("delete_vm(#{instance_id})") do
-        instance = @ec2.instances[instance_id]
-        instance.terminate
-        begin
-          # TODO: should this be done before or after deleting VM?
-          @logger.info("Deleting instance settings for `#{instance.id}'")
-          @registry.delete_settings(instance.id)
-          @logger.info("Deleting instance `#{instance.id}'")
-          wait_resource(instance, :terminated)
-        rescue AWS::EC2::Errors::InvalidInstanceID::NotFound
-          # It's OK, just means that instance has already been deleted
-        end
+        logger.info("Deleting instance '#{instance_id}'")
+        InstanceManager.new(region, registry).terminate(instance_id, fast_path_delete?)
       end
     end
@@ -159,8 +132,16 @@ module Bosh::AwsCloud
     # @param [String] instance_id EC2 instance id
     def reboot_vm(instance_id)
       with_thread_name("reboot_vm(#{instance_id})") do
-        instance = @ec2.instances[instance_id]
-        soft_reboot(instance)
+        InstanceManager.new(region, registry).reboot(instance_id)
+      end
+    end
+    ##
+    # Checks whether an EC2 instance exists
+    # @param [String] instance_id EC2 instance id
+    def has_vm?(instance_id)
+      with_thread_name("has_vm?(#{instance_id})") do
+        InstanceManager.new(region, registry).has_instance?(instance_id)
       end
     end
@@ -172,40 +153,26 @@ module Bosh::AwsCloud
     # @return [String] created EBS volume id
     def create_disk(size, instance_id = nil)
       with_thread_name("create_disk(#{size}, #{instance_id})") do
-        unless size.kind_of?(Integer)
-          raise ArgumentError, "disk size needs to be an integer"
-        end
+        validate_disk_size(size)
-        if size < 1024
-          cloud_error("AWS CPI minimum disk size is 1 GiB")
-        end
+        # if the disk is created for an instance, use the same availability zone as they must match
+        volume = @ec2.volumes.create(:size => (size / 1024.0).ceil,
+                                     :availability_zone => @az_selector.select_availability_zone(instance_id))
-        if size > 1024 * 1000
-          cloud_error("AWS CPI maximum disk size is 1 TiB")
-        end
-        # if the disk is created for an instance, use the same availability
-        # zone as they must match
-        if instance_id
-          instance = @ec2.instances[instance_id]
-          availability_zone = instance.availability_zone
-        else
-          availability_zone = default_availability_zone
-        end
-        volume_params = {
-          :size => (size / 1024.0).ceil,
-          :availability_zone => availability_zone
-        }
-        volume = @ec2.volumes.create(volume_params)
-        @logger.info("Creating volume `#{volume.id}'")
-        wait_resource(volume, :available)
+        logger.info("Creating volume '#{volume.id}'")
+        ResourceWait.for_volume(volume: volume, state: :available)
         volume.id
       end
     end
+    def validate_disk_size(size)
+      raise ArgumentError, "disk size needs to be an integer" unless size.kind_of?(Integer)
+      cloud_error("AWS CPI minimum disk size is 1 GiB") if size < 1024
+      cloud_error("AWS CPI maximum disk size is 1 TiB") if size > 1024 * 1000
+    end
     ##
     # Delete EBS volume
     # @param [String] disk_id EBS volume id
@@ -213,22 +180,35 @@ module Bosh::AwsCloud
     def delete_disk(disk_id)
       with_thread_name("delete_disk(#{disk_id})") do
         volume = @ec2.volumes[disk_id]
-        state = volume.state
-        if state != :available
-          cloud_error("Cannot delete volume `#{volume.id}', state is #{state}")
+        logger.info("Deleting volume `#{volume.id}'")
+        tries = 10
+        sleep_cb = ResourceWait.sleep_callback("Waiting for volume `#{volume.id}' to be deleted", tries)
+        ensure_cb = Proc.new do |retries|
+          cloud_error("Timed out waiting to delete volume `#{volume.id}'") if retries == tries
         end
+        error = AWS::EC2::Errors::Client::VolumeInUse
-        volume.delete
+        Bosh::Common.retryable(tries: tries, sleep: sleep_cb, on: error, ensure: ensure_cb) do
+          volume.delete
+          true # return true to only retry on Exceptions
+        end
-        begin
-          @logger.info("Deleting volume `#{volume.id}'")
-          wait_resource(volume, :deleted)
-        rescue AWS::EC2::Errors::InvalidVolume::NotFound
-          # It's OK, just means the volume has already been deleted
+        if fast_path_delete?
+          begin
+            TagManager.tag(volume, "Name", "to be deleted")
+            logger.info("Volume `#{disk_id}' has been marked for deletion")
+          rescue AWS::EC2::Errors::InvalidVolume::NotFound
+            # Once in a blue moon AWS is actually fast enough that the volume is already gone
+            # when we get here, and if it is, our work here is done!
+          end
+          return
         end
-        @logger.info("Volume `#{disk_id}' has been deleted")
+        ResourceWait.for_volume(volume: volume, state: :deleted)
+        logger.info("Volume `#{disk_id}' has been deleted")
       end
     end
@@ -247,7 +227,7 @@ module Bosh::AwsCloud
           settings["disks"]["persistent"] ||= {}
           settings["disks"]["persistent"][disk_id] = device_name
         end
-        @logger.info("Attached `#{disk_id}' to `#{instance_id}'")
+        logger.info("Attached `#{disk_id}' to `#{instance_id}'")
       end
     end
@@ -267,33 +247,76 @@ module Bosh::AwsCloud
         detach_ebs_volume(instance, volume)
-        @logger.info("Detached `#{disk_id}' from `#{instance_id}'")
+        logger.info("Detached `#{disk_id}' from `#{instance_id}'")
+      end
+    end
+    def get_disks(vm_id)
+      disks = []
+      @ec2.instances[vm_id].block_devices.each do |block_device|
+        if block_device[:ebs]
+          disks << block_device[:ebs][:volume_id]
+        end
+      end
+      disks
+    end
+    # Take snapshot of disk
+    # @param [String] disk_id disk id of the disk to take the snapshot of
+    # @return [String] snapshot id
+    def snapshot_disk(disk_id, metadata)
+      with_thread_name("snapshot_disk(#{disk_id})") do
+        volume = @ec2.volumes[disk_id]
+        devices = []
+        volume.attachments.each {|attachment| devices << attachment.device}
+        name = [:deployment, :job, :index].collect { |key| metadata[key] }
+        name << devices.first.split('/').last unless devices.empty?
+        snapshot = volume.create_snapshot(name.join('/'))
+        logger.info("snapshot '#{snapshot.id}' of volume '#{disk_id}' created")
+        [:agent_id, :instance_id, :director_name, :director_uuid].each do |key|
+          TagManager.tag(snapshot, key, metadata[key])
+        end
+        TagManager.tag(snapshot, :device, devices.first) unless devices.empty?
+        TagManager.tag(snapshot, 'Name', name.join('/'))
+        ResourceWait.for_snapshot(snapshot: snapshot, state: :completed)
+        snapshot.id
+      end
+    end
+    # Delete a disk snapshot
+    # @param [String] snapshot_id snapshot id to delete
+    def delete_snapshot(snapshot_id)
+      with_thread_name("delete_snapshot(#{snapshot_id})") do
+        snapshot = @ec2.snapshots[snapshot_id]
+        if snapshot.status == :in_use
+          raise Bosh::Clouds::CloudError, "snapshot '#{snapshot.id}' can not be deleted as it is in use"
+        end
+        snapshot.delete
+        logger.info("snapshot '#{snapshot_id}' deleted")
       end
     end
     # Configure network for an EC2 instance
     # @param [String] instance_id EC2 instance id
     # @param [Hash] network_spec network properties
-    # @raise [Bosh::Clouds:NotSupported] if the security groups change
+    # @raise [Bosh::Clouds::NotSupported] if there's a network change that requires the recreation of the VM
     def configure_networks(instance_id, network_spec)
       with_thread_name("configure_networks(#{instance_id}, ...)") do
-        @logger.info("Configuring `#{instance_id}' to use the following " \
-                     "network settings: #{network_spec.pretty_inspect}")
+        logger.info("Configuring '#{instance_id}' to use new network settings: #{network_spec.pretty_inspect}")
-        network_configurator = NetworkConfigurator.new(network_spec)
         instance = @ec2.instances[instance_id]
-        actual = instance.security_groups.collect {|sg| sg.name }.sort
-        new = network_configurator.security_groups(@default_security_groups)
+        network_configurator = NetworkConfigurator.new(network_spec)
+        compare_security_groups(instance, network_spec)
-        # If the security groups change, we need to recreate the VM
-        # as you can't change the security group of a running instance,
-        # we need to send the InstanceUpdater a request to do it for us
-        unless actual == new
-          raise Bosh::Clouds::NotSupported,
-                "security groups change requires VM recreation: %s to %s" %
-                [actual.join(", "), new.join(", ")]
-        end
+        compare_private_ip_addresses(instance, network_configurator.private_ip)
         network_configurator.configure(@ec2, instance)
@@ -303,6 +326,43 @@ module Bosh::AwsCloud
       end
     end
+    # If the security groups change, we need to recreate the VM
+    # as you can't change the security group of a running instance,
+    # we need to send the InstanceUpdater a request to do it for us
+    def compare_security_groups(instance, network_spec)
+      actual_group_names = instance.security_groups.collect { |sg| sg.name }
+      specified_group_names = extract_security_group_names(network_spec)
+      if specified_group_names.empty?
+        new_group_names = Array(aws_properties["default_security_groups"])
+      else
+        new_group_names = specified_group_names
+      end
+      unless actual_group_names.sort == new_group_names.sort
+        raise Bosh::Clouds::NotSupported,
+              "security groups change requires VM recreation: %s to %s" %
+                  [actual_group_names.join(", "), new_group_names.join(", ")]
+      end
+    end
+    ##
+    # Compares the instance's actual private IP address with the IP address specified in the network spec
+    #
+    # @param [AWS::EC2::Instance] instance EC2 instance
+    # @param [String] specified_ip_address IP address specified in the network spec (if Manual Network)
+    # @return [void]
+    # @raise [Bosh::Clouds::NotSupported] If the IP address changes, we need to recreate the VM as you can't
+    # change the IP address of a running server, so we need to send the InstanceUpdater a request to do it for us
+    def compare_private_ip_addresses(instance, specified_ip_address)
+      actual_ip_address = instance.private_ip_address
+      unless specified_ip_address.nil? || actual_ip_address == specified_ip_address
+        raise Bosh::Clouds::NotSupported,
+              "IP address change requires VM recreation: %s to %s" %
+              [actual_ip_address, specified_ip_address]
+      end
+    end
     ##
     # Creates a new EC2 AMI using stemcell image.
     # This method can only be run on an EC2 instance, as image creation
@@ -319,43 +379,34 @@ module Bosh::AwsCloud
     # @option cloud_properties [String] disk (2048)
     #   root disk size
     # @return [String] EC2 AMI name of the stemcell
-    def create_stemcell(image_path, cloud_properties)
-      # TODO: refactor into several smaller methods
+    def create_stemcell(image_path, stemcell_properties)
       with_thread_name("create_stemcell(#{image_path}...)") do
+        creator = StemcellCreator.new(region, stemcell_properties)
+        return creator.fake.id if creator.fake?
         begin
           # These three variables are used in 'ensure' clause
           instance = nil
           volume = nil
           # 1. Create and mount new EBS volume (2GB default)
-          disk_size = cloud_properties["disk"] || 2048
-          volume_id = create_disk(disk_size, current_instance_id)
+          disk_size = stemcell_properties["disk"] || 2048
+          volume_id = create_disk(disk_size, current_vm_id)
           volume = @ec2.volumes[volume_id]
-          instance = @ec2.instances[current_instance_id]
+          instance = @ec2.instances[current_vm_id]
           sd_name = attach_ebs_volume(instance, volume)
           ebs_volume = find_ebs_device(sd_name)
-          # 2. Copy image to new EBS volume
-          @logger.info("Copying stemcell disk image to '#{ebs_volume}'")
-          copy_root_image(image_path, ebs_volume)
-          # 3. Create snapshot and then an image using this snapshot
-          snapshot = volume.create_snapshot
-          wait_resource(snapshot, :completed)
-          params = image_params(cloud_properties, snapshot.id)
-          image = @ec2.images.create(params)
-          wait_resource(image, :available, :state)
-          tag(image, "Name", params[:description]) if params[:description]
-          image.id
+          logger.info("Creating stemcell with: '#{volume.id}' and '#{stemcell_properties.inspect}'")
+          creator.create(volume, ebs_volume, image_path).id
         rescue => e
-          @logger.error(e)
+          logger.error(e)
           raise e
         ensure
           if instance && volume
-            detach_ebs_volume(instance, volume)
+            detach_ebs_volume(instance, volume, true)
             delete_disk(volume.id)
           end
         end
@@ -366,24 +417,8 @@ module Bosh::AwsCloud
     # @param [String] stemcell_id EC2 AMI name of the stemcell to be deleted
     def delete_stemcell(stemcell_id)
       with_thread_name("delete_stemcell(#{stemcell_id})") do
-        snapshots = []
-        image = @ec2.images[stemcell_id]
-        image.block_device_mappings.each do |device, map|
-          id = map[:snapshot_id]
-          if id
-            @logger.debug("queuing snapshot #{id} for deletion")
-            snapshots << id
-          end
-        end
-        image.deregister
-        snapshots.each do |id|
-          @logger.info("cleaning up snapshot #{id}")
-          snapshot = @ec2.snapshots[id]
-          snapshot.delete
-        end
+        stemcell = Stemcell.find(region, stemcell_id)
+        stemcell.delete
       end
     end
@@ -395,17 +430,21 @@ module Bosh::AwsCloud
     def set_vm_metadata(vm, metadata)
       instance = @ec2.instances[vm]
-      # TODO should we clear existing tags that don't exist in metadata?
       metadata.each_pair do |key, value|
-        tag(instance, key, value)
+        TagManager.tag(instance, key, value)
       end
-      # should deployment name be included too?
       job = metadata[:job]
       index = metadata[:index]
-      tag(instance, "Name", "#{job}/#{index}") if job && index
+      if job && index
+        name = "#{job}/#{index}"
+      elsif metadata[:compiling]
+        name = "compiling/#{metadata[:compiling]}"
+      end
+      TagManager.tag(instance, "Name", name) if name
     rescue AWS::EC2::Errors::TagLimitExceeded => e
-      @logger.error("could not tag #{instance.id}: #{e.message}")
+      logger.error("could not tag #{instance.id}: #{e.message}")
     end
     # @note Not implemented in the AWS CPI
@@ -414,132 +453,79 @@ module Bosh::AwsCloud
       not_implemented(:validate_deployment)
     end
-    # Selects the availability zone to use from a list of disk volumes,
-    # resource pool availability zone (if any) and the default availability
-    # zone.
-    # @param [Hash] volumes volume ids to attach to the vm
-    # @param [String] resource_pool_az availability zone specified in
-    #   the resource pool (may be nil)
-    # @return [String] availability zone to use
-    # @note this is a private method that is public to make it easier to test
-    def select_availability_zone(volumes, resource_pool_az)
-      if volumes && !volumes.empty?
-        disks = volumes.map { |vid| @ec2.volumes[vid] }
-        ensure_same_availability_zone(disks, resource_pool_az)
-        disks.first.availability_zone
-      else
-        resource_pool_az || default_availability_zone
+    def find_ebs_device(sd_name)
+      xvd_name = sd_name.gsub(/^\/dev\/sd/, "/dev/xvd")
+      DEVICE_POLL_TIMEOUT.times do
+        if File.blockdev?(sd_name)
+          return sd_name
+        elsif File.blockdev?(xvd_name)
+          return xvd_name
+        end
+        sleep(1)
       end
-    end
-    # ensure all supplied availability zones are the same
-    # @note this is a private method that is public to make it easier to test
-    def ensure_same_availability_zone(disks, default)
-      zones = disks.map { |disk| disk.availability_zone }
-      zones << default if default
-      zones.uniq!
-      cloud_error "can't use multiple availability zones: %s" %
-        zones.join(", ") unless zones.size == 1 || zones.empty?
+      cloud_error("Cannot find EBS volume on current instance")
     end
-    private
-    # add a tag to something
-    def tag(taggable, key, value)
-      trimmed_key = key[0..(MAX_TAG_KEY_LENGTH - 1)]
-      trimmed_value = value[0..(MAX_TAG_VALUE_LENGTH - 1)]
-      taggable.add_tag(trimmed_key, :value => trimmed_value)
-    rescue AWS::EC2::Errors::InvalidParameterValue => e
-      @logger.error("could not tag #{taggable.id}: #{e.message}")
-    end
-    # Prepare EC2 user data
-    # @param [Hash] network_spec network specification
-    # @return [Hash] EC2 user data
-    def user_data(network_spec)
-      data = {}
+    private
-      data["registry"] = { "endpoint" => @registry.endpoint }
+    attr_reader :az_selector
+    attr_reader :region
-      with_dns(network_spec) do |servers|
-        data["dns"] = { "nameserver" => servers }
-      end
+    def agent_properties
+      @agent_properties ||= options.fetch('agent', {})
+    end
-      data
+    def aws_properties
+      @aws_properties ||= options.fetch('aws')
     end
-    # extract dns server list from network spec and yield the the list
-    # @param [Hash] network_spec network specification for instance
-    # @yield [Array]
-    def with_dns(network_spec)
-      network_spec.each_value do |properties|
-        if properties["dns"]
-          yield properties["dns"]
-          return
-        end
-      end
+    def aws_region
+      @aws_region ||= aws_properties.fetch('region', nil)
     end
-    def image_params(cloud_properties, snapshot_id)
-      root_device_name = cloud_properties["root_device_name"]
-      architecture = cloud_properties["architecture"]
+    def fast_path_delete?
+      aws_properties.fetch('fast_path_delete', false)
+    end
-      params = {
-          :name => "BOSH-#{generate_unique_name}",
-          :architecture => architecture,
-          :kernel_id => find_aki(architecture, root_device_name),
-          :root_device_name =>  root_device_name,
-          :block_device_mappings => {
-              "/dev/sda" => { :snapshot_id => snapshot_id },
-              "/dev/sdb" => "ephemeral0"
-          }
+    def initialize_aws
+      aws_logger = logger
+      aws_params = {
+          access_key_id:     aws_properties['access_key_id'],
+          secret_access_key: aws_properties['secret_access_key'],
+          ec2_endpoint:      aws_properties['ec2_endpoint'] || default_ec2_endpoint,
+          elb_endpoint:      aws_properties['elb_endpoint'] || default_elb_endpoint,
+          max_retries:       aws_properties['max_retries']  || DEFAULT_MAX_RETRIES ,
+          logger:            aws_logger
       }
-      # old stemcells doesn't have name & version
-      if cloud_properties["name"] && cloud_properties["version"]
-        name = "#{cloud_properties['name']} #{cloud_properties['version']}"
-        params[:description] = name
-      end
+      aws_params[:proxy_uri] = aws_properties['proxy_uri'] if aws_properties['proxy_uri']
-      params
-    end
+      # AWS Ruby SDK is threadsafe but Ruby autoload isn't,
+      # so we need to trigger eager autoload while constructing CPI
+      AWS.eager_autoload!
+      AWS.config(aws_params)
-    def find_aki(architecture, root_device_name)
-      @aki_picker.pick(architecture, root_device_name)
+      @ec2 = AWS::EC2.new
+      @region = @ec2.regions[aws_region]
+      @az_selector = AvailabilityZoneSelector.new(@region, aws_properties['default_availability_zone'])
     end
-    ##
-    # Generates initial agent settings. These settings will be read by agent
-    # from AWS registry (also a BOSH component) on a target instance. Disk
-    # conventions for amazon are:
-    # system disk: /dev/sda
-    # ephemeral disk: /dev/sdb
-    # EBS volumes can be configured to map to other device names later (sdf
-    # through sdp, also some kernels will remap sd* to xvd*).
-    #
-    # @param [String] agent_id Agent id (will be picked up by agent to
-    #   assume its identity
-    # @param [Hash] network_spec Agent network spec
-    # @param [Hash] environment
-    # @param [String] root_device_name root device, e.g. /dev/sda1
-    # @return [Hash]
-    def initial_agent_settings(agent_id, network_spec, environment,
-                               root_device_name)
-      settings = {
-        "vm" => {
-          "name" => "vm-#{generate_unique_name}"
-        },
-        "agent_id" => agent_id,
-        "networks" => network_spec,
-        "disks" => {
-          "system" => root_device_name,
-          "ephemeral" => "/dev/sdb",
-          "persistent" => {}
-        }
-      }
+    def initialize_registry
+      registry_properties = options.fetch('registry')
+      registry_endpoint   = registry_properties.fetch('endpoint')
+      registry_user       = registry_properties.fetch('user')
+      registry_password   = registry_properties.fetch('password')
-      settings["env"] = environment if environment
-      settings.merge(@agent_properties)
+      # Registry updates are not really atomic in relation to
+      # EC2 API calls, so they might get out of sync. Cloudcheck
+      # is supposed to fix that.
+      @registry = Bosh::Registry::Client.new(registry_endpoint,
+                                             registry_user,
+                                             registry_password)
     end
     def update_agent_settings(instance)
@@ -547,80 +533,48 @@ module Bosh::AwsCloud
         raise ArgumentError, "block is not provided"
       end
-      settings = @registry.read_settings(instance.id)
+      settings = registry.read_settings(instance.id)
       yield settings
-      @registry.update_settings(instance.id, settings)
+      registry.update_settings(instance.id, settings)
     end
-    def generate_unique_name
-      UUIDTools::UUID.random_create.to_s
-    end
-    ##
-    # Reads current instance id from EC2 metadata. We are assuming
-    # instance id cannot change while current process is running
-    # and thus memoizing it.
-    def current_instance_id
-      @metadata_lock.synchronize do
-        return @current_instance_id if @current_instance_id
-        client = HTTPClient.new
-        client.connect_timeout = METADATA_TIMEOUT
-        # Using 169.254.169.254 is an EC2 convention for getting
-        # instance metadata
-        uri = "http://169.254.169.254/1.0/meta-data/instance-id/"
+    def attach_ebs_volume(instance, volume)
+      device_name = select_device_name(instance)
+      cloud_error('Instance has too many disks attached') unless device_name
+      # Work around AWS eventual (in)consistency:
+      # even though we don't call attach_disk until the disk is ready,
+      # AWS might still lie and say that the disk isn't ready yet, so
+      # we try again just to be really sure it is telling the truth
+      attachment = nil
+      Bosh::Common.retryable(tries: 15, on: AWS::EC2::Errors::IncorrectState) do
+        attachment = volume.attach_to(instance, device_name)
+      end
-        response = client.get(uri)
-        unless response.status == 200
-          cloud_error("Instance metadata endpoint returned " \
-                      "HTTP #{response.status}")
-        end
+      logger.info("Attaching '#{volume.id}' to '#{instance.id}' as '#{device_name}'")
+      ResourceWait.for_attachment(attachment: attachment, state: :attached)
-        @current_instance_id = response.body
-      end
+      device_name = attachment.device
+      logger.info("Attached '#{volume.id}' to '#{instance.id}' as '#{device_name}'")
-    rescue HTTPClient::TimeoutError
-      cloud_error("Timed out reading instance metadata, " \
-                  "please make sure CPI is running on EC2 instance")
+      device_name
     end
-    def attach_ebs_volume(instance, volume)
-      # TODO once we upgrade the aws-sdk gem to > 1.3.9, we need to use:
-      # instance.block_device_mappings.to_hash.keys
+    def select_device_name(instance)
       device_names = Set.new(instance.block_device_mappings.to_hash.keys)
-      new_attachment = nil
-      ("f".."p").each do |char| # f..p is what console suggests
-        # Some kernels will remap sdX to xvdX, so agent needs
-        # to lookup both (sd, then xvd)
-        dev_name = "/dev/sd#{char}"
-        if device_names.include?(dev_name)
-          @logger.warn("`#{dev_name}' on `#{instance.id}' is taken")
-          next
-        end
-        new_attachment = volume.attach_to(instance, dev_name)
-        break
-      end
-      if new_attachment.nil?
-        # TODO: better messaging?
-        cloud_error("Instance has too many disks attached")
+      ('f'..'p').each do |char| # f..p is what console suggests
+                                # Some kernels will remap sdX to xvdX, so agent needs
+                                # to look up both (sd, then xvd)
+        device_name = "/dev/sd#{char}"
+        return device_name unless device_names.include?(device_name)
+        logger.warn("'#{device_name}' on '#{instance.id}' is taken")
       end
-      @logger.info("Attaching `#{volume.id}' to `#{instance.id}'")
-      wait_resource(new_attachment, :attached)
-      device_name = new_attachment.device
-      @logger.info("Attached `#{volume.id}' to `#{instance.id}', " \
-                   "device name is `#{device_name}'")
-      device_name
+      nil
     end
-    def detach_ebs_volume(instance, volume)
-      # TODO once we upgrade the aws-sdk gem to > 1.3.9, we need to use:
-      # instance.block_device_mappings.to_hash.keys
+    def detach_ebs_volume(instance, volume, force=false)
       mappings = instance.block_device_mappings.to_hash
       device_map = mappings.inject({}) do |hash, (device_name, attachment)|
@@ -629,98 +583,14 @@ module Bosh::AwsCloud
       end
       if device_map[volume.id].nil?
-        cloud_error("Disk `#{volume.id}' is not attached " \
-                    "to instance `#{instance.id}'")
+        raise Bosh::Clouds::DiskNotAttached.new(true),
+              "Disk `#{volume.id}' is not attached to instance `#{instance.id}'"
       end
-      attachment = volume.detach_from(instance, device_map[volume.id])
-      @logger.info("Detaching `#{volume.id}' from `#{instance.id}'")
+      attachment = volume.detach_from(instance, device_map[volume.id], force: force)
+      logger.info("Detaching `#{volume.id}' from `#{instance.id}'")
-      wait_resource(attachment, :detached) do |error|
-        if error.is_a? AWS::Core::Resource::NotFound
-          @logger.info("attachment is no longer found, assuming it to be detached")
-          :detached
-        end
-      end
-    end
-    # This method tries to execute the helper script stemcell-copy
-    # as root using sudo, since it needs to write to the ebs_volume.
-    # If stemcell-copy isn't available, it falls back to writing directly
-    # to the device, which is used in the micro bosh deployer.
-    # The stemcell-copy script must be in the PATH of the user running
-    # the director, and needs sudo privileges to execute without
-    # password.
-    def copy_root_image(image_path, ebs_volume)
-      path = ENV["PATH"]
-      if stemcell_copy = has_stemcell_copy(path)
-        @logger.debug("copying stemcell using stemcell-copy script")
-        # note that is is a potentially dangerous operation, but as the
-        # stemcell-copy script sets PATH to a sane value this is safe
-        out = `sudo #{stemcell_copy} #{image_path} #{ebs_volume} 2>&1`
-      else
-        @logger.info("falling back to using dd to copy stemcell")
-        out = `tar -xzf #{image_path} -O root.img | dd of=#{ebs_volume} 2>&1`
-      end
-      unless $?.exitstatus == 0
-        cloud_error("Unable to copy stemcell root image, " \
-                    "exit status #{$?.exitstatus}: #{out}")
-      end
-      @logger.debug("stemcell copy output:\n#{out}")
-    end
-    # checks if the stemcell-copy script can be found in
-    # the current PATH
-    def has_stemcell_copy(path)
-      path.split(":").each do |dir|
-        stemcell_copy = File.join(dir, "stemcell-copy")
-        return stemcell_copy if File.exist?(stemcell_copy)
-      end
-      nil
-    end
-    def find_ebs_device(sd_name)
-      xvd_name = sd_name.gsub(/^\/dev\/sd/, "/dev/xvd")
-      DEVICE_POLL_TIMEOUT.times do
-        if File.blockdev?(sd_name)
-          return sd_name
-        elsif File.blockdev?(xvd_name)
-          return xvd_name
-        end
-        sleep(1)
-      end
-      cloud_error("Cannot find EBS volume on current instance")
-    end
-    ##
-    # Soft reboots EC2 instance
-    # @param [AWS::EC2::Instance] instance EC2 instance
-    def soft_reboot(instance)
-      # There is no trackable status change for the instance being
-      # rebooted, so it's up to CPI client to keep track of agent
-      # being ready after reboot.
-      instance.reboot
-    end
-    ##
-    # Hard reboots EC2 instance
-    # @param [AWS::EC2::Instance] instance EC2 instance
-    def hard_reboot(instance)
-      # N.B. This will only work with ebs-store instances,
-      # as instance-store instances don't support stop/start.
-      instance.stop
-      @logger.info("Stopping instance `#{instance.id}'")
-      wait_resource(instance, :stopped)
-      instance.start
-      @logger.info("Starting instance `#{instance.id}'")
-      wait_resource(instance, :running)
+      ResourceWait.for_attachment(attachment: attachment, state: :detached)
     end
     ##
@@ -728,41 +598,54 @@ module Bosh::AwsCloud
     # be used to create all required data structures etc.
     #
     def validate_options
-      unless @options.has_key?("aws") &&
-          @options["aws"].is_a?(Hash) &&
-          @options["aws"]["access_key_id"] &&
-          @options["aws"]["secret_access_key"]
-        raise ArgumentError, "Invalid AWS configuration parameters"
-      end
+      required_keys = {
+          "aws" => ["access_key_id", "secret_access_key", "region", "default_key_name"],
+          "registry" => ["endpoint", "user", "password"],
+      }
-      unless @options.has_key?("registry") &&
-          @options["registry"].is_a?(Hash) &&
-          @options["registry"]["endpoint"] &&
-          @options["registry"]["user"] &&
-          @options["registry"]["password"]
-        raise ArgumentError, "Invalid registry configuration parameters"
-      end
-    end
+      missing_keys = []
-    def default_ec2_endpoint
-      if @aws_region
-        "ec2.#{@aws_region}.amazonaws.com"
-      else
-        DEFAULT_EC2_ENDPOINT
+      required_keys.each_pair do |key, values|
+        values.each do |value|
+          if (!options.has_key?(key) || !options[key].has_key?(value))
+            missing_keys << "#{key}:#{value}"
+          end
+        end
       end
-    end
-    def default_availability_zone
-      if @aws_region
-        "#{@aws_region}b"
-      else
-        DEFAULT_AVAILABILITY_ZONE
-      end
+      raise ArgumentError, "missing configuration parameters > #{missing_keys.join(', ')}" unless missing_keys.empty?
     end
-    def task_checkpoint
-      Bosh::Clouds::Config.task_checkpoint
+    # Generates initial agent settings. These settings will be read by agent
+    # from AWS registry (also a BOSH component) on a target instance. Disk
+    # conventions for amazon are:
+    # system disk: /dev/sda
+    # ephemeral disk: /dev/sdb
+    # EBS volumes can be configured to map to other device names later (sdf
+    # through sdp, also some kernels will remap sd* to xvd*).
+    #
+    # @param [String] agent_id Agent id (will be picked up by agent to
+    #   assume its identity)
+    # @param [Hash] network_spec Agent network spec
+    # @param [Hash] environment
+    # @param [String] root_device_name root device, e.g. /dev/sda1
+    # @return [Hash]
+    def initial_agent_settings(agent_id, network_spec, environment, root_device_name)
+      settings = {
+          "vm" => {
+              "name" => "vm-#{SecureRandom.uuid}"
+          },
+          "agent_id" => agent_id,
+          "networks" => network_spec,
+          "disks" => {
+              "system" => root_device_name,
+              "ephemeral" => "/dev/sdb",
+              "persistent" => {}
+          }
+      }
+      settings["env"] = environment if environment
+      settings.merge(agent_properties)
     end
   end
 end