ecs_deploy 0.2.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,209 @@
1
+ require "aws-sdk-autoscaling"
2
+ require "aws-sdk-ec2"
3
+
4
+ require "ecs_deploy"
5
+ require "ecs_deploy/auto_scaler/config_base"
6
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
7
+
8
module EcsDeploy
  module AutoScaler
    # Settings and scaling operations for one EC2 Auto Scaling group whose
    # instances are registered to an ECS cluster.
    #
    # Struct members:
    #   name            - Auto Scaling group name sent to the AutoScaling API
    #   region          - AWS region used for every client built here
    #   cluster         - ECS cluster name handed to ClusterResourceManager
    #   buffer          - extra capacity added to the required capacity (read via #to_f)
    #   service_configs - ServiceConfig objects built from the "services" attribute
    AutoScalingGroupConfig = Struct.new(:name, :region, :cluster, :buffer, :service_configs) do
      include ConfigBase

      # Largest number of instance IDs sent in a single detach_instances request.
      MAX_DETACHABLE_INSTANCE_COUNT = 20

      # @param attributes [Hash] string-keyed settings; "services" must be a list of
      #   hashes, every other key is assigned through the matching writer
      # @param logger [#info] logger used by this config and passed to each ServiceConfig
      def initialize(attributes = {}, logger)
        attributes = attributes.dup
        services = attributes.delete("services")
        super(attributes, logger)
        # Each service inherits this group's cluster and region.
        self.service_configs = services.map do |s|
          ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
        end
      end

      # Moves the group's desired capacity towards `required_capacity + buffer`
      # (rounded up). Orphan instances are cleaned up first. Any StandardError is
      # reported to AutoScaler.error_logger instead of being raised.
      #
      # @param required_capacity [Numeric]
      def update_desired_capacity(required_capacity)
        detach_and_terminate_orphan_instances

        desired_capacity = (required_capacity + buffer.to_f).ceil

        # One client serves both the describe and the update request below.
        asg_client = client
        current_asg = asg_client.describe_auto_scaling_groups({
          auto_scaling_group_names: [name],
        }).auto_scaling_groups[0]
        current_capacity = current_asg.desired_capacity

        if current_capacity > desired_capacity
          decreased_capacity = decrease_desired_capacity(current_capacity - desired_capacity)
          if decreased_capacity > 0
            new_desired_capacity = current_capacity - decreased_capacity
            cluster_resource_manager.trigger_capacity_update(current_capacity, new_desired_capacity)
            @logger.info "#{log_prefix} Update desired_capacity to #{new_desired_capacity}"
          else
            @logger.info "#{log_prefix} Tried to Update desired_capacity but there were no deregisterable instances"
          end
        elsif current_capacity < desired_capacity
          asg_client.update_auto_scaling_group(
            auto_scaling_group_name: name,
            min_size: 0,
            # Raise max_size when the target would otherwise exceed it.
            max_size: [current_asg.max_size, desired_capacity].max,
            desired_capacity: desired_capacity,
          )
          cluster_resource_manager.trigger_capacity_update(current_capacity, desired_capacity)
          @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
        end
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # @return [ClusterResourceManager] memoized manager that counts capacity in instances
      def cluster_resource_manager
        @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
          region: region,
          cluster: cluster,
          service_configs: service_configs,
          capacity_based_on: "instances",
          logger: @logger,
        )
      end

      # Detaches the given instances from the group in batches of at most
      # MAX_DETACHABLE_INSTANCE_COUNT. Does nothing for an empty list.
      #
      # @param instance_ids [Array<String>]
      # @param should_decrement_desired_capacity [Boolean] forwarded to the API unchanged
      def detach_instances(instance_ids:, should_decrement_desired_capacity:)
        return if instance_ids.empty?

        # Build the client once instead of once per batch.
        asg_client = client
        instance_ids.each_slice(MAX_DETACHABLE_INSTANCE_COUNT) do |ids|
          asg_client.detach_instances(
            auto_scaling_group_name: name,
            instance_ids: ids,
            should_decrement_desired_capacity: should_decrement_desired_capacity,
          )
        end

        @logger.info "#{log_prefix} Detach instances from ASG: #{instance_ids.inspect}"
      end

      private

      # Deregisters up to `count` container instances from the cluster, then
      # detaches and terminates the matching EC2 instances.
      #
      # Only instances with no pending tasks, not running a task of one of the
      # configured services, and belonging to this group are candidates.
      #
      # @param count [Integer] number of instances to remove at most
      # @return [Integer] number of instances that were deregistered
      def decrease_desired_capacity(count)
        container_instance_arns_in_service = cluster_resource_manager.fetch_container_instance_arns_in_service
        container_instances_in_cluster = cluster_resource_manager.fetch_container_instances_in_cluster
        auto_scaling_group_instances = instances(reload: true)
        group_instance_ids = auto_scaling_group_instances.map(&:instance_id)

        deregisterable_instances = container_instances_in_cluster.select do |i|
          i.pending_tasks_count == 0 &&
            !running_essential_task?(i, container_instance_arns_in_service) &&
            group_instance_ids.include?(i.ec2_instance_id)
        end

        @logger.info "#{log_prefix} Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"

        az_to_instance_count = auto_scaling_group_instances.each_with_object(Hash.new(0)) { |i, h| h[i.availability_zone] += 1 }
        # A container instance lacking the attribute is grouped under nil and is
        # therefore never picked, rather than aborting the whole scale-in.
        az_to_deregisterable_instances = deregisterable_instances.group_by do |i|
          i.attributes.find { |a| a.name == "ecs.availability-zone" }&.value
        end

        deregistered_instance_ids = []
        prev_max_count = nil
        # Select instances to be deregistered while keeping the instance count per
        # availability zone balanced: each round only takes from the fullest zones.
        while deregistered_instance_ids.size < count
          max_count = az_to_instance_count.each_value.max
          # The largest zone did not shrink in the previous round, so nothing more
          # can be deregistered without unbalancing the zones.
          break if max_count == prev_max_count

          azs = az_to_instance_count.select { |_, c| c == max_count }.keys
          azs.each do |az|
            instance = az_to_deregisterable_instances[az]&.pop
            next if instance.nil?

            begin
              cluster_resource_manager.deregister_container_instance(instance.container_instance_arn)
              deregistered_instance_ids << instance.ec2_instance_id
              az_to_instance_count[az] -= 1
            rescue EcsDeploy::AutoScaler::ClusterResourceManager::DeregisterContainerInstanceFailed => e
              # Best effort: skip this instance, but leave a trace of the failure.
              AutoScaler.error_logger.warn("#{log_prefix} Failed to deregister #{instance.ec2_instance_id}: #{e.message}")
            end
            break if deregistered_instance_ids.size >= count
          end
          prev_max_count = max_count
        end

        @logger.info "#{log_prefix} Deregistered instances: #{deregistered_instance_ids.inspect}"

        detach_and_terminate_instances(deregistered_instance_ids)

        deregistered_instance_ids.size
      end

      # Detaches the instances (decrementing the desired capacity), waits three
      # seconds, then terminates them. Errors go to AutoScaler.error_logger.
      #
      # @param instance_ids [Array<String>]
      def detach_and_terminate_instances(instance_ids)
        return if instance_ids.empty?

        detach_instances(
          instance_ids: instance_ids,
          should_decrement_desired_capacity: true
        )

        sleep 3

        ec2_client.terminate_instances(instance_ids: instance_ids)

        @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # Detaches and terminates group instances that are not registered to the
      # cluster and were launched more than 600 seconds ago. Errors go to
      # AutoScaler.error_logger.
      def detach_and_terminate_orphan_instances
        container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
        orphans = instances(reload: true).reject do |i|
          next true if container_instance_ids.include?(i.instance_id)

          # The lifecycle state of terminated instances becomes "Terminating", "Terminating:Wait", or "Terminating:Proceed",
          # and we can't detach instances in such a state.
          terminating = i.lifecycle_state.start_with?("Terminating")
          if terminating
            AutoScaler.error_logger.warn("#{log_prefix} The lifecycle state of #{i.instance_id} is \"#{i.lifecycle_state}\", so ignore it")
          end
          terminating
        end.map(&:instance_id)

        return if orphans.empty?

        targets = ec2_client.describe_instances(instance_ids: orphans).reservations.flat_map(&:instances).select do |i|
          (Time.now - i.launch_time) > 600
        end

        detach_and_terminate_instances(targets.map(&:instance_id))
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # @return [Aws::AutoScaling::Client] a new client on every call
      def client
        Aws::AutoScaling::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: logger
        )
      end

      # @return [Aws::EC2::Client] a new client on every call
      def ec2_client
        Aws::EC2::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: logger
        )
      end

      # Instances of the group as reported by describe_auto_scaling_groups. The
      # result is cached; pass reload: true to fetch it again.
      def instances(reload: false)
        return @instances unless reload || @instances.nil?

        resp = client.describe_auto_scaling_groups({
          auto_scaling_group_names: [name],
        })
        @instances = resp.auto_scaling_groups[0].instances
      end

      # @return [Boolean] true when the instance runs at least one task and is
      #   among the container instances hosting the configured services
      def running_essential_task?(instance, container_instance_arns_in_service)
        return false if instance.running_tasks_count == 0

        container_instance_arns_in_service.include?(instance.container_instance_arn)
      end

      # Prefix for log lines: class name without its namespace, group name, region.
      def log_prefix
        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ require "timeout"
2
+
3
+ require "aws-sdk-ecs"
4
+
5
module EcsDeploy
  module AutoScaler
    # Tracks the capacity of an ECS cluster (counted in instances or vCPUs) and
    # how much of it is in use, so callers can wait for free capacity.
    class ClusterResourceManager
      # Raised when ECS rejects a deregister_container_instance request.
      class DeregisterContainerInstanceFailed < StandardError; end

      MAX_DESCRIBABLE_SERVICE_COUNT = 10

      # @param region [String] AWS region for the ECS client
      # @param cluster [String] ECS cluster name
      # @param service_configs [Array] objects responding to #name, #desired_count and #required_capacity
      # @param logger [#debug, #info, nil]
      # @param capacity_based_on [String] "instances" or "vCPUs"
      # @raise [ArgumentError] for any other capacity_based_on value
      def initialize(region:, cluster:, service_configs:, logger: nil, capacity_based_on:)
        @region = region
        @cluster = cluster
        @logger = logger
        @service_configs = service_configs
        @capacity_based_on = capacity_based_on
        if @capacity_based_on != "instances" && @capacity_based_on != "vCPUs"
          raise ArgumentError, 'capacity_based_on should be either "instances" or "vCPUs"'
        end

        @mutex = Mutex.new
        @resource = ConditionVariable.new
        @used_capacity = @service_configs.sum { |s| s.desired_count * s.required_capacity }
        @capacity = calculate_active_instance_capacity
      end

      # Reserves `capacity`, blocking until enough is free.
      #
      # The wait uses ConditionVariable#wait with a deadline rather than
      # Timeout.timeout, so no exception is injected asynchronously into a thread
      # that holds the mutex. The deadline is measured from the moment the method
      # is entered.
      #
      # @param capacity [Numeric]
      # @param timeout [Numeric, nil] seconds to wait; nil or 0 waits indefinitely
      #   (the same meaning Timeout.timeout gives those values)
      # @return [Boolean] true once reserved, false when the timeout elapsed first
      def acquire(capacity, timeout: nil)
        deadline = monotonic_now + timeout if timeout && !timeout.zero?

        @mutex.synchronize do
          @logger&.debug("#{log_prefix} Try to acquire #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
          # Re-check after every wake-up: broadcasts do not guarantee enough room.
          while @capacity - @used_capacity < capacity
            if deadline
              remaining = deadline - monotonic_now
              return false if remaining <= 0

              @resource.wait(@mutex, remaining)
            else
              @resource.wait(@mutex)
            end
          end
          @used_capacity += capacity
          @logger&.debug("#{log_prefix} Acquired #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
        end
        true
      end

      # Gives back previously acquired capacity and wakes every waiter.
      #
      # @param capacity [Numeric]
      # @return [true]
      def release(capacity)
        @mutex.synchronize do
          @used_capacity -= capacity
          @resource.broadcast
          # Logged under the lock so the reported counters belong together.
          @logger&.debug("#{log_prefix} Released #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
        end
        true
      end

      # @return [Array] descriptions of every container instance in the cluster,
      #   or [] when the first listing page is empty
      def fetch_container_instances_in_cluster
        cl = ecs_client
        pages = cl.list_container_instances(cluster: @cluster)
        return [] if pages.container_instance_arns.empty?

        pages.flat_map do |page|
          cl.describe_container_instances(cluster: @cluster, container_instances: page.container_instance_arns).container_instances
        end
      end

      # @return [Array<String>] ARNs of container instances selected by a
      #   "task:group in [service:<name>, ...]" filter built from the service configs
      def fetch_container_instance_arns_in_service
        task_groups = @service_configs.map { |s| "service:#{s.name}" }
        ecs_client.list_container_instances(cluster: @cluster, filter: "task:group in [#{task_groups.join(",")}]").flat_map(&:container_instance_arns)
      end

      # Force-deregisters a container instance from the cluster.
      #
      # @raise [DeregisterContainerInstanceFailed] when ECS answers with
      #   InvalidParameterException; the original message is preserved
      def deregister_container_instance(container_instance_arn)
        ecs_client.deregister_container_instance(cluster: @cluster, container_instance: container_instance_arn, force: true)
      rescue Aws::ECS::Errors::InvalidParameterException => e
        raise DeregisterContainerInstanceFailed, e.message
      end

      # Starts a background thread that re-reads the active capacity every
      # `interval` seconds until it equals `new_desired_capacity` (or exceeds it
      # when scaling out), giving up after 180 seconds.
      #
      # @param old_desired_capacity [Integer]
      # @param new_desired_capacity [Integer]
      # @param interval [Numeric] seconds between polls
      # @param wait_until_capacity_updated [Boolean] when true, join the thread and
      #   log a timeout (as a warning for "vCPUs", as an error otherwise)
      def trigger_capacity_update(old_desired_capacity, new_desired_capacity, interval: 5, wait_until_capacity_updated: false)
        th = Thread.new do
          @logger&.info "#{log_prefix} Start updating capacity: #{old_desired_capacity} -> #{new_desired_capacity}"
          Timeout.timeout(180) do
            until @capacity == new_desired_capacity || (new_desired_capacity >= old_desired_capacity && @capacity > new_desired_capacity)
              @mutex.synchronize do
                begin
                  @capacity = calculate_active_instance_capacity
                  # Waiters in #acquire may now fit.
                  @resource.broadcast
                rescue => e
                  AutoScaler.error_logger.warn("#{log_prefix} `#{__method__}': #{e} (#{e.class})")
                end
              end

              sleep interval
            end
            @logger&.info "#{log_prefix} capacity is updated to #{@capacity}"
          end
        end

        if wait_until_capacity_updated
          @logger&.info "#{log_prefix} Wait for the capacity of active instances to become #{new_desired_capacity} from #{old_desired_capacity}"
          begin
            th.join
          rescue Timeout::Error => e
            msg = "#{log_prefix} `#{__method__}': #{e} (#{e.class})"
            if @capacity_based_on == "vCPUs"
              # Timeout::Error sometimes occurs.
              # For example, @capacity won't be new_desired_capacity if new_desired_capacity is odd and all instances have 2 vCPUs
              AutoScaler.error_logger.warn(msg)
            else
              AutoScaler.error_logger.error(msg)
            end
          end
        end
      end

      # @return [Integer] number of ACTIVE container instances, or, for "vCPUs",
      #   the sum of their registered "CPU" resource divided by 1024 (integer division)
      def calculate_active_instance_capacity
        cl = ecs_client
        active_pages = cl.list_container_instances(cluster: @cluster, status: "ACTIVE")

        if @capacity_based_on == "instances"
          return active_pages.sum { |page| page.container_instance_arns.size }
        end

        total_cpu = active_pages.sum do |page|
          next 0 if page.container_instance_arns.empty?

          # Reuse the client created above instead of building one per page.
          cl.describe_container_instances(
            cluster: @cluster,
            container_instances: page.container_instance_arns,
          ).container_instances.sum { |ci| ci.registered_resources.find { |r| r.name == "CPU" }.integer_value }
        end

        total_cpu / 1024
      end

      private

      # @return [Aws::ECS::Client] a new client on every call
      def ecs_client
        Aws::ECS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: @region,
          logger: @logger,
        )
      end

      # Monotonic clock reading in seconds, used for deadlines.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end

      # Prefix for log lines: class name without its namespace, region, cluster.
      def log_prefix
        "[#{self.class.to_s.gsub(/\AEcsDeploy::AutoScaler::/, "")} #{@region} #{@cluster}]"
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module EcsDeploy
  module AutoScaler
    # Mixin providing a constructor that fills an object from a hash of
    # attributes and remembers a logger.
    module ConfigBase
      # @return [Object] the logger given to #initialize
      attr_reader :logger

      # Assigns every attribute through its writer method ("key" => `key=`).
      #
      # Only public writers are reachable: `public_send` is used so an attribute
      # key cannot invoke a private method of the including class.
      #
      # @param attributes [Hash] attribute name => value
      # @param logger [Object] stored and exposed through #logger
      # @raise [NoMethodError] when a key has no public writer
      def initialize(attributes = {}, logger)
        attributes.each do |key, val|
          public_send("#{key}=", val)
        end
        @logger = logger
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
+ require "aws-sdk-ec2"
2
+ require "aws-sdk-ecs"
3
+ require "aws-sdk-sqs"
4
+
5
+ require "ecs_deploy"
6
+
7
module EcsDeploy
  module AutoScaler
    # Polls an SQS queue for spot instance interruption warnings, sets the
    # affected container instances to DRAINING and detaches them from their
    # Auto Scaling groups.
    class InstanceDrainer
      # @param auto_scaling_group_configs [Array, nil] configs responding to
      #   #name, #region, #cluster and #detach_instances
      # @param spot_fleet_request_configs [Array, nil] configs responding to
      #   #id, #region and #cluster
      # @param logger [#debug, #info]
      def initialize(auto_scaling_group_configs:, spot_fleet_request_configs:, logger:)
        @auto_scaling_group_configs = auto_scaling_group_configs || []
        @spot_fleet_request_configs = spot_fleet_request_configs || []
        @logger = logger
        @stop = false
      end

      # Polls `queue_url` until #stop is called. Errors raised while handling a
      # batch are reported to AutoScaler.error_logger and polling resumes.
      #
      # @param queue_url [String] SQS queue URL; the region is the second label of its host
      def poll_spot_instance_interruption_warnings(queue_url)
        @logger.debug "Start polling spot instance interruption warnings of #{queue_url}"

        # cf. https://docs.aws.amazon.com/general/latest/gr/rande.html#sqs_region
        region = URI.parse(queue_url).host.split(".")[1]

        poller = Aws::SQS::QueuePoller.new(queue_url, client: sqs_client(region))
        poller.before_request do |_stats|
          throw :stop_polling if @stop
        end

        until @stop
          begin
            poller.poll(max_number_of_messages: 10, visibility_timeout: 15) do |messages, _|
              instance_ids = extract_instance_ids(messages)
              # Nothing usable in this batch: return normally so the poller removes
              # the messages instead of redelivering them.
              next if instance_ids.empty?

              config_to_instance_ids = build_config_to_instance_ids(instance_ids, region)
              set_instance_state_to_draining(config_to_instance_ids, region)
              # Detach the instances to launch other instances
              detach_instances_from_auto_scaling_groups(config_to_instance_ids, region)
            end
          rescue => e
            AutoScaler.error_logger.error(e)
          end
        end

        @logger.debug "Stop polling spot instance interruption warnings of #{queue_url}"
      end

      # Asks the polling loop to finish; checked before each request and each loop turn.
      def stop
        @stop = true
      end

      private

      # Reads detail."instance-id" from each message body. Messages whose body is
      # not valid JSON or carries no instance ID are logged and skipped, so one
      # bad message cannot make the whole batch fail.
      #
      # @param messages [Array<#body>]
      # @return [Array<String>]
      def extract_instance_ids(messages)
        messages.each_with_object([]) do |msg, ids|
          instance_id =
            begin
              payload = JSON.parse(msg.body)
              payload.is_a?(Hash) ? payload.dig("detail", "instance-id") : nil
            rescue JSON::ParserError, TypeError
              nil
            end

          if instance_id
            ids << instance_id
          else
            AutoScaler.error_logger.warn("Ignore a message without an instance ID: #{msg.body.inspect}")
          end
        end
      end

      # Maps each known config to the interrupted instances it owns, using the
      # "aws:ec2spot:fleet-request-id" tag first and "aws:autoscaling:groupName"
      # otherwise. Instances matching no config are dropped.
      #
      # @return [Hash{Object => Array<String>}] hash whose default value is a new array
      def build_config_to_instance_ids(instance_ids, region)
        config_to_instance_ids = Hash.new { |h, k| h[k] = [] }
        # An empty ID list would make describe_instances an unfiltered request.
        return config_to_instance_ids if instance_ids.empty?

        ec2_client(region).describe_instances(instance_ids: instance_ids).each do |resp|
          resp.reservations.each do |reservation|
            reservation.instances.each do |i|
              sfr_id = i.tags.find { |t| t.key == "aws:ec2spot:fleet-request-id" }&.value
              if sfr_id
                config = @spot_fleet_request_configs.find { |c| c.id == sfr_id && c.region == region }
                config_to_instance_ids[config] << i.instance_id if config
                next
              end

              asg_name = i.tags.find { |t| t.key == "aws:autoscaling:groupName" }&.value
              if asg_name
                config = @auto_scaling_group_configs.find { |c| c.name == asg_name && c.region == region }
                config_to_instance_ids[config] << i.instance_id if config
              end
            end
          end
        end

        config_to_instance_ids
      end

      # Sets the container instances behind the given EC2 instances to DRAINING,
      # one cluster (config) at a time.
      def set_instance_state_to_draining(config_to_instance_ids, region)
        cl = ecs_client(region)
        config_to_instance_ids.each do |config, instance_ids|
          arns = cl.list_container_instances(
            cluster: config.cluster,
            filter: "ec2InstanceId in [#{instance_ids.join(",")}]",
          ).container_instance_arns

          if instance_ids.size != arns.size
            AutoScaler.error_logger.warn("The number of ARNs differs from the number of instance IDs: instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}")
          end
          next if arns.empty?

          cl.update_container_instances_state(
            cluster: config.cluster,
            container_instances: arns,
            status: "DRAINING",
          )
          @logger.info "Draining instances: region: #{region}, cluster: #{config.cluster}, instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}"
        end
      end

      # Detaches the interrupted instances from their Auto Scaling groups without
      # decrementing the desired capacity. Spot fleet configs are not touched.
      #
      # `region` is accepted for interface compatibility and is not used.
      def detach_instances_from_auto_scaling_groups(config_to_instance_ids, region)
        @auto_scaling_group_configs.each do |config|
          # fetch avoids adding empty entries through the hash's default block.
          instance_ids = config_to_instance_ids.fetch(config, [])
          next if instance_ids.empty?

          config.detach_instances(instance_ids: instance_ids, should_decrement_desired_capacity: false)
        end
      end

      # @return [Aws::EC2::Client] a new client for `region`
      def ec2_client(region)
        Aws::EC2::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end

      # @return [Aws::ECS::Client] a new client for `region`
      def ecs_client(region)
        Aws::ECS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end

      # @return [Aws::SQS::Client] a new client for `region`
      def sqs_client(region)
        Aws::SQS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end
    end
  end
end