ecs_deploy 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ require "aws-sdk-autoscaling"
2
+ require "aws-sdk-ec2"
3
+
4
+ require "ecs_deploy"
5
+ require "ecs_deploy/auto_scaler/config_base"
6
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
7
+
8
module EcsDeploy
  module AutoScaler
    # Settings and scaling operations for one EC2 Auto Scaling group used by an ECS cluster.
    #
    # Members:
    #   name            - name of the Auto Scaling group
    #   region          - AWS region used for the AutoScaling / EC2 clients
    #   cluster         - ECS cluster name handed to ClusterResourceManager
    #   buffer          - extra capacity added to the required capacity (converted with #to_f)
    #   service_configs - ServiceConfig objects built from the "services" attribute
    AutoScalingGroupConfig = Struct.new(:name, :region, :cluster, :buffer, :service_configs) do
      include ConfigBase

      # DetachInstances is called with at most this many instance IDs per request.
      MAX_DETACHABLE_INSTANCE_COUNT = 20

      # @param attributes [Hash] string-keyed settings. "services" (an Array of Hashes, optional)
      #   is turned into ServiceConfig objects; every other key is passed on to super.
      # @param logger [Logger] logger used by this config and by the generated ServiceConfig objects
      def initialize(attributes = {}, logger)
        attributes = attributes.dup # do not mutate the caller's hash
        # A config without a "services" key means "no services" instead of NoMethodError on nil.
        services = attributes.delete("services") || []
        super(attributes, logger)
        self.service_configs = services.map do |s|
          ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
        end
      end

      # Moves the group's desired capacity towards `required_capacity + buffer`, rounded up.
      #
      # Orphan instances are cleaned up first. When the group is larger than needed, container
      # instances are deregistered, detached and terminated; when it is smaller, the group is
      # updated directly (max_size is raised if necessary). Any StandardError is reported to
      # AutoScaler.error_logger instead of being raised.
      #
      # @param required_capacity [Numeric] capacity needed before the buffer is added
      def update_desired_capacity(required_capacity)
        detach_and_terminate_orphan_instances

        desired_capacity = (required_capacity + buffer.to_f).ceil

        current_asg = client.describe_auto_scaling_groups({
          auto_scaling_group_names: [name],
        }).auto_scaling_groups[0]

        if current_asg.desired_capacity > desired_capacity
          decreased_capacity = decrease_desired_capacity(current_asg.desired_capacity - desired_capacity)
          if decreased_capacity > 0
            new_desired_capacity = current_asg.desired_capacity - decreased_capacity
            cluster_resource_manager.trigger_capacity_update(current_asg.desired_capacity, new_desired_capacity)
            @logger.info "#{log_prefix} Update desired_capacity to #{new_desired_capacity}"
          else
            @logger.info "#{log_prefix} Tried to Update desired_capacity but there were no deregisterable instances"
          end
        elsif current_asg.desired_capacity < desired_capacity
          client.update_auto_scaling_group(
            auto_scaling_group_name: name,
            min_size: 0,
            max_size: [current_asg.max_size, desired_capacity].max,
            desired_capacity: desired_capacity,
          )
          cluster_resource_manager.trigger_capacity_update(current_asg.desired_capacity, desired_capacity)
          @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
        end
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # Memoized ClusterResourceManager for this group's cluster; capacity is counted in instances.
      def cluster_resource_manager
        @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
          region: region,
          cluster: cluster,
          service_configs: service_configs,
          capacity_based_on: "instances",
          logger: @logger,
        )
      end

      # Detaches the given instances from the Auto Scaling group, in slices of
      # MAX_DETACHABLE_INSTANCE_COUNT. Does nothing for an empty list.
      #
      # @param instance_ids [Array<String>] EC2 instance IDs
      # @param should_decrement_desired_capacity [Boolean] forwarded to DetachInstances
      def detach_instances(instance_ids:, should_decrement_desired_capacity:)
        return if instance_ids.empty?

        instance_ids.each_slice(MAX_DETACHABLE_INSTANCE_COUNT) do |ids|
          client.detach_instances(
            auto_scaling_group_name: name,
            instance_ids: ids,
            should_decrement_desired_capacity: should_decrement_desired_capacity,
          )
        end

        @logger.info "#{log_prefix} Detach instances from ASG: #{instance_ids.inspect}"
      end

      private

      # Deregisters up to `count` container instances, then detaches and terminates them.
      #
      # An instance is eligible when it has no pending tasks, is not running an essential task
      # (see #running_essential_task?) and belongs to this Auto Scaling group.
      #
      # @param count [Integer] maximum number of instances to remove
      # @return [Integer] number of instances that were deregistered
      def decrease_desired_capacity(count)
        container_instance_arns_in_service = cluster_resource_manager.fetch_container_instance_arns_in_service
        container_instances_in_cluster = cluster_resource_manager.fetch_container_instances_in_cluster
        auto_scaling_group_instances = instances(reload: true)
        deregisterable_instances = container_instances_in_cluster.select do |i|
          i.pending_tasks_count == 0 &&
            !running_essential_task?(i, container_instance_arns_in_service) &&
            auto_scaling_group_instances.any? { |instance| instance.instance_id == i.ec2_instance_id }
        end

        @logger.info "#{log_prefix} Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"

        az_to_instance_count = auto_scaling_group_instances.each_with_object(Hash.new(0)) { |i, h| h[i.availability_zone] += 1 }
        az_to_deregisterable_instances = deregisterable_instances.group_by do |i|
          # Instances without the attribute are grouped under nil and therefore never selected,
          # instead of aborting the whole scale-in with NoMethodError.
          i.attributes.find { |a| a.name == "ecs.availability-zone" }&.value
        end

        deregistered_instance_ids = []
        prev_max_count = nil
        # Select instances to be deregistered keeping the balance of instance count per availability zone.
        # Each pass takes at most one instance from every AZ that currently holds the most instances.
        while deregistered_instance_ids.size < count
          max_count = az_to_instance_count.each_value.max
          break if max_count == prev_max_count # No more deregisterable instances with keeping the balance

          azs = az_to_instance_count.select { |_, c| c == max_count }.keys
          azs.each do |az|
            instance = az_to_deregisterable_instances[az]&.pop
            next if instance.nil?
            begin
              cluster_resource_manager.deregister_container_instance(instance.container_instance_arn)
              deregistered_instance_ids << instance.ec2_instance_id
              az_to_instance_count[az] -= 1
            rescue EcsDeploy::AutoScaler::ClusterResourceManager::DeregisterContainerInstanceFailed => e
              # Best effort: skip this instance, but leave a trace instead of failing silently.
              @logger.warn "#{log_prefix} Failed to deregister #{instance.ec2_instance_id}: #{e.message}"
            end
            break if deregistered_instance_ids.size >= count
          end
          prev_max_count = max_count
        end

        @logger.info "#{log_prefix} Deregistered instances: #{deregistered_instance_ids.inspect}"

        detach_and_terminate_instances(deregistered_instance_ids)

        deregistered_instance_ids.size
      end

      # Detaches the instances (decrementing the desired capacity) and then terminates them.
      # Errors are reported to AutoScaler.error_logger instead of being raised.
      def detach_and_terminate_instances(instance_ids)
        return if instance_ids.empty?

        detach_instances(
          instance_ids: instance_ids,
          should_decrement_desired_capacity: true
        )

        # NOTE(review): presumably gives the detachment time to take effect before terminating - confirm.
        sleep 3

        ec2_client.terminate_instances(instance_ids: instance_ids)

        @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # Detaches and terminates instances of this group that are not registered as container
      # instances of the cluster and were launched more than 600 seconds ago.
      # Errors are reported to AutoScaler.error_logger instead of being raised.
      def detach_and_terminate_orphan_instances
        container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
        orphans = instances(reload: true).reject do |i|
          next true if container_instance_ids.include?(i.instance_id)

          # The lifecycle state of terminated instances becomes "Terminating", "Terminating:Wait", or "Terminating:Proceed",
          # and we can't detach instances in such a state.
          if i.lifecycle_state.start_with?("Terminating")
            AutoScaler.error_logger.warn("#{log_prefix} The lifecycle state of #{i.instance_id} is \"#{i.lifecycle_state}\", so ignore it")
            next true
          end

          false
        end.map(&:instance_id)

        return if orphans.empty?

        # NOTE(review): the 600 second threshold presumably lets fresh instances register to the cluster - confirm.
        targets = ec2_client.describe_instances(instance_ids: orphans).reservations.flat_map(&:instances).select do |i|
          (Time.now - i.launch_time) > 600
        end

        detach_and_terminate_instances(targets.map(&:instance_id))
      rescue => e
        AutoScaler.error_logger.error(e)
      end

      # Builds a new AutoScaling client for this group's region on every call.
      def client
        Aws::AutoScaling::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: logger
        )
      end

      # Builds a new EC2 client for this group's region on every call.
      def ec2_client
        Aws::EC2::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: logger
        )
      end

      # Instances of the Auto Scaling group; cached until called with `reload: true`.
      def instances(reload: false)
        if reload || @instances.nil?
          resp = client.describe_auto_scaling_groups({
            auto_scaling_group_names: [name],
          })
          @instances = resp.auto_scaling_groups[0].instances
        else
          @instances
        end
      end

      # True when the container instance runs at least one task and its ARN is among the
      # container instances that host tasks of the configured services.
      def running_essential_task?(instance, container_instance_arns_in_service)
        return false if instance.running_tasks_count == 0

        container_instance_arns_in_service.include?(instance.container_instance_arn)
      end

      # Prefix for log lines: class name without the EcsDeploy::AutoScaler namespace, group name, region.
      def log_prefix
        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
      end
    end
  end
end
@@ -0,0 +1,149 @@
1
+ require "timeout"
2
+
3
+ require "aws-sdk-ecs"
4
+
5
module EcsDeploy
  module AutoScaler
    # Tracks how much capacity the active container instances of an ECS cluster provide and
    # lets callers reserve (#acquire) and give back (#release) parts of it.
    # Capacity is counted either in instances or in vCPUs, depending on `capacity_based_on`.
    class ClusterResourceManager
      # Raised by #deregister_container_instance when ECS rejects the request as invalid.
      class DeregisterContainerInstanceFailed < StandardError; end

      MAX_DESCRIBABLE_SERVICE_COUNT = 10

      # @param region [String] AWS region of the cluster
      # @param cluster [String] ECS cluster name
      # @param service_configs [Array] objects responding to #name, #desired_count and #required_capacity
      # @param logger [Logger, nil] optional logger
      # @param capacity_based_on [String] "instances" or "vCPUs"
      # @raise [ArgumentError] for any other `capacity_based_on`
      def initialize(region:, cluster:, service_configs:, logger: nil, capacity_based_on:)
        @region = region
        @cluster = cluster
        @logger = logger
        @service_configs = service_configs
        @capacity_based_on = capacity_based_on
        if @capacity_based_on != "instances" && @capacity_based_on != "vCPUs"
          raise ArgumentError, 'capacity_based_on should be either "instances" or "vCPUs"'
        end

        @mutex = Mutex.new
        @resource = ConditionVariable.new
        @used_capacity = @service_configs.sum { |s| s.desired_count * s.required_capacity }
        @capacity = calculate_active_instance_capacity
      end

      # Reserves `capacity`, waiting until enough free capacity is available.
      #
      # @param capacity [Numeric] amount to reserve
      # @param timeout [Numeric, nil] seconds to wait; nil or 0 waits without a limit
      # @return [Boolean] true when reserved, false when the timeout elapsed first
      def acquire(capacity, timeout: nil)
        # The condition variable's own wait timeout is used with a monotonic deadline, so the
        # waiting thread is never interrupted asynchronously while it holds the mutex.
        deadline = timeout.nil? || timeout == 0 ? nil : monotonic_now + timeout

        @mutex.synchronize do
          @logger&.debug("#{log_prefix} Try to acquire #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
          while @capacity - @used_capacity < capacity
            if deadline
              remaining = deadline - monotonic_now
              return false if remaining <= 0
              @resource.wait(@mutex, remaining)
            else
              @resource.wait(@mutex)
            end
          end
          @used_capacity += capacity
          @logger&.debug("#{log_prefix} Acquired #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
        end
        true
      end

      # Gives back `capacity` and wakes up every thread waiting in #acquire.
      #
      # @return [true]
      def release(capacity)
        @mutex.synchronize do
          @used_capacity -= capacity
          @resource.broadcast
          # Logged under the lock so the reported numbers belong to this release.
          @logger&.debug("#{log_prefix} Released #{capacity} capacity (capacity: #{@capacity}, used_capacity: #{@used_capacity})")
        end
        true
      end

      # Describes every container instance of the cluster.
      #
      # @return [Array] container instance descriptions; empty when the cluster has none
      def fetch_container_instances_in_cluster
        cl = ecs_client
        cl.list_container_instances(cluster: @cluster).flat_map do |page|
          # describe_container_instances must not be called with an empty list.
          next [] if page.container_instance_arns.empty?

          cl.describe_container_instances(cluster: @cluster, container_instances: page.container_instance_arns).container_instances
        end
      end

      # ARNs of the container instances that host tasks of one of the configured services.
      def fetch_container_instance_arns_in_service
        task_groups = @service_configs.map { |s| "service:#{s.name}" }
        ecs_client.list_container_instances(cluster: @cluster, filter: "task:group in [#{task_groups.join(",")}]").flat_map(&:container_instance_arns)
      end

      # Force-deregisters a container instance from the cluster.
      #
      # @raise [DeregisterContainerInstanceFailed] when ECS answers with InvalidParameterException
      def deregister_container_instance(container_instance_arn)
        ecs_client.deregister_container_instance(cluster: @cluster, container_instance: container_instance_arn, force: true)
      rescue Aws::ECS::Errors::InvalidParameterException => e
        raise DeregisterContainerInstanceFailed, e.message
      end

      # Starts a background thread that refreshes @capacity every `interval` seconds.
      # It stops when @capacity equals `new_desired_capacity`, or when scaling up and @capacity
      # already exceeds it, or after 180 seconds.
      #
      # @param old_desired_capacity [Integer]
      # @param new_desired_capacity [Integer]
      # @param interval [Numeric] seconds between refreshes
      # @param wait_until_capacity_updated [Boolean] block until the background thread has finished
      def trigger_capacity_update(old_desired_capacity, new_desired_capacity, interval: 5, wait_until_capacity_updated: false)
        th = Thread.new do
          @logger&.info "#{log_prefix} Start updating capacity: #{old_desired_capacity} -> #{new_desired_capacity}"
          begin
            Timeout.timeout(180) do
              until @capacity == new_desired_capacity || (new_desired_capacity >= old_desired_capacity && @capacity > new_desired_capacity)
                @mutex.synchronize do
                  begin
                    @capacity = calculate_active_instance_capacity
                    @resource.broadcast
                  rescue => e
                    AutoScaler.error_logger.warn("#{log_prefix} `#{__method__}': #{e} (#{e.class})")
                  end
                end

                sleep interval
              end
              @logger&.info "#{log_prefix} capacity is updated to #{@capacity}"
            end
          rescue Timeout::Error => e
            # Handled inside the thread so the timeout is reported even when nobody joins it.
            msg = "#{log_prefix} `#{__method__}': #{e} (#{e.class})"
            if @capacity_based_on == "vCPUs"
              # Timeout::Error sometimes occur.
              # For example, @capacity won't be new_desired_capacity if new_desired_capacity is odd and all instances have 2 vCPUs
              AutoScaler.error_logger.warn(msg)
            else
              AutoScaler.error_logger.error(msg)
            end
          end
        end

        if wait_until_capacity_updated
          @logger&.info "#{log_prefix} Wait for the capacity of active instances to become #{new_desired_capacity} from #{old_desired_capacity}"
          th.join
        end
      end

      # Capacity provided by the ACTIVE container instances of the cluster.
      #
      # @return [Integer] the number of instances, or for "vCPUs" the sum of the registered
      #   "CPU" resources divided by 1024 (integer division)
      def calculate_active_instance_capacity
        cl = ecs_client

        if @capacity_based_on == "instances"
          return cl.list_container_instances(cluster: @cluster, status: "ACTIVE").sum do |resp|
            resp.container_instance_arns.size
          end
        end

        total_cpu = cl.list_container_instances(cluster: @cluster, status: "ACTIVE").sum do |resp|
          next 0 if resp.container_instance_arns.empty?

          cl.describe_container_instances(
            cluster: @cluster,
            container_instances: resp.container_instance_arns,
          ).container_instances.sum do |ci|
            # An instance without a "CPU" resource contributes nothing.
            ci.registered_resources.find { |r| r.name == "CPU" }&.integer_value || 0
          end
        end

        total_cpu / 1024
      end

      private

      # Builds a new ECS client for the configured region on every call.
      def ecs_client
        Aws::ECS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: @region,
          logger: @logger,
        )
      end

      # Monotonic clock reading in seconds; unaffected by wall-clock adjustments.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end

      # Prefix for log lines: class name without the EcsDeploy::AutoScaler namespace, region, cluster.
      def log_prefix
        "[#{self.class.to_s.gsub(/\AEcsDeploy::AutoScaler::/, "")} #{@region} #{@cluster}]"
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module EcsDeploy
  module AutoScaler
    # Mixin that provides the constructor and the logger reader for config objects.
    module ConfigBase
      # The logger handed to #initialize.
      attr_reader :logger

      # Assigns every entry of +attributes+ through the writer named after its key
      # (the key "name" calls #name=) and then stores +logger+.
      #
      # @param attributes [Hash] attribute name => value
      # @param logger [Object] returned by #logger afterwards
      def initialize(attributes = {}, logger)
        attributes.each_pair { |attr_name, attr_value| send("#{attr_name}=", attr_value) }
        @logger = logger
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
require "json"
require "uri"

require "aws-sdk-ec2"
require "aws-sdk-ecs"
require "aws-sdk-sqs"

require "ecs_deploy"
6
+
7
module EcsDeploy
  module AutoScaler
    # Polls an SQS queue for spot instance interruption warnings, sets the affected container
    # instances to DRAINING and detaches them from their Auto Scaling groups.
    class InstanceDrainer
      # @param auto_scaling_group_configs [Array, nil] configs responding to #name, #region,
      #   #cluster and #detach_instances; nil is treated as an empty list
      # @param spot_fleet_request_configs [Array, nil] configs responding to #id, #region and
      #   #cluster; nil is treated as an empty list
      # @param logger [Logger]
      def initialize(auto_scaling_group_configs:, spot_fleet_request_configs:, logger:)
        @auto_scaling_group_configs = auto_scaling_group_configs || []
        @spot_fleet_request_configs = spot_fleet_request_configs || []
        @logger = logger
        @stop = false
      end

      # Polls `queue_url` until #stop is called. Errors raised while handling a batch are
      # reported to AutoScaler.error_logger and polling continues.
      #
      # @param queue_url [String] SQS queue URL; the region is taken from its host name
      def poll_spot_instance_interruption_warnings(queue_url)
        @logger.debug "Start polling spot instance interruption warnings of #{queue_url}"

        # cf. https://docs.aws.amazon.com/general/latest/gr/rande.html#sqs_region
        region = URI.parse(queue_url).host.split(".")[1]

        poller = Aws::SQS::QueuePoller.new(queue_url, client: sqs_client(region))
        poller.before_request do |_stats|
          throw :stop_polling if @stop
        end

        until @stop
          begin
            poller.poll(max_number_of_messages: 10, visibility_timeout: 15) do |messages, _|
              instance_ids = extract_instance_ids(messages)
              # Nothing usable in this batch. describe_instances must never be called with an
              # empty ID list, because that would describe every instance in the region.
              next if instance_ids.empty?

              config_to_instance_ids = build_config_to_instance_ids(instance_ids, region)
              set_instance_state_to_draining(config_to_instance_ids, region)
              # Detach the instances to launch other instances
              detach_instances_from_auto_scaling_groups(config_to_instance_ids, region)
            end
          rescue => e
            AutoScaler.error_logger.error(e)
          end
        end

        @logger.debug "Stop polling spot instance interruption warnings of #{queue_url}"
      end

      # Asks the polling loop to finish; it is checked before each request and after each batch.
      def stop
        @stop = true
      end

      private

      # Collects the distinct EC2 instance IDs found at detail -> instance-id of each message body.
      # Messages that are not valid JSON or carry no instance ID are skipped, so a single bad
      # message cannot make the whole batch fail again and again.
      #
      # @param messages [Array] objects responding to #body
      # @return [Array<String>]
      def extract_instance_ids(messages)
        messages.map { |msg| instance_id_from(msg) }.compact.uniq
      end

      # @return [String, nil] the instance ID of one message, or nil when it has none
      def instance_id_from(msg)
        payload = JSON.parse(msg.body)
        detail = payload["detail"] if payload.is_a?(Hash)
        detail["instance-id"] if detail.is_a?(Hash)
      rescue JSON::ParserError => e
        AutoScaler.error_logger.warn("Ignore a message whose body is not valid JSON: #{e.message}")
        nil
      end

      # Maps each known config to the interrupted instances that belong to it, based on the
      # instances' "aws:ec2spot:fleet-request-id" or "aws:autoscaling:groupName" tag.
      # Instances that match no config are left out.
      #
      # @return [Hash] config => Array of instance IDs (missing keys default to a new empty Array)
      def build_config_to_instance_ids(instance_ids, region)
        config_to_instance_ids = Hash.new { |h, k| h[k] = [] }
        ec2_client(region).describe_instances(instance_ids: instance_ids).each do |resp|
          resp.reservations.each do |reservation|
            reservation.instances.each do |i|
              sfr_id = i.tags.find { |t| t.key == "aws:ec2spot:fleet-request-id" }&.value
              if sfr_id
                config = @spot_fleet_request_configs.find { |c| c.id == sfr_id && c.region == region }
                config_to_instance_ids[config] << i.instance_id if config
                next
              end

              asg_name = i.tags.find { |t| t.key == "aws:autoscaling:groupName" }&.value
              if asg_name
                config = @auto_scaling_group_configs.find { |c| c.name == asg_name && c.region == region }
                config_to_instance_ids[config] << i.instance_id if config
              end
            end
          end
        end

        config_to_instance_ids
      end

      # Sets the container instances behind the given EC2 instances to DRAINING, one cluster
      # (config) at a time. Warns when not every instance ID resolves to a container instance.
      def set_instance_state_to_draining(config_to_instance_ids, region)
        cl = ecs_client(region)
        config_to_instance_ids.each do |config, instance_ids|
          arns = cl.list_container_instances(
            cluster: config.cluster,
            filter: "ec2InstanceId in [#{instance_ids.join(",")}]",
          ).container_instance_arns

          if instance_ids.size != arns.size
            AutoScaler.error_logger.warn("The number of ARNs differs from the number of instance IDs: instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}")
          end
          next if arns.empty?

          cl.update_container_instances_state(
            cluster: config.cluster,
            container_instances: arns,
            status: "DRAINING",
          )
          @logger.info "Draining instances: region: #{region}, cluster: #{config.cluster}, instance_ids: #{instance_ids.inspect}, container_instance_arns: #{arns.inspect}"
        end
      end

      # Detaches the interrupted instances from their Auto Scaling groups without decrementing
      # the desired capacity. `region` is accepted for symmetry with the other steps and is not used.
      def detach_instances_from_auto_scaling_groups(config_to_instance_ids, region)
        @auto_scaling_group_configs.each do |config|
          # key? avoids the hash's default block, which would insert an empty entry per config.
          next unless config_to_instance_ids.key?(config)

          config.detach_instances(instance_ids: config_to_instance_ids[config], should_decrement_desired_capacity: false)
        end
      end

      # Builds a new EC2 client for `region` on every call.
      def ec2_client(region)
        Aws::EC2::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end

      # Builds a new ECS client for `region` on every call.
      def ecs_client(region)
        Aws::ECS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end

      # Builds a new SQS client for `region` on every call.
      def sqs_client(region)
        Aws::SQS::Client.new(
          access_key_id: EcsDeploy.config.access_key_id,
          secret_access_key: EcsDeploy.config.secret_access_key,
          region: region,
          logger: @logger,
        )
      end
    end
  end
end