ecs_deploy 0.3.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,222 @@
1
+ require "aws-sdk-ecs"
2
+ require "ecs_deploy"
3
+ require "ecs_deploy/auto_scaler/config_base"
4
+ require "ecs_deploy/auto_scaler/trigger_config"
5
+
6
+ module EcsDeploy
7
+ module AutoScaler
8
+ SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
9
+ ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
10
+ include ConfigBase
11
+
12
+ MAX_DESCRIBABLE_TASK_COUNT = 100
13
+
14
# Builds a service configuration from an attributes hash.
#
# After delegating to the parent initializer this applies defaults
# (idle_time 60, required_capacity 1), normalizes max_task_count into a
# sorted array, wraps every trigger definition in a TriggerConfig that
# inherits this service's region and step (the definition's own keys win),
# and reads the current desired count through fetch_service.
#
# attributes - Hash of settings forwarded to `super`
#              (presumably ConfigBase maps keys onto struct members - confirm).
# logger     - logger kept in @logger and handed to every TriggerConfig.
def initialize(attributes = {}, logger)
  super
  self.idle_time ||= 60
  self.max_task_count = Array(max_task_count)

  inherited_settings = {"region" => region, "step" => step}
  build_triggers = lambda do |definitions|
    definitions.to_a.map do |definition|
      TriggerConfig.new(inherited_settings.merge(definition), logger)
    end
  end
  self.upscale_triggers = build_triggers.call(upscale_triggers)
  self.downscale_triggers = build_triggers.call(downscale_triggers)

  max_task_count.sort!
  self.desired_count = fetch_service.desired_count
  self.required_capacity ||= 1

  @reach_max_at = nil
  @last_updated_at = nil
  @logger = logger
end
31
+
32
# Evaluates the scaling triggers and, when a change is needed, calls
# update_service with the signed difference to apply.
#
# Nothing happens while the service is idling. Otherwise:
# * the largest step among matching upscale triggers is used; triggers whose
#   step is not larger than the step already chosen are not queried;
# * downscale triggers are only consulted when no upscale fired and the
#   service is above its current minimum; the largest matching step wins;
# * the result is raised to the current minimum task count if it would fall
#   below it, and pulled down to the largest max_task_count when the service
#   already exceeds it.
#
# cluster_resource_manager - passed through unchanged to update_service.
def adjust_desired_count(cluster_resource_manager)
  if idle?
    @logger.debug "#{name} is idling"
    return
  end

  delta = 0
  upscale_triggers.each do |up|
    if delta < up.step && up.match?
      @logger.info "#{log_prefix} Fire upscale trigger by #{up.alarm_name} #{up.state}"
      delta = up.step
    end
  end

  if delta == 0 && desired_count > current_min_task_count
    downscale_triggers.each do |down|
      if down.match?
        @logger.info "#{log_prefix} Fire downscale trigger by #{down.alarm_name} #{down.state}"
        delta = [delta, -down.step].min
      end
    end
  end

  # Never go below the (possibly scheduled) minimum.
  delta = current_min_task_count - desired_count if current_min_task_count > desired_count + delta

  # Already above the highest allowed maximum: shrink back to it.
  delta = max_task_count.max - desired_count if delta >= 0 && desired_count > max_task_count.max

  update_service(delta, cluster_resource_manager) unless delta == 0
end
69
+
70
# Blocks until the background thread started by increase_desired_count (if
# any) has finished. An exception raised while joining is logged as a warning
# instead of propagating, and the thread reference is always cleared.
def wait_until_desired_count_updated
  begin
    worker = @increase_desired_count_thread
    worker.join if worker
  rescue => error
    AutoScaler.error_logger.warn("`#{__method__}': #{error} (#{error.class})")
  ensure
    @increase_desired_count_thread = nil
  end
end
77
+
78
+ private
79
+
80
# Returns a new Aws::ECS::Client for this service's region, using the
# credentials from EcsDeploy.config. A fresh client is built on every call.
def client
  credentials = EcsDeploy.config
  Aws::ECS::Client.new(
    region: region,
    logger: logger,
    access_key_id: credentials.access_key_id,
    secret_access_key: credentials.secret_access_key
  )
end
88
+
89
# True while fewer than idle_time seconds (monotonic clock) have passed since
# the last recorded update; false when no update has been recorded yet.
def idle?
  if @last_updated_at
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
    (now - @last_updated_at) < idle_time
  else
    false
  end
end
95
+
96
# Returns the minimum task count that applies right now.
#
# When scheduled_min_task_count is nil or empty this is min_task_count.
# Otherwise each schedule entry is a hash with "from", "to" (strings parsed
# with Time.parse) and "count"; the "count" of the first entry whose
# from..to range covers the current time is returned, falling back to
# min_task_count when no entry matches.
def current_min_task_count
  schedules = scheduled_min_task_count
  return min_task_count if schedules.nil? || schedules.empty?

  # Take the clock once so every window is compared against the same instant.
  now = Time.now
  active = schedules.find do |schedule|
    (Time.parse(schedule["from"])..Time.parse(schedule["to"])).cover?(now)
  end

  active ? active["count"] : min_task_count
end
105
+
106
# True when the service has been recorded as being at its current maximum
# (@reach_max_at is set) for more than cooldown_time_for_reach_max seconds
# on the monotonic clock.
def overheat?
  reached_at = @reach_max_at
  return false unless reached_at

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - reached_at
  elapsed > cooldown_time_for_reach_max
end
110
+
111
# Looks up this service in its cluster and returns the first description
# from the describe_services response.
#
# Raises RuntimeError when the response contains no services. Any failure is
# reported to AutoScaler.error_logger and then re-raised: previously the
# rescue returned the logger's return value, so the caller's
# `fetch_service.desired_count` failed with an unrelated NoMethodError that
# hid the real cause.
def fetch_service
  res = client.describe_services(cluster: cluster, services: [name])
  raise "Service \"#{name}\" is not found" if res.services.empty?
  res.services[0]
rescue => e
  AutoScaler.error_logger.error(e)
  raise
end
118
+
119
# Applies a signed change to the desired count, honouring the stepped
# maximums in max_task_count.
#
# max_task_count is a sorted list of ceilings ("levels"). A change that would
# move the service to a higher level is only allowed once the service is
# overheated (see overheat?); until then the count is capped at the current
# level's ceiling and the time the ceiling was first hit is kept in
# @reach_max_at. Moving down, or staying below the ceiling, clears that state.
# The capped target is then handed to increase_desired_count or
# decrease_desired_count (the latter also when the capped target equals the
# current count), and @last_updated_at is refreshed.
#
# Any error is reported to AutoScaler.error_logger and not re-raised.
#
# difference               - signed number of tasks to add or remove.
# cluster_resource_manager - forwarded to increase/decrease_desired_count.
def update_service(difference, cluster_resource_manager)
  next_desired_count = desired_count + difference
  current_level = max_task_level(desired_count)
  next_level = max_task_level(next_desired_count)

  wait_for_cooldown = lambda do
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
    @reach_max_at ||= now
    @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
  end
  clear_cooldown = lambda do
    @reach_max_at = nil
    @logger.info "#{log_prefix} Service clears cooldown state"
  end

  level =
    if current_level < next_level
      # overheat? reads the clock, so it is evaluated exactly once here. Asking
      # twice could answer "no" then "yes" across the cooldown boundary, which
      # left no branch selected and made the lookup below fail on a nil level.
      if overheat? # next max
        @reach_max_at = nil
        @logger.info "#{log_prefix} Service is overheat, uses next max count"
        next_level
      else # wait cooldown
        wait_for_cooldown.call
        current_level
      end
    elsif current_level > next_level
      clear_cooldown.call
      next_level
    elsif next_desired_count >= max_task_count[current_level] # reach current max
      wait_for_cooldown.call
      current_level
    else
      clear_cooldown.call
      current_level
    end

  next_desired_count = [next_desired_count, max_task_count[level]].min
  if next_desired_count > desired_count
    increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
  else
    decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
  end

  @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
  @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
rescue => e
  AutoScaler.error_logger.error(e)
end
159
+
160
# Raises the service's desired count by `by`, one task at a time, on a
# background thread stored in @increase_desired_count_thread.
#
# desired_count is bumped immediately. The thread then, for each task,
# acquires required_capacity from the cluster resource manager (within an
# overall budget of 180 seconds measured on the monotonic clock) and calls
# update_service on ECS. It stops early when the budget is used up, when
# capacity cannot be acquired, or when the ECS call fails (the capacity just
# acquired is released and the error logged). If fewer tasks were applied
# than planned, desired_count is set back to what was actually applied.
#
# Returns the started Thread.
def increase_desired_count(by, cluster_resource_manager)
  confirmed_count = desired_count
  self.desired_count += by

  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
  @increase_desired_count_thread = Thread.new do
    ecs = client
    by.times do
      remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
      break if remaining <= 0
      break unless cluster_resource_manager.acquire(required_capacity, timeout: remaining)

      begin
        ecs.update_service(cluster: cluster, service: name, desired_count: confirmed_count + 1)
        confirmed_count += 1
      rescue => error
        # Give back the capacity reserved for the task that was not started.
        cluster_resource_manager.release(required_capacity)
        AutoScaler.error_logger.error(error)
        break
      end
    end

    unless confirmed_count == desired_count
      self.desired_count = confirmed_count
      @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
    end
  end
end
187
+
188
# Lowers the service's desired count by `by` and waits for the removed tasks
# to be gone before giving their capacity back.
#
# Steps: remember the RUNNING task ARNs, update the service, wait for the
# services_stable waiter, work out which of the remembered tasks are no
# longer RUNNING, wait (in batches of MAX_DESCRIBABLE_TASK_COUNT) for the
# tasks_stopped waiter on them, then release required_capacity * by on the
# cluster resource manager and decrement desired_count.
def decrease_desired_count(by, cluster_resource_manager)
  ecs = client
  list_running_arns = lambda do
    ecs.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
  end

  arns_before = list_running_arns.call

  ecs.update_service(cluster: cluster, service: name, desired_count: desired_count - by)

  ecs.wait_until(:services_stable, cluster: cluster, services: [name]) do |waiter|
    waiter.before_wait { @logger.debug "#{log_prefix} wait service stable" }
  end

  arns_stopping = arns_before - list_running_arns.call
  arns_stopping.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |batch|
    ecs.wait_until(:tasks_stopped, cluster: cluster, tasks: batch) do |waiter|
      waiter.before_wait { @logger.debug "#{log_prefix} wait stopping tasks stopped" }
    end
  end

  cluster_resource_manager.release(required_capacity * by)
  self.desired_count -= by
end
212
+
213
# Index into max_task_count of the first ceiling that can hold `count`
# tasks, or the index of the last ceiling when `count` exceeds them all.
def max_task_level(count)
  fitting_level = max_task_count.find_index { |ceiling| ceiling >= count }
  fitting_level || (max_task_count.length - 1)
end
216
+
217
# Prefix for log lines: the class name without its EcsDeploy::AutoScaler
# namespace, followed by the service name and region, in square brackets.
def log_prefix
  short_class_name = self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")
  "[#{short_class_name} #{name} #{region}]"
end
220
+ end
221
+ end
222
+ end
@@ -0,0 +1,102 @@
1
+ require "json"
2
+ require "timeout"
3
+
4
+ require "aws-sdk-ec2"
5
+ require "ecs_deploy"
6
+ require "ecs_deploy/auto_scaler/config_base"
7
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
8
+
9
+ module EcsDeploy
10
+ module AutoScaler
11
+ SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
12
+ include ConfigBase
13
+
14
# Builds a spot fleet request configuration.
#
# The "services" entry is taken out of a copy of the attributes (the
# caller's hash is left untouched) before the rest is handed to the parent
# initializer; each service definition then becomes a ServiceConfig that
# uses this fleet's cluster and region.
#
# attributes - Hash of settings including "services", a list of service hashes.
# logger     - logger forwarded to `super` and to every ServiceConfig.
def initialize(attributes = {}, logger)
  own_attributes = attributes.dup
  service_definitions = own_attributes.delete("services")
  super(own_attributes, logger)

  self.service_configs = service_definitions.map do |definition|
    overrides = {"cluster" => cluster, "region" => region}
    ServiceConfig.new(definition.merge(overrides), logger)
  end
end
22
+
23
# A spot fleet request has no separate name; its id doubles as the name
# (used, for example, by log_prefix).
def name
  id
end
26
+
27
# Adjusts the spot fleet's target capacity to required_capacity plus buffer
# (rounded up).
#
# Orphan instances are terminated first. When the fleet's current target
# capacity already equals the desired one nothing else happens; otherwise the
# spot fleet request is modified and the cluster resource manager is told
# about the change. Any error is reported to AutoScaler.error_logger and not
# re-raised.
#
# required_capacity - capacity needed by the services, before the buffer.
def update_desired_capacity(required_capacity)
  terminate_orphan_instances

  desired_capacity = (required_capacity + buffer.to_f).ceil

  # ec2_client builds a new client on every call; share one for this run.
  ec2 = ec2_client
  request_config = ec2.describe_spot_fleet_requests(
    spot_fleet_request_ids: [id]
  ).spot_fleet_request_configs[0].spot_fleet_request_config
  current_capacity = request_config.target_capacity

  return if desired_capacity == current_capacity

  ec2.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity)

  cluster_resource_manager.trigger_capacity_update(
    current_capacity,
    desired_capacity,
    # Wait until the capacity is updated to prevent the process from terminating before container draining is completed
    wait_until_capacity_updated: desired_capacity < current_capacity,
  )
  @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
rescue => e
  AutoScaler.error_logger.error(e)
end
50
+
51
# Lazily builds and caches the ClusterResourceManager for this fleet's
# cluster; capacity is counted in "vCPUs".
def cluster_resource_manager
  return @cluster_resource_manager if @cluster_resource_manager

  @cluster_resource_manager = EcsDeploy::AutoScaler::ClusterResourceManager.new(
    region: region,
    cluster: cluster,
    service_configs: service_configs,
    capacity_based_on: "vCPUs",
    logger: @logger,
  )
end
60
+
61
+ private
62
+
63
# Terminates spot fleet instances that are not registered as container
# instances of the cluster ("orphans").
#
# Only orphans that are in the "running" state and were launched more than
# 600 seconds ago are terminated. Any error is reported to
# AutoScaler.error_logger and not re-raised.
def terminate_orphan_instances
  container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)

  # ec2_client builds a new client on every call; share one for this run.
  ec2 = ec2_client
  spot_fleet_instances = ec2.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances
  orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)

  return if orphans.empty?

  running_instances = ec2.describe_instances(
    instance_ids: orphans,
    filters: [{ name: "instance-state-name", values: ["running"] }],
  ).reservations.flat_map(&:instances)
  # Instances which have just launched might not be registered to the cluster yet.
  instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id)

  return if instance_ids.empty?

  # Terminate orphans without canceling the spot instance request
  # because we can't terminate canceled spot instances by decreasing the capacity
  ec2.terminate_instances(instance_ids: instance_ids)

  @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
rescue => e
  AutoScaler.error_logger.error(e)
end
87
+
88
# Returns a new Aws::EC2::Client for this fleet's region, using the
# credentials from EcsDeploy.config. A fresh client is built on every call.
def ec2_client
  credentials = EcsDeploy.config
  Aws::EC2::Client.new(
    region: region,
    logger: logger,
    access_key_id: credentials.access_key_id,
    secret_access_key: credentials.secret_access_key,
  )
end
96
+
97
# Prefix for log lines: the class name without its EcsDeploy::AutoScaler
# namespace, followed by the fleet name and region, in square brackets.
def log_prefix
  label = self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")
  format("[%s %s %s]", label, name, region)
end
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,36 @@
1
+ require "aws-sdk-cloudwatch"
2
+ require "ecs_deploy"
3
+ require "ecs_deploy/auto_scaler"
4
+ require "ecs_deploy/auto_scaler/config_base"
5
+
6
+ module EcsDeploy
7
+ module AutoScaler
8
+ TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
9
+ include ConfigBase
10
+
11
# Returns a new Aws::CloudWatch::Client for this trigger's region, using the
# credentials from EcsDeploy.config. A fresh client is built on every call.
def client
  credentials = EcsDeploy.config
  Aws::CloudWatch::Client.new(
    region: region,
    logger: logger,
    access_key_id: credentials.access_key_id,
    secret_access_key: credentials.secret_access_key
  )
end
19
+
20
# True when the alarm's current state value equals the state this trigger
# is configured to react to.
def match?
  current_state = fetch_alarm.state_value
  current_state == state
end
23
+
24
# Fetches this trigger's CloudWatch alarm by name and returns the first
# metric alarm of the response, logging its state at debug level.
#
# NOTE(review): any failure (including "not found") is reported to
# AutoScaler.error_logger and the logger call's return value is returned
# instead of an alarm, so callers such as match? cannot rely on the result
# responding to state_value in that case.
def fetch_alarm
  begin
    alarms = client.describe_alarms(alarm_names: [alarm_name]).metric_alarms

    raise "Alarm \"#{alarm_name}\" is not found" if alarms.empty?

    found = alarms[0]
    AutoScaler.logger.debug("#{found.alarm_name} state is #{found.state_value}")
    found
  rescue => error
    AutoScaler.error_logger.error(error)
  end
end
34
+ end
35
+ end
36
+ end
@@ -1,4 +1,5 @@
1
1
  require 'ecs_deploy'
2
+ require 'ecs_deploy/instance_fluctuation_manager'
2
3
 
3
4
  namespace :ecs do
4
5
  task :configure do
@@ -7,6 +8,8 @@ namespace :ecs do
7
8
  c.deploy_wait_timeout = fetch(:ecs_deploy_wait_timeout) if fetch(:ecs_deploy_wait_timeout)
8
9
  c.ecs_service_role = fetch(:ecs_service_role) if fetch(:ecs_service_role)
9
10
  c.default_region = Array(fetch(:ecs_region))[0] if fetch(:ecs_region)
11
+ c.ecs_wait_until_services_stable_max_attempts = fetch(:ecs_wait_until_services_stable_max_attempts) if fetch(:ecs_wait_until_services_stable_max_attempts)
12
+ c.ecs_wait_until_services_stable_delay = fetch(:ecs_wait_until_services_stable_delay) if fetch(:ecs_wait_until_services_stable_delay)
10
13
  end
11
14
 
12
15
  if ENV["TARGET_CLUSTER"]
@@ -20,7 +23,7 @@ namespace :ecs do
20
23
  task register_task_definition: [:configure] do
21
24
  if fetch(:ecs_tasks)
22
25
  regions = Array(fetch(:ecs_region))
23
- regions = [EcsDeploy.config.default_region || ENV["AWS_DEFAULT_REGION"]] if regions.empty?
26
+ regions = [EcsDeploy.config.default_region] if regions.empty?
24
27
  ecs_registered_tasks = {}
25
28
  regions.each do |region|
26
29
  ecs_registered_tasks[region] = {}
@@ -30,9 +33,14 @@ namespace :ecs do
30
33
  task_definition_name: t[:name],
31
34
  container_definitions: t[:container_definitions],
32
35
  task_role_arn: t[:task_role_arn],
36
+ execution_role_arn: t[:execution_role_arn],
33
37
  volumes: t[:volumes],
34
38
  network_mode: t[:network_mode],
35
39
  placement_constraints: t[:placement_constraints],
40
+ requires_compatibilities: t[:requires_compatibilities],
41
+ cpu: t[:cpu],
42
+ memory: t[:memory],
43
+ tags: t[:tags],
36
44
  )
37
45
  result = task_definition.register
38
46
  ecs_registered_tasks[region][t[:name]] = result
@@ -58,6 +66,10 @@ namespace :ecs do
58
66
  description: t[:description],
59
67
  target_id: t[:target_id],
60
68
  task_definition_name: t[:task_definition_name],
69
+ network_configuration: t[:network_configuration],
70
+ launch_type: t[:launch_type],
71
+ platform_version: t[:platform_version],
72
+ group: t[:group],
61
73
  revision: t[:revision],
62
74
  task_count: t[:task_count],
63
75
  role_arn: t[:role_arn],
@@ -89,8 +101,18 @@ namespace :ecs do
89
101
  task_definition_name: service[:task_definition_name],
90
102
  load_balancers: service[:load_balancers],
91
103
  desired_count: service[:desired_count],
104
+ launch_type: service[:launch_type],
105
+ network_configuration: service[:network_configuration],
106
+ health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
107
+ delete: service[:delete],
108
+ enable_ecs_managed_tags: service[:enable_ecs_managed_tags],
109
+ tags: service[:tags],
110
+ propagate_tags: service[:propagate_tags],
92
111
  }
93
112
  service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
113
+ service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
114
+ service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
115
+ service_options[:scheduling_strategy] = service[:scheduling_strategy] if service[:scheduling_strategy]
94
116
  s = EcsDeploy::Service.new(service_options)
95
117
  s.deploy
96
118
  s
@@ -149,8 +171,13 @@ namespace :ecs do
149
171
  task_definition_name: rollback_arn,
150
172
  load_balancers: service[:load_balancers],
151
173
  desired_count: service[:desired_count],
174
+ launch_type: service[:launch_type],
175
+ network_configuration: service[:network_configuration],
176
+ health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
152
177
  }
153
178
  service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
179
+ service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
180
+ service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
154
181
  s = EcsDeploy::Service.new(service_options)
155
182
  s.deploy
156
183
  EcsDeploy::TaskDefinition.deregister(current_task_definition_arn, region: r)
@@ -160,4 +187,46 @@ namespace :ecs do
160
187
  end
161
188
  end
162
189
  end
190
+
191
# Runs InstanceFluctuationManager#increase for every entry of
# :ecs_instance_fluctuation_manager_configs, once per region taken from
# :ecs_region (or the configured default region when none is set).
# A config may override region, cluster and logger; cluster otherwise comes
# from :ecs_default_cluster and logger from EcsDeploy.logger.
task increase_instances_to_max_size: [:configure] do
  manager_configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
  next if manager_configs.empty?

  target_regions = Array(fetch(:ecs_region))
  target_regions = [EcsDeploy.config.default_region] if target_regions.empty?

  target_regions.each do |region|
    manager_configs.each do |config|
      manager = EcsDeploy::InstanceFluctuationManager.new(
        region: config[:region] || region,
        cluster: config[:cluster] || fetch(:ecs_default_cluster),
        auto_scaling_group_name: config[:auto_scaling_group_name],
        desired_capacity: config[:desired_capacity],
        logger: config.fetch(:logger, EcsDeploy.logger)
      )
      manager.increase
    end
  end
end
211
+
212
# Runs InstanceFluctuationManager#decrease for every entry of
# :ecs_instance_fluctuation_manager_configs, once per region taken from
# :ecs_region (or the configured default region when none is set).
# A config may override region, cluster and logger; cluster otherwise comes
# from :ecs_default_cluster and logger from EcsDeploy.logger.
task terminate_redundant_instances: [:configure] do
  manager_configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
  next if manager_configs.empty?

  target_regions = Array(fetch(:ecs_region))
  target_regions = [EcsDeploy.config.default_region] if target_regions.empty?

  target_regions.each do |region|
    manager_configs.each do |config|
      manager = EcsDeploy::InstanceFluctuationManager.new(
        region: config[:region] || region,
        cluster: config[:cluster] || fetch(:ecs_default_cluster),
        auto_scaling_group_name: config[:auto_scaling_group_name],
        desired_capacity: config[:desired_capacity],
        logger: config.fetch(:logger, EcsDeploy.logger)
      )
      manager.decrease
    end
  end
end
163
232
  end