ecs_deploy 0.3.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ require "aws-sdk-ecs"
2
+ require "ecs_deploy"
3
+ require "ecs_deploy/auto_scaler/config_base"
4
+ require "ecs_deploy/auto_scaler/trigger_config"
5
+
6
+ module EcsDeploy
7
+ module AutoScaler
8
+ SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
9
+ ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
10
+ include ConfigBase
11
+
12
+ MAX_DESCRIBABLE_TASK_COUNT = 100
13
+
14
# Builds the auto-scaling configuration of a single ECS service.
#
# Delegates to +super+ first (presumably ConfigBase#initialize, which is not
# visible here -- confirm), then fills in defaults, normalizes
# +max_task_count+ to a sorted array, wraps every trigger definition in a
# TriggerConfig and reads the service's current desired count through
# +fetch_service+.
#
# @param attributes [Hash] configuration values keyed by attribute name
# @param logger [Object] logger handed to the triggers and stored in @logger
def initialize(attributes = {}, logger)
  super

  self.idle_time ||= 60
  self.max_task_count = Array(max_task_count)

  # Triggers inherit the service's region and step unless they override them.
  build_trigger = lambda do |definition|
    TriggerConfig.new({"region" => region, "step" => step}.merge(definition), logger)
  end
  self.upscale_triggers = upscale_triggers.to_a.map(&build_trigger)
  self.downscale_triggers = downscale_triggers.to_a.map(&build_trigger)

  max_task_count.sort!
  self.desired_count = fetch_service.desired_count
  self.required_capacity ||= 1

  @reach_max_at = nil
  @last_updated_at = nil
  @logger = logger
end
31
+
32
# Decides how far the desired count should move and, when the change is
# non-zero, hands it to +update_service+.
#
# Steps, in order:
# 1. Do nothing while the service is still idling.
# 2. Take the largest step among the upscale triggers that match.
# 3. If nothing fired and the count is above the current minimum, take the
#    largest step among the matching downscale triggers (as a negative value).
# 4. Never go below the current minimum task count.
# 5. Pull the count back when it already exceeds the largest configured maximum.
#
# @param cluster_resource_manager [Object] passed through to +update_service+
# @return [Object, nil] the result of +update_service+, or nil when nothing changes
def adjust_desired_count(cluster_resource_manager)
  if idle?
    @logger.debug "#{name} is idling"
    return
  end

  delta = upscale_triggers.reduce(0) do |largest, trigger|
    # The alarm is only queried when this trigger could raise the step.
    if largest < trigger.step && trigger.match?
      @logger.info "#{log_prefix} Fire upscale trigger by #{trigger.alarm_name} #{trigger.state}"
      trigger.step
    else
      largest
    end
  end

  if delta.zero? && desired_count > current_min_task_count
    downscale_triggers.each do |trigger|
      if trigger.match?
        @logger.info "#{log_prefix} Fire downscale trigger by #{trigger.alarm_name} #{trigger.state}"
        candidate = -trigger.step
        delta = candidate if candidate < delta
      end
    end
  end

  # Clamp to the (possibly scheduled) minimum.
  delta = current_min_task_count - desired_count if current_min_task_count > desired_count + delta

  # Shrink back when the service is already over the highest maximum.
  delta = max_task_count.max - desired_count if delta >= 0 && desired_count > max_task_count.max

  update_service(delta, cluster_resource_manager) unless delta.zero?
end
69
+
70
# Blocks until the background thread stored in @increase_desired_count_thread
# (if any) has finished. An exception raised while joining is reported as a
# warning instead of propagating, and the thread reference is always cleared.
def wait_until_desired_count_updated
  begin
    worker = @increase_desired_count_thread
    worker.join unless worker.nil?
  rescue => e
    AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})")
  ensure
    @increase_desired_count_thread = nil
  end
end
77
+
78
+ private
79
+
80
# Creates a fresh ECS API client for this service's region, using the
# credentials held by EcsDeploy.config. Nothing is cached: every call
# returns a new client.
def client
  settings = EcsDeploy.config
  options = {
    access_key_id: settings.access_key_id,
    secret_access_key: settings.secret_access_key,
    region: region,
    logger: logger
  }
  Aws::ECS::Client.new(options)
end
88
+
89
# True while fewer than +idle_time+ seconds (monotonic clock) have passed
# since @last_updated_at was recorded; false when no update happened yet.
def idle?
  last_update = @last_updated_at
  if last_update
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - last_update
    elapsed < idle_time
  else
    false
  end
end
95
+
96
# Returns the minimum task count that applies right now.
#
# When +scheduled_min_task_count+ has entries, the first one whose
# "from".."to" window (parsed with Time.parse) contains the current time
# supplies its "count"; otherwise +min_task_count+ is used.
def current_min_task_count
  schedules = scheduled_min_task_count
  return min_task_count if schedules.nil? || schedules.empty?

  active = schedules.find do |schedule|
    window_start = Time.parse(schedule["from"])
    window_end = Time.parse(schedule["to"])
    now = Time.now
    window_start <= now && now <= window_end
  end

  active ? active["count"] : min_task_count
end
105
+
106
# True once more than +cooldown_time_for_reach_max+ seconds (monotonic clock)
# have passed since @reach_max_at was recorded; false when it is unset.
def overheat?
  reached_at = @reach_max_at
  return false unless reached_at

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - reached_at
  elapsed > cooldown_time_for_reach_max
end
110
+
111
# Looks up this service in its cluster through the ECS API.
#
# Failures are reported to AutoScaler.error_logger and then re-raised.
# Previously the rescue clause returned the logger's result, so the caller's
# `fetch_service.desired_count` failed with an unrelated NoMethodError that
# hid the real cause.
#
# @return [Object] the first service description in the response
# @raise [RuntimeError] when the response contains no service
# @raise [StandardError] any error raised by the API call
def fetch_service
  res = client.describe_services(cluster: cluster, services: [name])
  raise "Service \"#{name}\" is not found" if res.services.empty?

  res.services[0]
rescue => e
  AutoScaler.error_logger.error(e)
  raise
end
118
+
119
# Applies +difference+ to the desired count while honouring the stepped
# maximums in +max_task_count+ and the cooldown required before moving to
# the next maximum.
#
# The "level" is the index into +max_task_count+ that caps the count:
# * moving to a higher level is only allowed once +overheat?+ is true;
#   until then the count is capped at the current level's maximum and the
#   time the cap was first hit is remembered in @reach_max_at;
# * staying on a level while hitting its maximum also starts/continues the
#   cooldown; staying below it, or dropping to a lower level, clears it.
#
# +overheat?+ is time dependent, so it is evaluated exactly once. The earlier
# `overheat?` / `!overheat?` pair could disagree when the cooldown elapsed
# between the two calls, leaving no branch taken and +level+ nil.
#
# Errors are reported to AutoScaler.error_logger and not re-raised.
#
# @param difference [Integer] signed change requested for the desired count
# @param cluster_resource_manager [Object] forwarded to the increase/decrease helpers
def update_service(difference, cluster_resource_manager)
  next_desired_count = desired_count + difference
  current_level = max_task_level(desired_count)
  next_level = max_task_level(next_desired_count)

  level, cooldown =
    if current_level < next_level
      overheat? ? [next_level, :promote] : [current_level, :wait]
    elsif current_level > next_level
      [next_level, :clear]
    elsif next_desired_count >= max_task_count[current_level]
      [current_level, :wait]
    else
      [current_level, :clear]
    end

  case cooldown
  when :promote
    @reach_max_at = nil
    @logger.info "#{log_prefix} Service is overheat, uses next max count"
  when :wait
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
    @reach_max_at ||= now
    @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
  when :clear
    @reach_max_at = nil
    @logger.info "#{log_prefix} Service clears cooldown state"
  end

  next_desired_count = [next_desired_count, max_task_count[level]].min
  if next_desired_count > desired_count
    increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
  else
    decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
  end

  @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
  @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
rescue => e
  AutoScaler.error_logger.error(e)
end
159
+
160
# Raises the service's desired count by +by+, one task at a time, in a
# background thread stored in @increase_desired_count_thread.
#
# +desired_count+ is bumped optimistically up front. The thread then, for
# each task, acquires +required_capacity+ from the resource manager (within
# an overall 180 second budget) and updates the ECS service. It stops at the
# first timeout, failed acquisition or API error, and finally rolls
# +desired_count+ back to the count that was really applied.
#
# @param by [Integer] number of tasks to add
# @param cluster_resource_manager [Object] must respond to +acquire+ and +release+
# @return [Thread] the background thread
def increase_desired_count(by, cluster_resource_manager)
  confirmed_count = desired_count
  self.desired_count += by

  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
  @increase_desired_count_thread = Thread.new do
    ecs = client
    by.times do
      remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
      break if remaining <= 0
      break unless cluster_resource_manager.acquire(required_capacity, timeout: remaining)

      applied = begin
        ecs.update_service(cluster: cluster, service: name, desired_count: confirmed_count + 1)
        true
      rescue => e
        # Give back the capacity reserved for the task that was not started.
        cluster_resource_manager.release(required_capacity)
        AutoScaler.error_logger.error(e)
        false
      end
      break unless applied

      confirmed_count += 1
    end

    unless confirmed_count == desired_count
      self.desired_count = confirmed_count
      @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
    end
  end
end
187
+
188
# Lowers the service's desired count by +by+ and waits for the removed tasks
# to be gone before releasing their capacity.
#
# The running task ARNs are listed before and after the update so that the
# tasks which disappeared can be waited on (in batches of
# MAX_DESCRIBABLE_TASK_COUNT) until they are stopped.
#
# @param by [Integer] number of tasks to remove
# @param cluster_resource_manager [Object] must respond to +release+
# @return [Integer] the new desired count
def decrease_desired_count(by, cluster_resource_manager)
  ecs = client
  list_running_arns = lambda do
    ecs.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
  end

  arns_before = list_running_arns.call

  ecs.update_service(cluster: cluster, service: name, desired_count: desired_count - by)

  ecs.wait_until(:services_stable, cluster: cluster, services: [name]) do |waiter|
    waiter.before_wait { @logger.debug "#{log_prefix} wait service stable" }
  end

  # Tasks that were running before the update but are no longer listed.
  removed_arns = arns_before - list_running_arns.call
  removed_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |batch|
    ecs.wait_until(:tasks_stopped, cluster: cluster, tasks: batch) do |waiter|
      waiter.before_wait { @logger.debug "#{log_prefix} wait stopping tasks stopped" }
    end
  end

  cluster_resource_manager.release(required_capacity * by)
  self.desired_count -= by
end
212
+
213
# Returns the index of the first entry in +max_task_count+ that is at least
# +count+, or the last index when +count+ exceeds every entry.
def max_task_level(count)
  limits = max_task_count
  level = limits.find_index { |limit| count <= limit }
  level.nil? ? limits.size - 1 : level
end
216
+
217
# Prefix for log lines: the class name without its
# "EcsDeploy::AutoScaler::" namespace, followed by the service name and region.
def log_prefix
  short_class_name = self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")
  "[#{short_class_name} #{name} #{region}]"
end
220
+ end
221
+ end
222
+ end
@@ -0,0 +1,102 @@
1
+ require "json"
2
+ require "timeout"
3
+
4
+ require "aws-sdk-ec2"
5
+ require "ecs_deploy"
6
+ require "ecs_deploy/auto_scaler/config_base"
7
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
8
+
9
+ module EcsDeploy
10
+ module AutoScaler
11
+ SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
12
+ include ConfigBase
13
+
14
# Builds a spot fleet request configuration.
#
# The "services" entry is taken out of a copy of +attributes+ (the caller's
# hash is left untouched) before the rest is handed to +super+; each service
# definition then becomes a ServiceConfig that shares this fleet's cluster
# and region.
#
# @param attributes [Hash] configuration values; "services" lists the service definitions
# @param logger [Object] logger forwarded to +super+ and to every ServiceConfig
def initialize(attributes = {}, logger)
  own_attributes = attributes.dup
  service_definitions = own_attributes.delete("services")
  super(own_attributes, logger)

  self.service_configs = service_definitions.map do |definition|
    shared = {"cluster" => cluster, "region" => region}
    ServiceConfig.new(definition.merge(shared), logger)
  end
end
22
+
23
# The spot fleet request id doubles as the name of this configuration.
def name
  id
end
26
+
27
# Adjusts the spot fleet request's target capacity to +required_capacity+
# plus the configured buffer (rounded up).
#
# Orphan instances are cleaned up first. When the fleet already has the
# wanted target capacity nothing else happens; otherwise the request is
# modified and the cluster resource manager is told about the change.
# Errors are reported to AutoScaler.error_logger and not re-raised.
#
# @param required_capacity [Numeric] capacity needed before the buffer is added
def update_desired_capacity(required_capacity)
  terminate_orphan_instances

  wanted = (required_capacity + buffer.to_f).ceil

  fleet_config = ec2_client
    .describe_spot_fleet_requests(spot_fleet_request_ids: [id])
    .spot_fleet_request_configs[0]
    .spot_fleet_request_config
  current = fleet_config.target_capacity

  return if wanted == current

  ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: wanted)

  # Wait until the capacity is updated when shrinking, to prevent the process
  # from terminating before container draining is completed.
  shrinking = wanted < current
  cluster_resource_manager.trigger_capacity_update(current, wanted, wait_until_capacity_updated: shrinking)

  @logger.info "#{log_prefix} Update desired_capacity to #{wanted}"
rescue => e
  AutoScaler.error_logger.error(e)
end
50
+
51
# Returns the ClusterResourceManager for this fleet's cluster, creating it on
# first use and reusing the same instance afterwards. Capacity is counted
# in "vCPUs".
def cluster_resource_manager
  return @cluster_resource_manager if @cluster_resource_manager

  options = {
    region: region,
    cluster: cluster,
    service_configs: service_configs,
    capacity_based_on: "vCPUs",
    logger: @logger,
  }
  @cluster_resource_manager = EcsDeploy::AutoScaler::ClusterResourceManager.new(**options)
end
60
+
61
+ private
62
+
63
# Terminates spot fleet instances that are active in the fleet but are not
# registered as container instances of the cluster ("orphans").
#
# Only orphans that are running and were launched more than 600 seconds ago
# are terminated. Errors are reported to AutoScaler.error_logger and not
# re-raised.
def terminate_orphan_instances
  registered_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
  fleet_ids = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances.map(&:instance_id)
  orphans = fleet_ids.reject { |instance_id| registered_ids.include?(instance_id) }

  return if orphans.empty?

  running_filter = { name: "instance-state-name", values: ["running"] }
  running_orphans = ec2_client
    .describe_instances(instance_ids: orphans, filters: [running_filter])
    .reservations
    .flat_map(&:instances)

  # Instances which have just launched might not be registered to the cluster yet.
  instance_ids = running_orphans.each_with_object([]) do |instance, stale|
    stale << instance.instance_id if (Time.now - instance.launch_time) > 600
  end

  return if instance_ids.empty?

  # Terminate orphans without canceling the spot instance request,
  # because we can't terminate canceled spot instances by decreasing the capacity.
  ec2_client.terminate_instances(instance_ids: instance_ids)

  @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
rescue => e
  AutoScaler.error_logger.error(e)
end
87
+
88
# Creates a fresh EC2 API client for this fleet's region, using the
# credentials held by EcsDeploy.config. Nothing is cached: every call
# returns a new client.
def ec2_client
  settings = EcsDeploy.config
  options = {
    access_key_id: settings.access_key_id,
    secret_access_key: settings.secret_access_key,
    region: region,
    logger: logger,
  }
  Aws::EC2::Client.new(options)
end
96
+
97
# Prefix for log lines: the class name without its
# "EcsDeploy::AutoScaler::" namespace, followed by the fleet name and region.
def log_prefix
  short_class_name = self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")
  format("[%s %s %s]", short_class_name, name, region)
end
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,36 @@
1
+ require "aws-sdk-cloudwatch"
2
+ require "ecs_deploy"
3
+ require "ecs_deploy/auto_scaler"
4
+ require "ecs_deploy/auto_scaler/config_base"
5
+
6
+ module EcsDeploy
7
+ module AutoScaler
8
+ TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
9
+ include ConfigBase
10
+
11
# Creates a fresh CloudWatch API client for this trigger's region, using the
# credentials held by EcsDeploy.config. Nothing is cached: every call
# returns a new client.
def client
  settings = EcsDeploy.config
  options = {
    access_key_id: settings.access_key_id,
    secret_access_key: settings.secret_access_key,
    region: region,
    logger: logger
  }
  Aws::CloudWatch::Client.new(options)
end
19
+
20
# True when the alarm's current state value equals the state this trigger
# is configured to react to.
def match?
  current_state = fetch_alarm.state_value
  current_state == state
end
23
+
24
# Fetches this trigger's CloudWatch alarm and logs its current state at
# debug level.
#
# Failures are reported to AutoScaler.error_logger and then re-raised.
# Previously the rescue clause returned the logger's result, so +match?+
# failed with an unrelated NoMethodError that hid the real cause.
#
# @return [Object] the first metric alarm in the response
# @raise [RuntimeError] when no alarm with +alarm_name+ exists
# @raise [StandardError] any error raised by the API call
def fetch_alarm
  res = client.describe_alarms(alarm_names: [alarm_name])

  raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?

  res.metric_alarms[0].tap do |alarm|
    AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
  end
rescue => e
  AutoScaler.error_logger.error(e)
  raise
end
34
+ end
35
+ end
36
+ end
@@ -1,4 +1,5 @@
1
1
  require 'ecs_deploy'
2
+ require 'ecs_deploy/instance_fluctuation_manager'
2
3
 
3
4
  namespace :ecs do
4
5
  task :configure do
@@ -7,6 +8,8 @@ namespace :ecs do
7
8
  c.deploy_wait_timeout = fetch(:ecs_deploy_wait_timeout) if fetch(:ecs_deploy_wait_timeout)
8
9
  c.ecs_service_role = fetch(:ecs_service_role) if fetch(:ecs_service_role)
9
10
  c.default_region = Array(fetch(:ecs_region))[0] if fetch(:ecs_region)
11
+ c.ecs_wait_until_services_stable_max_attempts = fetch(:ecs_wait_until_services_stable_max_attempts) if fetch(:ecs_wait_until_services_stable_max_attempts)
12
+ c.ecs_wait_until_services_stable_delay = fetch(:ecs_wait_until_services_stable_delay) if fetch(:ecs_wait_until_services_stable_delay)
10
13
  end
11
14
 
12
15
  if ENV["TARGET_CLUSTER"]
@@ -20,7 +23,7 @@ namespace :ecs do
20
23
  task register_task_definition: [:configure] do
21
24
  if fetch(:ecs_tasks)
22
25
  regions = Array(fetch(:ecs_region))
23
- regions = [EcsDeploy.config.default_region || ENV["AWS_DEFAULT_REGION"]] if regions.empty?
26
+ regions = [EcsDeploy.config.default_region] if regions.empty?
24
27
  ecs_registered_tasks = {}
25
28
  regions.each do |region|
26
29
  ecs_registered_tasks[region] = {}
@@ -30,9 +33,14 @@ namespace :ecs do
30
33
  task_definition_name: t[:name],
31
34
  container_definitions: t[:container_definitions],
32
35
  task_role_arn: t[:task_role_arn],
36
+ execution_role_arn: t[:execution_role_arn],
33
37
  volumes: t[:volumes],
34
38
  network_mode: t[:network_mode],
35
39
  placement_constraints: t[:placement_constraints],
40
+ requires_compatibilities: t[:requires_compatibilities],
41
+ cpu: t[:cpu],
42
+ memory: t[:memory],
43
+ tags: t[:tags],
36
44
  )
37
45
  result = task_definition.register
38
46
  ecs_registered_tasks[region][t[:name]] = result
@@ -58,6 +66,10 @@ namespace :ecs do
58
66
  description: t[:description],
59
67
  target_id: t[:target_id],
60
68
  task_definition_name: t[:task_definition_name],
69
+ network_configuration: t[:network_configuration],
70
+ launch_type: t[:launch_type],
71
+ platform_version: t[:platform_version],
72
+ group: t[:group],
61
73
  revision: t[:revision],
62
74
  task_count: t[:task_count],
63
75
  role_arn: t[:role_arn],
@@ -89,8 +101,18 @@ namespace :ecs do
89
101
  task_definition_name: service[:task_definition_name],
90
102
  load_balancers: service[:load_balancers],
91
103
  desired_count: service[:desired_count],
104
+ launch_type: service[:launch_type],
105
+ network_configuration: service[:network_configuration],
106
+ health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
107
+ delete: service[:delete],
108
+ enable_ecs_managed_tags: service[:enable_ecs_managed_tags],
109
+ tags: service[:tags],
110
+ propagate_tags: service[:propagate_tags],
92
111
  }
93
112
  service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
113
+ service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
114
+ service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
115
+ service_options[:scheduling_strategy] = service[:scheduling_strategy] if service[:scheduling_strategy]
94
116
  s = EcsDeploy::Service.new(service_options)
95
117
  s.deploy
96
118
  s
@@ -149,8 +171,13 @@ namespace :ecs do
149
171
  task_definition_name: rollback_arn,
150
172
  load_balancers: service[:load_balancers],
151
173
  desired_count: service[:desired_count],
174
+ launch_type: service[:launch_type],
175
+ network_configuration: service[:network_configuration],
176
+ health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
152
177
  }
153
178
  service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
179
+ service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
180
+ service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
154
181
  s = EcsDeploy::Service.new(service_options)
155
182
  s.deploy
156
183
  EcsDeploy::TaskDefinition.deregister(current_task_definition_arn, region: r)
@@ -160,4 +187,46 @@ namespace :ecs do
160
187
  end
161
188
  end
162
189
  end
190
+
191
# For every configured region and every entry of
# :ecs_instance_fluctuation_manager_configs, builds an
# EcsDeploy::InstanceFluctuationManager and calls +increase+ on it.
# Does nothing when no configs are set. A config's own :region / :cluster
# take precedence over the region being iterated / :ecs_default_cluster.
task increase_instances_to_max_size: [:configure] do
  manager_configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
  next if manager_configs.empty?

  target_regions = Array(fetch(:ecs_region))
  target_regions = [EcsDeploy.config.default_region] if target_regions.empty?

  target_regions.each do |region|
    manager_configs.each do |config|
      manager = EcsDeploy::InstanceFluctuationManager.new(
        region: config[:region] || region,
        cluster: config[:cluster] || fetch(:ecs_default_cluster),
        auto_scaling_group_name: config[:auto_scaling_group_name],
        desired_capacity: config[:desired_capacity],
        logger: config.fetch(:logger, EcsDeploy.logger)
      )
      manager.increase
    end
  end
end
211
+
212
# For every configured region and every entry of
# :ecs_instance_fluctuation_manager_configs, builds an
# EcsDeploy::InstanceFluctuationManager and calls +decrease+ on it.
# Does nothing when no configs are set. A config's own :region / :cluster
# take precedence over the region being iterated / :ecs_default_cluster.
task terminate_redundant_instances: [:configure] do
  manager_configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
  next if manager_configs.empty?

  target_regions = Array(fetch(:ecs_region))
  target_regions = [EcsDeploy.config.default_region] if target_regions.empty?

  target_regions.each do |region|
    manager_configs.each do |config|
      manager = EcsDeploy::InstanceFluctuationManager.new(
        region: config[:region] || region,
        cluster: config[:cluster] || fetch(:ecs_default_cluster),
        auto_scaling_group_name: config[:auto_scaling_group_name],
        desired_capacity: config[:desired_capacity],
        logger: config.fetch(:logger, EcsDeploy.logger)
      )
      manager.decrease
    end
  end
end
163
232
  end