ecs_deploy 0.3.0 → 1.0.2

lib/ecs_deploy/auto_scaler/service_config.rb (new file; path inferred from the requires and the EcsDeploy::AutoScaler::ServiceConfig definition below)

@@ -0,0 +1,223 @@
+require "aws-sdk-ecs"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler/config_base"
+require "ecs_deploy/auto_scaler/trigger_config"
+
+module EcsDeploy
+  module AutoScaler
+    SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
+    ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
+      include ConfigBase
+
+      MAX_DESCRIBABLE_TASK_COUNT = 100
+
+      def initialize(attributes = {}, logger)
+        super
+        self.idle_time ||= 60
+        self.max_task_count = Array(max_task_count)
+        self.upscale_triggers = upscale_triggers.to_a.map do |t|
+          TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+        end
+        self.downscale_triggers = downscale_triggers.to_a.map do |t|
+          TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+        end
+        self.max_task_count.sort!
+        self.desired_count = fetch_service.desired_count
+        self.required_capacity ||= 1
+        @reach_max_at = nil
+        @last_updated_at = nil
+        @logger = logger
+      end
+
+      def adjust_desired_count(cluster_resource_manager)
+        if idle?
+          @logger.debug "#{name} is idling"
+          return
+        end
+
+        difference = 0
+        upscale_triggers.each do |trigger|
+          next if difference >= trigger.step
+
+          if trigger.match?
+            @logger.info "#{log_prefix} Fire upscale trigger by #{trigger.alarm_name} #{trigger.state}"
+            difference = trigger.step
+          end
+        end
+
+        if desired_count > current_min_task_count
+          downscale_triggers.each do |trigger|
+            next if difference > 0 && !trigger.prioritized_over_upscale_triggers?
+            next unless trigger.match?
+
+            @logger.info "#{log_prefix} Fire downscale trigger by #{trigger.alarm_name} #{trigger.state}"
+            difference = [difference, -trigger.step].min
+          end
+        end
+
+        if current_min_task_count > desired_count + difference
+          difference = current_min_task_count - desired_count
+        end
+
+        if difference >= 0 && desired_count > max_task_count.max
+          difference = max_task_count.max - desired_count
+        end
+
+        if difference != 0
+          update_service(difference, cluster_resource_manager)
+        end
+      end
+
+      def wait_until_desired_count_updated
+        @increase_desired_count_thread&.join
+      rescue => e
+        AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})")
+      ensure
+        @increase_desired_count_thread = nil
+      end
+
+      private
+
+      def client
+        Aws::ECS::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger
+        )
+      end
+
+      def idle?
+        return false unless @last_updated_at
+
+        diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
+        diff < idle_time
+      end
+
+      def current_min_task_count
+        return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
+
+        scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
+          from = Time.parse(s["from"])
+          to = Time.parse(s["to"])
+          (from..to).cover?(Time.now)
+        }["count"]
+      end
+
+      def overheat?
+        return false unless @reach_max_at
+        (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
+      end
+
+      def fetch_service
+        res = client.describe_services(cluster: cluster, services: [name])
+        raise "Service \"#{name}\" is not found" if res.services.empty?
+        res.services[0]
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def update_service(difference, cluster_resource_manager)
+        next_desired_count = desired_count + difference
+        current_level = max_task_level(desired_count)
+        next_level = max_task_level(next_desired_count)
+        if current_level < next_level && overheat? # next max
+          level = next_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service is overheat, uses next max count"
+        elsif current_level < next_level && !overheat? # wait cooldown
+          level = current_level
+          now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+          @reach_max_at ||= now
+          @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+        elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
+          level = current_level
+          now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+          @reach_max_at ||= now
+          @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+        elsif current_level == next_level && next_desired_count < max_task_count[current_level]
+          level = current_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service clears cooldown state"
+        elsif current_level > next_level
+          level = next_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service clears cooldown state"
+        end
+
+        next_desired_count = [next_desired_count, max_task_count[level]].min
+        if next_desired_count > desired_count
+          increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
+        else
+          decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
+        end
+
+        @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+        @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def increase_desired_count(by, cluster_resource_manager)
+        applied_desired_count = desired_count
+        self.desired_count += by
+
+        wait_until = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
+        @increase_desired_count_thread = Thread.new do
+          cl = client
+          by.times do
+            timeout = wait_until - Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            break if timeout <= 0
+            break unless cluster_resource_manager.acquire(required_capacity, timeout: timeout)
+            begin
+              cl.update_service(cluster: cluster, service: name, desired_count: applied_desired_count + 1)
+              applied_desired_count += 1
+            rescue => e
+              cluster_resource_manager.release(required_capacity)
+              AutoScaler.error_logger.error(e)
+              break
+            end
+          end
+
+          if applied_desired_count != desired_count
+            self.desired_count = applied_desired_count
+            @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
+          end
+        end
+      end
+
+      def decrease_desired_count(by, cluster_resource_manager)
+        cl = client
+        running_task_arns = cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+
+        cl.update_service(cluster: cluster, service: name, desired_count: desired_count - by)
+
+        cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
+          w.before_wait do
+            @logger.debug "#{log_prefix} wait service stable"
+          end
+        end
+
+        stopping_task_arns = running_task_arns - cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+        stopping_task_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |arns|
+          cl.wait_until(:tasks_stopped, cluster: cluster, tasks: arns) do |w|
+            w.before_wait do
+              @logger.debug "#{log_prefix} wait stopping tasks stopped"
+            end
+          end
+        end
+
+        cluster_resource_manager.release(required_capacity * by)
+        self.desired_count -= by
+      end
+
+      def max_task_level(count)
+        max_task_count.index { |i| count <= i } || max_task_count.size - 1
+      end
+
+      def log_prefix
+        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+      end
+    end
+  end
+end
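For reference, a minimal sketch of the attributes hash a ServiceConfig could be built from. The string keys ("from"/"to"/"count" inside scheduled_min_task_count, plus the trigger keys that get merged into TriggerConfig) are taken from the code above; the concrete values, and the idea that such a hash comes from an auto-scaler config file, are assumptions for illustration only.

# Illustrative only: values are placeholders, not part of this diff.
service_attributes = {
  "name" => "web",
  "cluster" => "my-cluster",
  "region" => "ap-northeast-1",
  "step" => 1,                          # default step merged into every trigger below
  "min_task_count" => 2,
  "max_task_count" => [10, 20],         # levels; moving past a level waits for the cooldown below
  "idle_time" => 60,                    # seconds between adjustments
  "cooldown_time_for_reach_max" => 300, # seconds before the next max_task_count level may be used
  "required_capacity" => 1,             # capacity units acquired per task from the cluster resource manager (defaults to 1)
  "scheduled_min_task_count" => [
    {"from" => "2024-01-01 00:00:00", "to" => "2024-01-01 06:00:00", "count" => 4},
  ],
  "upscale_triggers" => [
    {"alarm_name" => "web-cpu-high", "state" => "ALARM", "step" => 2},
  ],
  "downscale_triggers" => [
    {"alarm_name" => "web-cpu-low", "state" => "ALARM"},
  ],
}
# ServiceConfig.new(service_attributes, logger) then reads the service's current desired_count
# from ECS and evaluates the CloudWatch triggers on every adjust_desired_count call.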
lib/ecs_deploy/auto_scaler/spot_fleet_request_config.rb (new file; path inferred from the EcsDeploy::AutoScaler::SpotFleetRequestConfig definition below)

@@ -0,0 +1,102 @@
+require "json"
+require "timeout"
+
+require "aws-sdk-ec2"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler/config_base"
+require "ecs_deploy/auto_scaler/cluster_resource_manager"
+
+module EcsDeploy
+  module AutoScaler
+    SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
+      include ConfigBase
+
+      def initialize(attributes = {}, logger)
+        attributes = attributes.dup
+        services = attributes.delete("services")
+        super(attributes, logger)
+        self.service_configs = services.map do |s|
+          ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
+        end
+      end
+
+      def name
+        id
+      end
+
+      def update_desired_capacity(required_capacity)
+        terminate_orphan_instances
+
+        desired_capacity = (required_capacity + buffer.to_f).ceil
+
+        request_config = ec2_client.describe_spot_fleet_requests(
+          spot_fleet_request_ids: [id]
+        ).spot_fleet_request_configs[0].spot_fleet_request_config
+
+        return if desired_capacity == request_config.target_capacity
+
+        ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity)
+
+        cluster_resource_manager.trigger_capacity_update(
+          request_config.target_capacity,
+          desired_capacity,
+          # Wait until the capacity is updated to prevent the process from terminating before container draining is completed
+          wait_until_capacity_updated: desired_capacity < request_config.target_capacity,
+        )
+        @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def cluster_resource_manager
+        @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
+          region: region,
+          cluster: cluster,
+          service_configs: service_configs,
+          capacity_based_on: "vCPUs",
+          logger: @logger,
+        )
+      end
+
+      private
+
+      def terminate_orphan_instances
+        container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
+        spot_fleet_instances = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances
+        orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
+
+        return if orphans.empty?
+
+        running_instances = ec2_client.describe_instances(
+          instance_ids: orphans,
+          filters: [{ name: "instance-state-name", values: ["running"] }],
+        ).reservations.flat_map(&:instances)
+        # Instances which have just launched might not be registered to the cluster yet.
+        instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id)
+
+        return if instance_ids.empty?
+
+        # Terminate orphans without canceling the spot instance request
+        # because we can't terminate canceled spot instances by decreasing the capacity
+        ec2_client.terminate_instances(instance_ids: instance_ids)
+
+        @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def ec2_client
+        Aws::EC2::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger,
+        )
+      end
+
+      def log_prefix
+        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+      end
+    end
+  end
+end
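A matching sketch for SpotFleetRequestConfig. The struct members (id, region, cluster, buffer) and the nested "services" array are taken from the initializer above; all values are placeholders.

# Illustrative only: the request ID and the service entry are placeholders.
sfr_attributes = {
  "id" => "sfr-00000000-0000-0000-0000-000000000000",
  "region" => "ap-northeast-1",
  "cluster" => "my-cluster",
  "buffer" => 1,  # extra capacity kept on top of what the services require
  "services" => [
    {"name" => "web", "step" => 1, "min_task_count" => 1, "max_task_count" => [10]},
  ],
}
# SpotFleetRequestConfig.new(sfr_attributes, logger).update_desired_capacity(required_capacity)
# first terminates orphaned fleet instances, then sets the fleet's target_capacity to
# (required_capacity + buffer).ceil whenever that differs from the current value.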
lib/ecs_deploy/auto_scaler/trigger_config.rb (new file; path inferred from the EcsDeploy::AutoScaler::TriggerConfig definition below)

@@ -0,0 +1,42 @@
+require "aws-sdk-cloudwatch"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler"
+require "ecs_deploy/auto_scaler/config_base"
+
+module EcsDeploy
+  module AutoScaler
+    TriggerConfig = Struct.new(:alarm_name, :region, :state, :step, :prioritized_over_upscale_triggers) do
+      include ConfigBase
+
+      def match?
+        fetch_alarm.state_value == state
+      end
+
+      def prioritized_over_upscale_triggers?
+        !!prioritized_over_upscale_triggers
+      end
+
+      private
+
+      def client
+        Aws::CloudWatch::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger
+        )
+      end
+
+      def fetch_alarm
+        res = client.describe_alarms(alarm_names: [alarm_name])
+
+        raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
+        res.metric_alarms[0].tap do |alarm|
+          AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
+        end
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+    end
+  end
+end
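TriggerConfig simply compares a CloudWatch alarm's current state_value with the configured state. A short usage sketch, assuming (as ServiceConfig#initialize above suggests) that ConfigBase, which is not part of this diff, maps string keys onto the struct members:

require "logger"

# Illustrative only: the alarm name is a placeholder.
trigger = EcsDeploy::AutoScaler::TriggerConfig.new(
  {"alarm_name" => "web-cpu-high", "region" => "ap-northeast-1", "state" => "ALARM", "step" => 2},
  Logger.new($stdout)
)
trigger.match?  # => true while the "web-cpu-high" alarm is in the ALARM state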
lib/ecs_deploy/capistrano.rb (modified; path inferred from the Capistrano ecs namespace below)

@@ -1,4 +1,5 @@
 require 'ecs_deploy'
+require 'ecs_deploy/instance_fluctuation_manager'
 
 namespace :ecs do
   task :configure do
@@ -7,6 +8,8 @@ namespace :ecs do
       c.deploy_wait_timeout = fetch(:ecs_deploy_wait_timeout) if fetch(:ecs_deploy_wait_timeout)
       c.ecs_service_role = fetch(:ecs_service_role) if fetch(:ecs_service_role)
       c.default_region = Array(fetch(:ecs_region))[0] if fetch(:ecs_region)
+      c.ecs_wait_until_services_stable_max_attempts = fetch(:ecs_wait_until_services_stable_max_attempts) if fetch(:ecs_wait_until_services_stable_max_attempts)
+      c.ecs_wait_until_services_stable_delay = fetch(:ecs_wait_until_services_stable_delay) if fetch(:ecs_wait_until_services_stable_delay)
     end
 
     if ENV["TARGET_CLUSTER"]
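The two new settings are only read in ecs:configure and copied onto EcsDeploy.config, so a deploy file sets them like any other Capistrano variable. A sketch with arbitrary numbers (presumably they end up as the max_attempts and delay of the ECS services_stable waiter, though that wiring is outside this hunk):

# config/deploy.rb (illustrative values only)
set :ecs_wait_until_services_stable_max_attempts, 40
set :ecs_wait_until_services_stable_delay, 15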
@@ -20,9 +23,9 @@ namespace :ecs do
   task register_task_definition: [:configure] do
     if fetch(:ecs_tasks)
       regions = Array(fetch(:ecs_region))
-      regions = [EcsDeploy.config.default_region || ENV["AWS_DEFAULT_REGION"]] if regions.empty?
+      regions = [EcsDeploy.config.default_region] if regions.empty?
       ecs_registered_tasks = {}
-      regions.each do |r|
+      regions.each do |region|
         ecs_registered_tasks[region] = {}
         fetch(:ecs_tasks).each do |t|
           task_definition = EcsDeploy::TaskDefinition.new(
@@ -30,21 +33,17 @@ namespace :ecs do
             task_definition_name: t[:name],
             container_definitions: t[:container_definitions],
             task_role_arn: t[:task_role_arn],
+            execution_role_arn: t[:execution_role_arn],
             volumes: t[:volumes],
             network_mode: t[:network_mode],
             placement_constraints: t[:placement_constraints],
+            requires_compatibilities: t[:requires_compatibilities],
+            cpu: t[:cpu],
+            memory: t[:memory],
+            tags: t[:tags],
           )
           result = task_definition.register
           ecs_registered_tasks[region][t[:name]] = result
-
-          executions = t[:executions].to_a
-          unless executions.empty?
-            warn "`executions` config is deprecated. I will remove this in near future"
-          end
-          executions.each do |exec|
-            exec[:cluster] ||= fetch(:ecs_default_cluster)
-            task_definition.run(exec)
-          end
         end
       end
 
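With the extra task-definition attributes, an :ecs_tasks entry can describe a Fargate task. A hedged sketch of one entry; only the key names come from this diff, while the values and the container definition are placeholders:

# config/deploy.rb (illustrative only)
set :ecs_tasks, [
  {
    name: "myapp-web",
    container_definitions: [
      {name: "web", image: "myapp:latest", essential: true},
    ],
    task_role_arn: "arn:aws:iam::123456789012:role/myapp-task",
    execution_role_arn: "arn:aws:iam::123456789012:role/myapp-task-execution",
    network_mode: "awsvpc",
    requires_compatibilities: ["FARGATE"],
    cpu: "256",
    memory: "512",
    tags: [{key: "Project", value: "myapp"}],
  },
]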
@@ -53,23 +52,28 @@ namespace :ecs do
   end
 
   task deploy_scheduled_task: [:configure, :register_task_definition] do
-    if fetch(:ecs_tasks)
+    if fetch(:ecs_scheduled_tasks)
       regions = Array(fetch(:ecs_region))
       regions = [nil] if regions.empty?
       regions.each do |r|
         fetch(:ecs_scheduled_tasks).each do |t|
           scheduled_task = EcsDeploy::ScheduledTask.new(
             region: r,
-            cluster: t[:cluster],
+            cluster: t[:cluster] || fetch(:ecs_default_cluster),
             rule_name: t[:rule_name],
             schedule_expression: t[:schedule_expression],
             enabled: t[:enabled] != false,
             description: t[:description],
             target_id: t[:target_id],
             task_definition_name: t[:task_definition_name],
+            network_configuration: t[:network_configuration],
+            launch_type: t[:launch_type],
+            platform_version: t[:platform_version],
+            group: t[:group],
             revision: t[:revision],
             task_count: t[:task_count],
             role_arn: t[:role_arn],
+            container_overrides: t[:container_overrides],
           )
           scheduled_task.deploy
         end
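An :ecs_scheduled_tasks entry can now omit cluster (falling back to :ecs_default_cluster) and pass the Fargate-related options through. Only the key names are taken from the diff; the nested network_configuration shape is assumed to follow the usual ECS awsvpc_configuration format, and all values are placeholders:

# config/deploy.rb (illustrative only)
set :ecs_scheduled_tasks, [
  {
    rule_name: "myapp-hourly-batch",
    schedule_expression: "rate(1 hour)",
    target_id: "myapp-hourly-batch",
    task_definition_name: "myapp-batch",
    launch_type: "FARGATE",
    platform_version: "LATEST",
    group: "batch",
    network_configuration: {
      awsvpc_configuration: {
        subnets: ["subnet-00000000"],
        security_groups: ["sg-00000000"],
        assign_public_ip: "DISABLED",
      },
    },
    container_overrides: [
      {name: "batch", command: ["bundle", "exec", "rake", "hourly_job"]},
    ],
    role_arn: "arn:aws:iam::123456789012:role/ecsEventsRole",
    task_count: 1,
  },
]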
@@ -97,9 +101,20 @@ namespace :ecs do
             task_definition_name: service[:task_definition_name],
             load_balancers: service[:load_balancers],
             desired_count: service[:desired_count],
+            launch_type: service[:launch_type],
+            network_configuration: service[:network_configuration],
+            health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
+            delete: service[:delete],
+            enable_ecs_managed_tags: service[:enable_ecs_managed_tags],
+            tags: service[:tags],
+            propagate_tags: service[:propagate_tags],
+            enable_execute_command: service[:enable_execute_command],
           }
           service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
-          s = EcsDeploy::Service.new(service_options)
+          service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
+          service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
+          service_options[:scheduling_strategy] = service[:scheduling_strategy] if service[:scheduling_strategy]
+          s = EcsDeploy::Service.new(**service_options)
           s.deploy
           s
         end
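The deploy task likewise forwards several new per-service options into EcsDeploy::Service (and passes placement_constraints, placement_strategy and scheduling_strategy through when present). A hedged sketch of an :ecs_services entry; the newly added keys come from this hunk, while name/cluster and all values are assumptions:

# config/deploy.rb (illustrative only)
set :ecs_services, [
  {
    name: "myapp-web",
    cluster: "my-cluster",
    task_definition_name: "myapp-web",
    desired_count: 2,
    launch_type: "FARGATE",
    network_configuration: {
      awsvpc_configuration: {
        subnets: ["subnet-00000000"],
        security_groups: ["sg-00000000"],
        assign_public_ip: "DISABLED",
      },
    },
    health_check_grace_period_seconds: 60,
    enable_ecs_managed_tags: true,
    propagate_tags: "SERVICE",
    tags: [{key: "Project", value: "myapp"}],
    enable_execute_command: true,
  },
]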
@@ -157,9 +172,14 @@ namespace :ecs do
             task_definition_name: rollback_arn,
             load_balancers: service[:load_balancers],
             desired_count: service[:desired_count],
+            launch_type: service[:launch_type],
+            network_configuration: service[:network_configuration],
+            health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
           }
           service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
-          s = EcsDeploy::Service.new(service_options)
+          service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
+          service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
+          s = EcsDeploy::Service.new(**service_options)
           s.deploy
           EcsDeploy::TaskDefinition.deregister(current_task_definition_arn, region: r)
           s
@@ -168,4 +188,46 @@ namespace :ecs do
       end
     end
   end
+
+  task increase_instances_to_max_size: [:configure] do
+    configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
+    unless configs.empty?
+      regions = Array(fetch(:ecs_region))
+      regions = [EcsDeploy.config.default_region] if regions.empty?
+      regions.each do |region|
+        configs.each do |config|
+          logger = config.fetch(:logger, EcsDeploy.logger)
+          m = EcsDeploy::InstanceFluctuationManager.new(
+            region: config[:region] || region,
+            cluster: config[:cluster] || fetch(:ecs_default_cluster),
+            auto_scaling_group_name: config[:auto_scaling_group_name],
+            desired_capacity: config[:desired_capacity],
+            logger: logger
+          )
+          m.increase
+        end
+      end
+    end
+  end
+
+  task terminate_redundant_instances: [:configure] do
+    configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
+    unless configs.empty?
+      regions = Array(fetch(:ecs_region))
+      regions = [EcsDeploy.config.default_region] if regions.empty?
+      regions.each do |region|
+        configs.each do |config|
+          logger = config.fetch(:logger, EcsDeploy.logger)
+          m = EcsDeploy::InstanceFluctuationManager.new(
+            region: config[:region] || region,
+            cluster: config[:cluster] || fetch(:ecs_default_cluster),
+            auto_scaling_group_name: config[:auto_scaling_group_name],
+            desired_capacity: config[:desired_capacity],
+            logger: logger
+          )
+          m.decrease
+        end
+      end
+    end
+  end
 end
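The two new tasks read :ecs_instance_fluctuation_manager_configs and, judging by the task names and the increase/decrease calls, temporarily scale an Auto Scaling group out for a deploy and trim it back afterwards (InstanceFluctuationManager itself is not part of this excerpt). A sketch of one possible wiring; the config keys come from the tasks above, while the hook placement is only an assumption:

# config/deploy.rb (illustrative wiring only)
set :ecs_instance_fluctuation_manager_configs, [
  {
    region: "ap-northeast-1",
    cluster: "my-cluster",
    auto_scaling_group_name: "my-cluster-asg",
    desired_capacity: 10,  # exact meaning depends on InstanceFluctuationManager, which is not shown here
  },
]

# Assumed hook placement: scale out before deploying, shrink back once the deploy is done.
before "ecs:deploy", "ecs:increase_instances_to_max_size"
after  "ecs:deploy", "ecs:terminate_redundant_instances"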