ecs_deploy 0.3.0 → 1.0.2

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -0,0 +1,223 @@
+ require "aws-sdk-ecs"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler/config_base"
+ require "ecs_deploy/auto_scaler/trigger_config"
+
+ module EcsDeploy
+   module AutoScaler
+     SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
+     ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
+       include ConfigBase
+
+       MAX_DESCRIBABLE_TASK_COUNT = 100
+
+       def initialize(attributes = {}, logger)
+         super
+         self.idle_time ||= 60
+         self.max_task_count = Array(max_task_count)
+         self.upscale_triggers = upscale_triggers.to_a.map do |t|
+           TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+         end
+         self.downscale_triggers = downscale_triggers.to_a.map do |t|
+           TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+         end
+         self.max_task_count.sort!
+         self.desired_count = fetch_service.desired_count
+         self.required_capacity ||= 1
+         @reach_max_at = nil
+         @last_updated_at = nil
+         @logger = logger
+       end
+
+       def adjust_desired_count(cluster_resource_manager)
+         if idle?
+           @logger.debug "#{name} is idling"
+           return
+         end
+
+         difference = 0
+         upscale_triggers.each do |trigger|
+           next if difference >= trigger.step
+
+           if trigger.match?
+             @logger.info "#{log_prefix} Fire upscale trigger by #{trigger.alarm_name} #{trigger.state}"
+             difference = trigger.step
+           end
+         end
+
+         if desired_count > current_min_task_count
+           downscale_triggers.each do |trigger|
+             next if difference > 0 && !trigger.prioritized_over_upscale_triggers?
+             next unless trigger.match?
+
+             @logger.info "#{log_prefix} Fire downscale trigger by #{trigger.alarm_name} #{trigger.state}"
+             difference = [difference, -trigger.step].min
+           end
+         end
+
+         if current_min_task_count > desired_count + difference
+           difference = current_min_task_count - desired_count
+         end
+
+         if difference >= 0 && desired_count > max_task_count.max
+           difference = max_task_count.max - desired_count
+         end
+
+         if difference != 0
+           update_service(difference, cluster_resource_manager)
+         end
+       end
+
+       def wait_until_desired_count_updated
+         @increase_desired_count_thread&.join
+       rescue => e
+         AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})")
+       ensure
+         @increase_desired_count_thread = nil
+       end
+
+       private
+
+       def client
+         Aws::ECS::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger
+         )
+       end
+
+       def idle?
+         return false unless @last_updated_at
+
+         diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
+         diff < idle_time
+       end
+
+       def current_min_task_count
+         return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
+
+         scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
+           from = Time.parse(s["from"])
+           to = Time.parse(s["to"])
+           (from..to).cover?(Time.now)
+         }["count"]
+       end
+
+       def overheat?
+         return false unless @reach_max_at
+         (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
+       end
+
+       def fetch_service
+         res = client.describe_services(cluster: cluster, services: [name])
+         raise "Service \"#{name}\" is not found" if res.services.empty?
+         res.services[0]
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def update_service(difference, cluster_resource_manager)
+         next_desired_count = desired_count + difference
+         current_level = max_task_level(desired_count)
+         next_level = max_task_level(next_desired_count)
+         if current_level < next_level && overheat? # next max
+           level = next_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service is overheat, uses next max count"
+         elsif current_level < next_level && !overheat? # wait cooldown
+           level = current_level
+           now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+           @reach_max_at ||= now
+           @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+         elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
+           level = current_level
+           now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+           @reach_max_at ||= now
+           @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+         elsif current_level == next_level && next_desired_count < max_task_count[current_level]
+           level = current_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service clears cooldown state"
+         elsif current_level > next_level
+           level = next_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service clears cooldown state"
+         end
+
+         next_desired_count = [next_desired_count, max_task_count[level]].min
+         if next_desired_count > desired_count
+           increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
+         else
+           decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
+         end
+
+         @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+         @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def increase_desired_count(by, cluster_resource_manager)
+         applied_desired_count = desired_count
+         self.desired_count += by
+
+         wait_until = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
+         @increase_desired_count_thread = Thread.new do
+           cl = client
+           by.times do
+             timeout = wait_until - Process.clock_gettime(Process::CLOCK_MONOTONIC)
+             break if timeout <= 0
+             break unless cluster_resource_manager.acquire(required_capacity, timeout: timeout)
+             begin
+               cl.update_service(cluster: cluster, service: name, desired_count: applied_desired_count + 1)
+               applied_desired_count += 1
+             rescue => e
+               cluster_resource_manager.release(required_capacity)
+               AutoScaler.error_logger.error(e)
+               break
+             end
+           end
+
+           if applied_desired_count != desired_count
+             self.desired_count = applied_desired_count
+             @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
+           end
+         end
+       end
+
+       def decrease_desired_count(by, cluster_resource_manager)
+         cl = client
+         running_task_arns = cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+
+         cl.update_service(cluster: cluster, service: name, desired_count: desired_count - by)
+
+         cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
+           w.before_wait do
+             @logger.debug "#{log_prefix} wait service stable"
+           end
+         end
+
+         stopping_task_arns = running_task_arns - cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+         stopping_task_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |arns|
+           cl.wait_until(:tasks_stopped, cluster: cluster, tasks: arns) do |w|
+             w.before_wait do
+               @logger.debug "#{log_prefix} wait stopping tasks stopped"
+             end
+           end
+         end
+
+         cluster_resource_manager.release(required_capacity * by)
+         self.desired_count -= by
+       end
+
+       def max_task_level(count)
+         max_task_count.index { |i| count <= i } || max_task_count.size - 1
+       end
+
+       def log_prefix
+         "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+       end
+     end
+   end
+ end
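
For orientation, here is a rough sketch of the attribute hash a ServiceConfig consumes. The attribute names come from SERVICE_CONFIG_ATTRIBUTES, and the string keys and the "from"/"to"/"count" entries mirror the merges in initialize and the lookups in current_min_task_count; the concrete values and the surrounding auto-scaler config format are illustrative assumptions, not part of this diff.

    # Hypothetical service entry; keys follow SERVICE_CONFIG_ATTRIBUTES above.
    service_attributes = {
      "name" => "web",
      "cluster" => "production",
      "region" => "ap-northeast-1",
      "step" => 1,                    # default step merged into each trigger
      "idle_time" => 60,              # seconds to wait between adjustments
      "min_task_count" => 2,
      "max_task_count" => [10, 20],   # sorted levels used by max_task_level
      "cooldown_time_for_reach_max" => 300,
      "required_capacity" => 1,       # capacity units acquired per added task
      "scheduled_min_task_count" => [
        { "from" => "2024-12-01 09:00:00", "to" => "2024-12-01 18:00:00", "count" => 4 },
      ],
      "upscale_triggers" => [
        { "alarm_name" => "web-cpu-high", "state" => "ALARM", "step" => 2 },
      ],
      "downscale_triggers" => [
        { "alarm_name" => "web-cpu-low", "state" => "ALARM" },
      ],
    }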
@@ -0,0 +1,102 @@
+ require "json"
+ require "timeout"
+
+ require "aws-sdk-ec2"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler/config_base"
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
+
+ module EcsDeploy
+   module AutoScaler
+     SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
+       include ConfigBase
+
+       def initialize(attributes = {}, logger)
+         attributes = attributes.dup
+         services = attributes.delete("services")
+         super(attributes, logger)
+         self.service_configs = services.map do |s|
+           ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
+         end
+       end
+
+       def name
+         id
+       end
+
+       def update_desired_capacity(required_capacity)
+         terminate_orphan_instances
+
+         desired_capacity = (required_capacity + buffer.to_f).ceil
+
+         request_config = ec2_client.describe_spot_fleet_requests(
+           spot_fleet_request_ids: [id]
+         ).spot_fleet_request_configs[0].spot_fleet_request_config
+
+         return if desired_capacity == request_config.target_capacity
+
+         ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity)
+
+         cluster_resource_manager.trigger_capacity_update(
+           request_config.target_capacity,
+           desired_capacity,
+           # Wait until the capacity is updated to prevent the process from terminating before container draining is completed
+           wait_until_capacity_updated: desired_capacity < request_config.target_capacity,
+         )
+         @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def cluster_resource_manager
+         @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
+           region: region,
+           cluster: cluster,
+           service_configs: service_configs,
+           capacity_based_on: "vCPUs",
+           logger: @logger,
+         )
+       end
+
+       private
+
+       def terminate_orphan_instances
+         container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
+         spot_fleet_instances = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances
+         orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
+
+         return if orphans.empty?
+
+         running_instances = ec2_client.describe_instances(
+           instance_ids: orphans,
+           filters: [{ name: "instance-state-name", values: ["running"] }],
+         ).reservations.flat_map(&:instances)
+         # Instances that have just launched might not be registered to the cluster yet.
+         instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id)
+
+         return if instance_ids.empty?
+
+         # Terminate orphans without canceling the spot instance request
+         # because we can't terminate canceled spot instances by decreasing the capacity
+         ec2_client.terminate_instances(instance_ids: instance_ids)
+
+         @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def ec2_client
+         Aws::EC2::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger,
+         )
+       end
+
+       def log_prefix
+         "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+       end
+     end
+   end
+ end
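
Correspondingly, a SpotFleetRequestConfig appears to take the spot fleet request id plus region, cluster, buffer, and a "services" array that initialize turns into ServiceConfig objects. A minimal sketch, with illustrative values:

    # Hypothetical spot fleet entry; "services" is split off in initialize above.
    spot_fleet_attributes = {
      "id" => "sfr-12345678-1234-1234-1234-123456789012",
      "region" => "ap-northeast-1",
      "cluster" => "production",
      "buffer" => 1,   # extra capacity kept on top of the required capacity
      "services" => [
        { "name" => "web", "step" => 1, "min_task_count" => 2, "max_task_count" => [10] },
      ],
    }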
@@ -0,0 +1,42 @@
+ require "aws-sdk-cloudwatch"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler"
+ require "ecs_deploy/auto_scaler/config_base"
+
+ module EcsDeploy
+   module AutoScaler
+     TriggerConfig = Struct.new(:alarm_name, :region, :state, :step, :prioritized_over_upscale_triggers) do
+       include ConfigBase
+
+       def match?
+         fetch_alarm.state_value == state
+       end
+
+       def prioritized_over_upscale_triggers?
+         !!prioritized_over_upscale_triggers
+       end
+
+       private
+
+       def client
+         Aws::CloudWatch::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger
+         )
+       end
+
+       def fetch_alarm
+         res = client.describe_alarms(alarm_names: [alarm_name])
+
+         raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
+         res.metric_alarms[0].tap do |alarm|
+           AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
+         end
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+     end
+   end
+ end
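
A trigger entry, as ServiceConfig wraps it, would look roughly like the following; state is compared against the CloudWatch alarm's state_value ("OK", "ALARM", or "INSUFFICIENT_DATA"), and region/step are merged in by ServiceConfig when omitted. Values are illustrative assumptions.

    # Hypothetical trigger entries consumed by TriggerConfig via ServiceConfig.
    upscale_trigger = {
      "alarm_name" => "web-cpu-high",
      "state" => "ALARM",
      "step" => 2,
    }
    downscale_trigger = {
      "alarm_name" => "web-cpu-low",
      "state" => "ALARM",
      "prioritized_over_upscale_triggers" => true,  # fire even when an upscale trigger matched
    }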
@@ -1,4 +1,5 @@
  require 'ecs_deploy'
+ require 'ecs_deploy/instance_fluctuation_manager'
 
  namespace :ecs do
    task :configure do
@@ -7,6 +8,8 @@ namespace :ecs do
        c.deploy_wait_timeout = fetch(:ecs_deploy_wait_timeout) if fetch(:ecs_deploy_wait_timeout)
        c.ecs_service_role = fetch(:ecs_service_role) if fetch(:ecs_service_role)
        c.default_region = Array(fetch(:ecs_region))[0] if fetch(:ecs_region)
+       c.ecs_wait_until_services_stable_max_attempts = fetch(:ecs_wait_until_services_stable_max_attempts) if fetch(:ecs_wait_until_services_stable_max_attempts)
+       c.ecs_wait_until_services_stable_delay = fetch(:ecs_wait_until_services_stable_delay) if fetch(:ecs_wait_until_services_stable_delay)
      end
 
      if ENV["TARGET_CLUSTER"]
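
The two new settings are forwarded into the gem's configuration; a sketch of how a deploy.rb might set them (values are illustrative, and the exact waiter they tune is an assumption based on the setting names):

    # Hypothetical Capistrano settings; presumably bound the services_stable wait.
    set :ecs_wait_until_services_stable_max_attempts, 60
    set :ecs_wait_until_services_stable_delay, 10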
@@ -20,9 +23,9 @@
    task register_task_definition: [:configure] do
      if fetch(:ecs_tasks)
        regions = Array(fetch(:ecs_region))
-       regions = [EcsDeploy.config.default_region || ENV["AWS_DEFAULT_REGION"]] if regions.empty?
+       regions = [EcsDeploy.config.default_region] if regions.empty?
        ecs_registered_tasks = {}
-       regions.each do |r|
+       regions.each do |region|
          ecs_registered_tasks[region] = {}
          fetch(:ecs_tasks).each do |t|
            task_definition = EcsDeploy::TaskDefinition.new(
@@ -30,21 +33,17 @@ namespace :ecs do
              task_definition_name: t[:name],
              container_definitions: t[:container_definitions],
              task_role_arn: t[:task_role_arn],
+             execution_role_arn: t[:execution_role_arn],
              volumes: t[:volumes],
              network_mode: t[:network_mode],
              placement_constraints: t[:placement_constraints],
+             requires_compatibilities: t[:requires_compatibilities],
+             cpu: t[:cpu],
+             memory: t[:memory],
+             tags: t[:tags],
            )
            result = task_definition.register
            ecs_registered_tasks[region][t[:name]] = result
-
-           executions = t[:executions].to_a
-           unless executions.empty?
-             warn "`executions` config is deprecated. I will remove this in near future"
-           end
-           executions.each do |exec|
-             exec[:cluster] ||= fetch(:ecs_default_cluster)
-             task_definition.run(exec)
-           end
          end
        end
 
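The register_task_definition task now forwards Fargate-oriented attributes; an :ecs_tasks entry exercising them might look like the sketch below (ARNs, image, and sizes are illustrative assumptions):

    # Hypothetical :ecs_tasks entry using the newly forwarded keys.
    set :ecs_tasks, [
      {
        name: "web",
        container_definitions: [
          { name: "web", image: "123456789012.dkr.ecr.ap-northeast-1.amazonaws.com/web:latest", essential: true },
        ],
        task_role_arn: "arn:aws:iam::123456789012:role/ecsTaskRole",
        execution_role_arn: "arn:aws:iam::123456789012:role/ecsTaskExecutionRole",  # new
        requires_compatibilities: ["FARGATE"],                                      # new
        network_mode: "awsvpc",
        cpu: "256",                                                                 # new
        memory: "512",                                                              # new
        tags: [{ key: "Project", value: "example" }],                               # new
      },
    ]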
@@ -53,23 +52,28 @@
    end
 
    task deploy_scheduled_task: [:configure, :register_task_definition] do
-     if fetch(:ecs_tasks)
+     if fetch(:ecs_scheduled_tasks)
        regions = Array(fetch(:ecs_region))
        regions = [nil] if regions.empty?
        regions.each do |r|
          fetch(:ecs_scheduled_tasks).each do |t|
            scheduled_task = EcsDeploy::ScheduledTask.new(
              region: r,
-             cluster: t[:cluster],
+             cluster: t[:cluster] || fetch(:ecs_default_cluster),
              rule_name: t[:rule_name],
              schedule_expression: t[:schedule_expression],
              enabled: t[:enabled] != false,
              description: t[:description],
              target_id: t[:target_id],
              task_definition_name: t[:task_definition_name],
+             network_configuration: t[:network_configuration],
+             launch_type: t[:launch_type],
+             platform_version: t[:platform_version],
+             group: t[:group],
              revision: t[:revision],
              task_count: t[:task_count],
              role_arn: t[:role_arn],
+             container_overrides: t[:container_overrides],
            )
            scheduled_task.deploy
          end
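
deploy_scheduled_task is now driven by :ecs_scheduled_tasks (rather than being gated on :ecs_tasks) and passes through networking and override options. A hedged sketch of an entry, assuming the aws-sdk-style awsvpc_configuration shape for network_configuration and with illustrative IDs and commands:

    # Hypothetical :ecs_scheduled_tasks entry.
    set :ecs_scheduled_tasks, [
      {
        rule_name: "nightly-batch",
        schedule_expression: "cron(0 15 * * ? *)",
        task_definition_name: "batch",
        task_count: 1,
        launch_type: "FARGATE",
        platform_version: "LATEST",
        group: "nightly",
        network_configuration: {
          awsvpc_configuration: {
            subnets: ["subnet-01234567"],
            security_groups: ["sg-01234567"],
            assign_public_ip: "DISABLED",
          },
        },
        container_overrides: [
          { name: "batch", command: ["rake", "batch:run"] },
        ],
      },
    ]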
@@ -97,9 +101,20 @@
            task_definition_name: service[:task_definition_name],
            load_balancers: service[:load_balancers],
            desired_count: service[:desired_count],
+           launch_type: service[:launch_type],
+           network_configuration: service[:network_configuration],
+           health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
+           delete: service[:delete],
+           enable_ecs_managed_tags: service[:enable_ecs_managed_tags],
+           tags: service[:tags],
+           propagate_tags: service[:propagate_tags],
+           enable_execute_command: service[:enable_execute_command],
          }
          service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
-         s = EcsDeploy::Service.new(service_options)
+         service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
+         service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
+         service_options[:scheduling_strategy] = service[:scheduling_strategy] if service[:scheduling_strategy]
+         s = EcsDeploy::Service.new(**service_options)
          s.deploy
          s
        end
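
The deploy task likewise forwards more service options and now splats the hash into EcsDeploy::Service.new. A sketch of an :ecs_services entry touching the new keys (names, ARNs, and IDs are illustrative assumptions):

    # Hypothetical :ecs_services entry using the newly forwarded options.
    set :ecs_services, [
      {
        name: "web",
        task_definition_name: "web",
        desired_count: 2,
        launch_type: "FARGATE",
        health_check_grace_period_seconds: 60,
        enable_execute_command: true,
        propagate_tags: "SERVICE",
        tags: [{ key: "Project", value: "example" }],
        load_balancers: [
          {
            target_group_arn: "arn:aws:elasticloadbalancing:ap-northeast-1:123456789012:targetgroup/web/0123456789abcdef",
            container_name: "web",
            container_port: 80,
          },
        ],
        network_configuration: {
          awsvpc_configuration: {
            subnets: ["subnet-01234567"],
            security_groups: ["sg-01234567"],
            assign_public_ip: "DISABLED",
          },
        },
      },
    ]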
@@ -157,9 +172,14 @@
            task_definition_name: rollback_arn,
            load_balancers: service[:load_balancers],
            desired_count: service[:desired_count],
+           launch_type: service[:launch_type],
+           network_configuration: service[:network_configuration],
+           health_check_grace_period_seconds: service[:health_check_grace_period_seconds],
          }
          service_options[:deployment_configuration] = service[:deployment_configuration] if service[:deployment_configuration]
-         s = EcsDeploy::Service.new(service_options)
+         service_options[:placement_constraints] = service[:placement_constraints] if service[:placement_constraints]
+         service_options[:placement_strategy] = service[:placement_strategy] if service[:placement_strategy]
+         s = EcsDeploy::Service.new(**service_options)
          s.deploy
          EcsDeploy::TaskDefinition.deregister(current_task_definition_arn, region: r)
          s
@@ -168,4 +188,46 @@
        end
      end
    end
+
+   task increase_instances_to_max_size: [:configure] do
+     configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
+     unless configs.empty?
+       regions = Array(fetch(:ecs_region))
+       regions = [EcsDeploy.config.default_region] if regions.empty?
+       regions.each do |region|
+         configs.each do |config|
+           logger = config.fetch(:logger, EcsDeploy.logger)
+           m = EcsDeploy::InstanceFluctuationManager.new(
+             region: config[:region] || region,
+             cluster: config[:cluster] || fetch(:ecs_default_cluster),
+             auto_scaling_group_name: config[:auto_scaling_group_name],
+             desired_capacity: config[:desired_capacity],
+             logger: logger
+           )
+           m.increase
+         end
+       end
+     end
+   end
+
+   task terminate_redundant_instances: [:configure] do
+     configs = fetch(:ecs_instance_fluctuation_manager_configs, [])
+     unless configs.empty?
+       regions = Array(fetch(:ecs_region))
+       regions = [EcsDeploy.config.default_region] if regions.empty?
+       regions.each do |region|
+         configs.each do |config|
+           logger = config.fetch(:logger, EcsDeploy.logger)
+           m = EcsDeploy::InstanceFluctuationManager.new(
+             region: config[:region] || region,
+             cluster: config[:cluster] || fetch(:ecs_default_cluster),
+             auto_scaling_group_name: config[:auto_scaling_group_name],
+             desired_capacity: config[:desired_capacity],
+             logger: logger
+           )
+           m.decrease
+         end
+       end
+     end
+   end
  end
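
The two new tasks read :ecs_instance_fluctuation_manager_configs; each entry names an Auto Scaling group and the desired capacity to return to, and the pair appears intended to bracket a deploy (scale the group up first, shrink it back afterwards). A sketch with illustrative values:

    # Hypothetical instance fluctuation manager config read by the tasks above.
    set :ecs_instance_fluctuation_manager_configs, [
      {
        region: "ap-northeast-1",
        cluster: "production",
        auto_scaling_group_name: "ecs-production-asg",
        desired_capacity: 20,
      },
    ]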