ecs_deploy 0.3.2 → 1.0.3

@@ -0,0 +1,223 @@
+ require "aws-sdk-ecs"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler/config_base"
+ require "ecs_deploy/auto_scaler/trigger_config"
+
+ module EcsDeploy
+   module AutoScaler
+     SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
+     ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
+       include ConfigBase
+
+       MAX_DESCRIBABLE_TASK_COUNT = 100
+
+       def initialize(attributes = {}, logger)
+         super
+         self.idle_time ||= 60
+         self.max_task_count = Array(max_task_count)
+         self.upscale_triggers = upscale_triggers.to_a.map do |t|
+           TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+         end
+         self.downscale_triggers = downscale_triggers.to_a.map do |t|
+           TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+         end
+         self.max_task_count.sort!
+         self.desired_count = fetch_service.desired_count
+         self.required_capacity ||= 1
+         @reach_max_at = nil
+         @last_updated_at = nil
+         @logger = logger
+       end
+
+       def adjust_desired_count(cluster_resource_manager)
+         if idle?
+           @logger.debug "#{name} is idling"
+           return
+         end
+
+         difference = 0
+         upscale_triggers.each do |trigger|
+           next if difference >= trigger.step
+
+           if trigger.match?
+             @logger.info "#{log_prefix} Fire upscale trigger by #{trigger.alarm_name} #{trigger.state}"
+             difference = trigger.step
+           end
+         end
+
+         if desired_count > current_min_task_count
+           downscale_triggers.each do |trigger|
+             next if difference > 0 && !trigger.prioritized_over_upscale_triggers?
+             next unless trigger.match?
+
+             @logger.info "#{log_prefix} Fire downscale trigger by #{trigger.alarm_name} #{trigger.state}"
+             difference = [difference, -trigger.step].min
+           end
+         end
+
+         if current_min_task_count > desired_count + difference
+           difference = current_min_task_count - desired_count
+         end
+
+         if difference >= 0 && desired_count > max_task_count.max
+           difference = max_task_count.max - desired_count
+         end
+
+         if difference != 0
+           update_service(difference, cluster_resource_manager)
+         end
+       end
+
+       def wait_until_desired_count_updated
+         @increase_desired_count_thread&.join
+       rescue => e
+         AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})")
+       ensure
+         @increase_desired_count_thread = nil
+       end
+
+       private
+
+       def client
+         Aws::ECS::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger
+         )
+       end
+
+       def idle?
+         return false unless @last_updated_at
+
+         diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
+         diff < idle_time
+       end
+
+       def current_min_task_count
+         return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
+
+         scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
+           from = Time.parse(s["from"])
+           to = Time.parse(s["to"])
+           (from..to).cover?(Time.now)
+         }["count"]
+       end
+
+       def overheat?
+         return false unless @reach_max_at
+         (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
+       end
+
+       def fetch_service
+         res = client.describe_services(cluster: cluster, services: [name])
+         raise "Service \"#{name}\" is not found" if res.services.empty?
+         res.services[0]
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def update_service(difference, cluster_resource_manager)
+         next_desired_count = desired_count + difference
+         current_level = max_task_level(desired_count)
+         next_level = max_task_level(next_desired_count)
+         if current_level < next_level && overheat? # next max
+           level = next_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service is overheat, uses next max count"
+         elsif current_level < next_level && !overheat? # wait cooldown
+           level = current_level
+           now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+           @reach_max_at ||= now
+           @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+         elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
+           level = current_level
+           now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+           @reach_max_at ||= now
+           @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+         elsif current_level == next_level && next_desired_count < max_task_count[current_level]
+           level = current_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service clears cooldown state"
+         elsif current_level > next_level
+           level = next_level
+           @reach_max_at = nil
+           @logger.info "#{log_prefix} Service clears cooldown state"
+         end
+
+         next_desired_count = [next_desired_count, max_task_count[level]].min
+         if next_desired_count > desired_count
+           increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
+         else
+           decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
+         end
+
+         @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+         @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def increase_desired_count(by, cluster_resource_manager)
+         applied_desired_count = desired_count
+         self.desired_count += by
+
+         wait_until = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
+         @increase_desired_count_thread = Thread.new do
+           cl = client
+           by.times do
+             timeout = wait_until - Process.clock_gettime(Process::CLOCK_MONOTONIC)
+             break if timeout <= 0
+             break unless cluster_resource_manager.acquire(required_capacity, timeout: timeout)
+             begin
+               cl.update_service(cluster: cluster, service: name, desired_count: applied_desired_count + 1)
+               applied_desired_count += 1
+             rescue => e
+               cluster_resource_manager.release(required_capacity)
+               AutoScaler.error_logger.error(e)
+               break
+             end
+           end
+
+           if applied_desired_count != desired_count
+             self.desired_count = applied_desired_count
+             @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
+           end
+         end
+       end
+
+       def decrease_desired_count(by, cluster_resource_manager)
+         cl = client
+         running_task_arns = cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+
+         cl.update_service(cluster: cluster, service: name, desired_count: desired_count - by)
+
+         cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
+           w.before_wait do
+             @logger.debug "#{log_prefix} wait service stable"
+           end
+         end
+
+         stopping_task_arns = running_task_arns - cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+         stopping_task_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |arns|
+           cl.wait_until(:tasks_stopped, cluster: cluster, tasks: arns) do |w|
+             w.before_wait do
+               @logger.debug "#{log_prefix} wait stopping tasks stopped"
+             end
+           end
+         end
+
+         cluster_resource_manager.release(required_capacity * by)
+         self.desired_count -= by
+       end
+
+       def max_task_level(count)
+         max_task_count.index { |i| count <= i } || max_task_count.size - 1
+       end
+
+       def log_prefix
+         "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+       end
+     end
+   end
+ end
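
For orientation, here is a minimal sketch of the attributes hash that ServiceConfig appears to expect, inferred from the struct members and the string keys used in the code above. ConfigBase is not part of this diff, so the exact key handling, and every concrete value and name below, is an assumption rather than documented behavior.

    # Assumed to load the auto scaler classes; the exact require path for
    # ServiceConfig is not shown in this diff.
    require "ecs_deploy/auto_scaler"
    require "logger"

    # Hypothetical service entry; keys mirror SERVICE_CONFIG_ATTRIBUTES.
    service_attributes = {
      "name" => "web",                      # ECS service name (placeholder)
      "cluster" => "production",            # ECS cluster name (placeholder)
      "region" => "ap-northeast-1",
      "step" => 1,                          # default step merged into each trigger
      "min_task_count" => 2,
      "max_task_count" => [10, 20],         # levels; the next level is used only after the cooldown elapses
      "idle_time" => 60,
      "cooldown_time_for_reach_max" => 300,
      "scheduled_min_task_count" => [
        { "from" => "2024-01-01T09:00:00+09:00", "to" => "2024-01-01T18:00:00+09:00", "count" => 5 },
      ],
      "upscale_triggers" => [
        { "alarm_name" => "web-cpu-high", "state" => "ALARM", "step" => 2 },
      ],
      "downscale_triggers" => [
        { "alarm_name" => "web-cpu-low", "state" => "OK" },
      ],
      "required_capacity" => 1,             # capacity units acquired per task from the cluster resource manager
    }

    # Constructing the config calls fetch_service, so AWS credentials and the
    # service itself must already exist at this point.
    config = EcsDeploy::AutoScaler::ServiceConfig.new(service_attributes, Logger.new($stdout))

adjust_desired_count then takes the largest matching upscale step, lets a matching downscale trigger win only when no upscale fired or the trigger is prioritized_over_upscale_triggers, and clamps the result between the scheduled minimum and the active max_task_count level.
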
@@ -0,0 +1,102 @@
+ require "json"
+ require "timeout"
+
+ require "aws-sdk-ec2"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler/config_base"
+ require "ecs_deploy/auto_scaler/cluster_resource_manager"
+
+ module EcsDeploy
+   module AutoScaler
+     SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
+       include ConfigBase
+
+       def initialize(attributes = {}, logger)
+         attributes = attributes.dup
+         services = attributes.delete("services")
+         super(attributes, logger)
+         self.service_configs = services.map do |s|
+           ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
+         end
+       end
+
+       def name
+         id
+       end
+
+       def update_desired_capacity(required_capacity)
+         terminate_orphan_instances
+
+         desired_capacity = (required_capacity + buffer.to_f).ceil
+
+         request_config = ec2_client.describe_spot_fleet_requests(
+           spot_fleet_request_ids: [id]
+         ).spot_fleet_request_configs[0].spot_fleet_request_config
+
+         return if desired_capacity == request_config.target_capacity
+
+         ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity)
+
+         cluster_resource_manager.trigger_capacity_update(
+           request_config.target_capacity,
+           desired_capacity,
+           # Wait until the capacity is updated to prevent the process from terminating before container draining is completed
+           wait_until_capacity_updated: desired_capacity < request_config.target_capacity,
+         )
+         @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def cluster_resource_manager
+         @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
+           region: region,
+           cluster: cluster,
+           service_configs: service_configs,
+           capacity_based_on: "vCPUs",
+           logger: @logger,
+         )
+       end
+
+       private
+
+       def terminate_orphan_instances
+         container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
+         spot_fleet_instances = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances
+         orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
+
+         return if orphans.empty?
+
+         running_instances = ec2_client.describe_instances(
+           instance_ids: orphans,
+           filters: [{ name: "instance-state-name", values: ["running"] }],
+         ).reservations.flat_map(&:instances)
+         # Instances which have just launched might not be registered to the cluster yet.
+         instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id)
+
+         return if instance_ids.empty?
+
+         # Terminate orphans without canceling the spot instance request
+         # because we can't terminate canceled spot instances by decreasing the capacity
+         ec2_client.terminate_instances(instance_ids: instance_ids)
+
+         @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+
+       def ec2_client
+         Aws::EC2::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger,
+         )
+       end
+
+       def log_prefix
+         "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+       end
+     end
+   end
+ end
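
A similarly hedged sketch for SpotFleetRequestConfig: the "services" entries are the same service hashes shown earlier, with "cluster" and "region" merged into them automatically. The id and all values are placeholders, not taken from this diff.

    # Hypothetical spot fleet entry.
    spot_fleet_attributes = {
      "id" => "sfr-00000000-0000-0000-0000-000000000000",  # spot fleet request id (placeholder)
      "region" => "ap-northeast-1",
      "cluster" => "production",
      "buffer" => 1,                                        # spare capacity kept on top of what the services require
      "services" => [service_attributes],
    }

    # Building the config also builds a ServiceConfig per service, which calls
    # out to ECS, so this too needs live AWS access.
    config = EcsDeploy::AutoScaler::SpotFleetRequestConfig.new(spot_fleet_attributes, Logger.new($stdout))

update_desired_capacity rounds required_capacity plus the buffer up with ceil (for example, 3.5 + 1 becomes 5), skips modify_spot_fleet_request when the target capacity is already equal, and waits for the capacity update only when scaling in, so container draining can finish before the process exits.
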
@@ -0,0 +1,42 @@
+ require "aws-sdk-cloudwatch"
+ require "ecs_deploy"
+ require "ecs_deploy/auto_scaler"
+ require "ecs_deploy/auto_scaler/config_base"
+
+ module EcsDeploy
+   module AutoScaler
+     TriggerConfig = Struct.new(:alarm_name, :region, :state, :step, :prioritized_over_upscale_triggers) do
+       include ConfigBase
+
+       def match?
+         fetch_alarm.state_value == state
+       end
+
+       def prioritized_over_upscale_triggers?
+         !!prioritized_over_upscale_triggers
+       end
+
+       private
+
+       def client
+         Aws::CloudWatch::Client.new(
+           access_key_id: EcsDeploy.config.access_key_id,
+           secret_access_key: EcsDeploy.config.secret_access_key,
+           region: region,
+           logger: logger
+         )
+       end
+
+       def fetch_alarm
+         res = client.describe_alarms(alarm_names: [alarm_name])
+
+         raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
+         res.metric_alarms[0].tap do |alarm|
+           AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
+         end
+       rescue => e
+         AutoScaler.error_logger.error(e)
+       end
+     end
+   end
+ end
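
Finally, a trigger as TriggerConfig consumes it. In practice ServiceConfig merges "region" and "step" in from the owning service before construction, so building one directly, as below, is only an illustrative assumption with placeholder values.

    require "ecs_deploy/auto_scaler"
    require "logger"

    trigger = EcsDeploy::AutoScaler::TriggerConfig.new(
      {
        "alarm_name" => "web-cpu-high",                 # CloudWatch alarm to poll (placeholder)
        "region" => "ap-northeast-1",
        "state" => "ALARM",                             # match? is true when the alarm's state_value equals this
        "step" => 2,                                    # how many tasks the trigger adds or removes
        "prioritized_over_upscale_triggers" => false,   # when true, a matching downscale overrides a fired upscale
      },
      Logger.new($stdout)
    )

    trigger.match? # => true while the alarm is in the ALARM state
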