ecs_deploy 0.3.2 → 1.0.3
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +150 -0
- data/README.md +272 -23
- data/Rakefile +4 -0
- data/ecs_deploy.gemspec +9 -3
- data/lib/ecs_deploy/auto_scaler/auto_scaling_group_config.rb +209 -0
- data/lib/ecs_deploy/auto_scaler/cluster_resource_manager.rb +149 -0
- data/lib/ecs_deploy/auto_scaler/config_base.rb +16 -0
- data/lib/ecs_deploy/auto_scaler/instance_drainer.rb +134 -0
- data/lib/ecs_deploy/auto_scaler/service_config.rb +223 -0
- data/lib/ecs_deploy/auto_scaler/spot_fleet_request_config.rb +102 -0
- data/lib/ecs_deploy/auto_scaler/trigger_config.rb +42 -0
- data/lib/ecs_deploy/auto_scaler.rb +105 -339
- data/lib/ecs_deploy/capistrano.rb +73 -3
- data/lib/ecs_deploy/configuration.rb +6 -2
- data/lib/ecs_deploy/instance_fluctuation_manager.rb +198 -0
- data/lib/ecs_deploy/scheduled_task.rb +15 -3
- data/lib/ecs_deploy/service.rb +100 -21
- data/lib/ecs_deploy/task_definition.rb +30 -9
- data/lib/ecs_deploy/version.rb +1 -1
- data/lib/ecs_deploy.rb +1 -1
- metadata +113 -14
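The three hunks below are the new auto-scaler building blocks: `ServiceConfig` decides a service's desired task count from CloudWatch triggers, `SpotFleetRequestConfig` resizes the spot fleet backing the cluster, and `TriggerConfig` wraps a single CloudWatch alarm. All of them build their AWS clients from `EcsDeploy.config`, so some global setup roughly like the following is assumed to be in place (a sketch, not part of this diff; the `EcsDeploy.configure` block form comes from the gem's README, and only the two credential attributes are confirmed by the code below):

```ruby
# Sketch of the assumed global setup consumed by the auto-scaler's AWS clients.
require "ecs_deploy"

EcsDeploy.configure do |c|
  c.access_key_id     = ENV["AWS_ACCESS_KEY_ID"]      # read via EcsDeploy.config.access_key_id
  c.secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]  # read via EcsDeploy.config.secret_access_key
end
```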
data/lib/ecs_deploy/auto_scaler/service_config.rb
@@ -0,0 +1,223 @@
+require "aws-sdk-ecs"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler/config_base"
+require "ecs_deploy/auto_scaler/trigger_config"
+
+module EcsDeploy
+  module AutoScaler
+    SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count required_capacity)
+    ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
+      include ConfigBase
+
+      MAX_DESCRIBABLE_TASK_COUNT = 100
+
+      def initialize(attributes = {}, logger)
+        super
+        self.idle_time ||= 60
+        self.max_task_count = Array(max_task_count)
+        self.upscale_triggers = upscale_triggers.to_a.map do |t|
+          TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+        end
+        self.downscale_triggers = downscale_triggers.to_a.map do |t|
+          TriggerConfig.new({"region" => region, "step" => step}.merge(t), logger)
+        end
+        self.max_task_count.sort!
+        self.desired_count = fetch_service.desired_count
+        self.required_capacity ||= 1
+        @reach_max_at = nil
+        @last_updated_at = nil
+        @logger = logger
+      end
+
+      def adjust_desired_count(cluster_resource_manager)
+        if idle?
+          @logger.debug "#{name} is idling"
+          return
+        end
+
+        difference = 0
+        upscale_triggers.each do |trigger|
+          next if difference >= trigger.step
+
+          if trigger.match?
+            @logger.info "#{log_prefix} Fire upscale trigger by #{trigger.alarm_name} #{trigger.state}"
+            difference = trigger.step
+          end
+        end
+
+        if desired_count > current_min_task_count
+          downscale_triggers.each do |trigger|
+            next if difference > 0 && !trigger.prioritized_over_upscale_triggers?
+            next unless trigger.match?
+
+            @logger.info "#{log_prefix} Fire downscale trigger by #{trigger.alarm_name} #{trigger.state}"
+            difference = [difference, -trigger.step].min
+          end
+        end
+
+        if current_min_task_count > desired_count + difference
+          difference = current_min_task_count - desired_count
+        end
+
+        if difference >= 0 && desired_count > max_task_count.max
+          difference = max_task_count.max - desired_count
+        end
+
+        if difference != 0
+          update_service(difference, cluster_resource_manager)
+        end
+      end
+
+      def wait_until_desired_count_updated
+        @increase_desired_count_thread&.join
+      rescue => e
+        AutoScaler.error_logger.warn("`#{__method__}': #{e} (#{e.class})")
+      ensure
+        @increase_desired_count_thread = nil
+      end
+
+      private
+
+      def client
+        Aws::ECS::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger
+        )
+      end
+
+      def idle?
+        return false unless @last_updated_at
+
+        diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
+        diff < idle_time
+      end
+
+      def current_min_task_count
+        return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
+
+        scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
+          from = Time.parse(s["from"])
+          to = Time.parse(s["to"])
+          (from..to).cover?(Time.now)
+        }["count"]
+      end
+
+      def overheat?
+        return false unless @reach_max_at
+        (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
+      end
+
+      def fetch_service
+        res = client.describe_services(cluster: cluster, services: [name])
+        raise "Service \"#{name}\" is not found" if res.services.empty?
+        res.services[0]
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def update_service(difference, cluster_resource_manager)
+        next_desired_count = desired_count + difference
+        current_level = max_task_level(desired_count)
+        next_level = max_task_level(next_desired_count)
+        if current_level < next_level && overheat? # next max
+          level = next_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service is overheat, uses next max count"
+        elsif current_level < next_level && !overheat? # wait cooldown
+          level = current_level
+          now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+          @reach_max_at ||= now
+          @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+        elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
+          level = current_level
+          now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+          @reach_max_at ||= now
+          @logger.info "#{log_prefix} Service waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
+        elsif current_level == next_level && next_desired_count < max_task_count[current_level]
+          level = current_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service clears cooldown state"
+        elsif current_level > next_level
+          level = next_level
+          @reach_max_at = nil
+          @logger.info "#{log_prefix} Service clears cooldown state"
+        end
+
+        next_desired_count = [next_desired_count, max_task_count[level]].min
+        if next_desired_count > desired_count
+          increase_desired_count(next_desired_count - desired_count, cluster_resource_manager)
+        else
+          decrease_desired_count(desired_count - next_desired_count, cluster_resource_manager)
+        end
+
+        @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
+        @logger.info "#{log_prefix} Update desired_count to #{next_desired_count}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def increase_desired_count(by, cluster_resource_manager)
+        applied_desired_count = desired_count
+        self.desired_count += by
+
+        wait_until = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 180
+        @increase_desired_count_thread = Thread.new do
+          cl = client
+          by.times do
+            timeout = wait_until - Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            break if timeout <= 0
+            break unless cluster_resource_manager.acquire(required_capacity, timeout: timeout)
+            begin
+              cl.update_service(cluster: cluster, service: name, desired_count: applied_desired_count + 1)
+              applied_desired_count += 1
+            rescue => e
+              cluster_resource_manager.release(required_capacity)
+              AutoScaler.error_logger.error(e)
+              break
+            end
+          end
+
+          if applied_desired_count != desired_count
+            self.desired_count = applied_desired_count
+            @logger.info "#{log_prefix} Failed to update service and set desired_count to #{desired_count}"
+          end
+        end
+      end
+
+      def decrease_desired_count(by, cluster_resource_manager)
+        cl = client
+        running_task_arns = cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+
+        cl.update_service(cluster: cluster, service: name, desired_count: desired_count - by)
+
+        cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
+          w.before_wait do
+            @logger.debug "#{log_prefix} wait service stable"
+          end
+        end
+
+        stopping_task_arns = running_task_arns - cl.list_tasks(cluster: cluster, service_name: name, desired_status: "RUNNING").flat_map(&:task_arns)
+        stopping_task_arns.each_slice(MAX_DESCRIBABLE_TASK_COUNT) do |arns|
+          cl.wait_until(:tasks_stopped, cluster: cluster, tasks: arns) do |w|
+            w.before_wait do
+              @logger.debug "#{log_prefix} wait stopping tasks stopped"
+            end
+          end
+        end
+
+        cluster_resource_manager.release(required_capacity * by)
+        self.desired_count -= by
+      end
+
+      def max_task_level(count)
+        max_task_count.index { |i| count <= i } || max_task_count.size - 1
+      end
+
+      def log_prefix
+        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+      end
+    end
+  end
+end
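For orientation, a hedged sketch of how this `ServiceConfig` might be exercised on its own. It is not part of the diff: the service name, cluster, region and trigger values are placeholders, the string keys mirror how `SpotFleetRequestConfig` and `TriggerConfig` build these objects, the `ClusterResourceManager` arguments follow the call in spot_fleet_request_config.rb below, and `ConfigBase#initialize` (added elsewhere in this release) is assumed to assign the hash onto the struct members. Constructing the object already calls `DescribeServices`, so real AWS credentials and an existing service are required.

```ruby
require "logger"
require "ecs_deploy"
require "ecs_deploy/auto_scaler/service_config"
require "ecs_deploy/auto_scaler/cluster_resource_manager"

logger = Logger.new($stdout)

service = EcsDeploy::AutoScaler::ServiceConfig.new(
  {
    "name"                        => "web",         # placeholder service name
    "cluster"                     => "production",  # placeholder cluster name
    "region"                      => "ap-northeast-1",
    "step"                        => 1,             # default step inherited by triggers
    "max_task_count"              => [10, 20],      # two levels; the second unlocks only after the cooldown
    "min_task_count"              => 2,
    "cooldown_time_for_reach_max" => 300,
    "required_capacity"           => 0.5,           # capacity units one task occupies in the cluster
    "upscale_triggers"            => [{ "alarm_name" => "web-cpu-high", "state" => "ALARM", "step" => 2 }],
    "downscale_triggers"          => [{ "alarm_name" => "web-cpu-low",  "state" => "ALARM" }],
  },
  logger
)

manager = EcsDeploy::AutoScaler::ClusterResourceManager.new(
  region: "ap-northeast-1",
  cluster: "production",
  service_configs: [service],
  capacity_based_on: "vCPUs",
  logger: logger,
)

# One scaling pass: evaluate the triggers, clamp to min/max task counts, then update
# desired_count, acquiring capacity from the manager before each increment.
service.adjust_desired_count(manager)
service.wait_until_desired_count_updated
```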
data/lib/ecs_deploy/auto_scaler/spot_fleet_request_config.rb
@@ -0,0 +1,102 @@
+require "json"
+require "timeout"
+
+require "aws-sdk-ec2"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler/config_base"
+require "ecs_deploy/auto_scaler/cluster_resource_manager"
+
+module EcsDeploy
+  module AutoScaler
+    SpotFleetRequestConfig = Struct.new(:id, :region, :cluster, :buffer, :service_configs) do
+      include ConfigBase
+
+      def initialize(attributes = {}, logger)
+        attributes = attributes.dup
+        services = attributes.delete("services")
+        super(attributes, logger)
+        self.service_configs = services.map do |s|
+          ServiceConfig.new(s.merge("cluster" => cluster, "region" => region), logger)
+        end
+      end
+
+      def name
+        id
+      end
+
+      def update_desired_capacity(required_capacity)
+        terminate_orphan_instances
+
+        desired_capacity = (required_capacity + buffer.to_f).ceil
+
+        request_config = ec2_client.describe_spot_fleet_requests(
+          spot_fleet_request_ids: [id]
+        ).spot_fleet_request_configs[0].spot_fleet_request_config
+
+        return if desired_capacity == request_config.target_capacity
+
+        ec2_client.modify_spot_fleet_request(spot_fleet_request_id: id, target_capacity: desired_capacity)
+
+        cluster_resource_manager.trigger_capacity_update(
+          request_config.target_capacity,
+          desired_capacity,
+          # Wait until the capacity is updated to prevent the process from terminating before container draining is completed
+          wait_until_capacity_updated: desired_capacity < request_config.target_capacity,
+        )
+        @logger.info "#{log_prefix} Update desired_capacity to #{desired_capacity}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def cluster_resource_manager
+        @cluster_resource_manager ||= EcsDeploy::AutoScaler::ClusterResourceManager.new(
+          region: region,
+          cluster: cluster,
+          service_configs: service_configs,
+          capacity_based_on: "vCPUs",
+          logger: @logger,
+        )
+      end
+
+      private
+
+      def terminate_orphan_instances
+        container_instance_ids = cluster_resource_manager.fetch_container_instances_in_cluster.map(&:ec2_instance_id)
+        spot_fleet_instances = ec2_client.describe_spot_fleet_instances(spot_fleet_request_id: id).active_instances
+        orphans = spot_fleet_instances.reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
+
+        return if orphans.empty?
+
+        running_instances = ec2_client.describe_instances(
+          instance_ids: orphans,
+          filters: [{ name: "instance-state-name", values: ["running"] }],
+        ).reservations.flat_map(&:instances)
+        # instances which have just launched might not be registered to the cluster yet.
+        instance_ids = running_instances.select { |i| (Time.now - i.launch_time) > 600 }.map(&:instance_id)
+
+        return if instance_ids.empty?
+
+        # Terminate orphans without canceling spot instance request
+        # because we can't terminate canceled spot instances by decreasing the capacity
+        ec2_client.terminate_instances(instance_ids: instance_ids)
+
+        @logger.info "#{log_prefix} Terminated instances: #{instance_ids.inspect}"
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+
+      def ec2_client
+        Aws::EC2::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger,
+        )
+      end
+
+      def log_prefix
+        "[#{self.class.to_s.sub(/\AEcsDeploy::AutoScaler::/, "")} #{name} #{region}]"
+      end
+    end
+  end
+end
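A similarly hedged usage sketch, not taken from the diff: the fleet id, capacity figure and nested service entry are placeholders. `SpotFleetRequestConfig` wraps one spot fleet request plus the services running on it; building it instantiates a `ServiceConfig` per entry (which calls `DescribeServices`), and `update_desired_capacity` first reaps instances that never registered with the cluster, then resizes the fleet to the requested capacity plus `buffer`.

```ruby
require "logger"
require "ecs_deploy"
require "ecs_deploy/auto_scaler/service_config"
require "ecs_deploy/auto_scaler/spot_fleet_request_config"

logger = Logger.new($stdout)

fleet = EcsDeploy::AutoScaler::SpotFleetRequestConfig.new(
  {
    "id"      => "sfr-00000000-0000-0000-0000-000000000000",  # placeholder spot fleet request id
    "region"  => "ap-northeast-1",
    "cluster" => "production",
    "buffer"  => 1,  # extra capacity kept on top of what the services need
    "services" => [
      {
        "name" => "web", "step" => 1, "max_task_count" => [10], "min_task_count" => 2,
        "cooldown_time_for_reach_max" => 300, "required_capacity" => 0.5,
        "upscale_triggers" => [], "downscale_triggers" => [],
      },
    ],
  },
  logger
)

# Resize the fleet so it can hold 4 capacity units of tasks (plus the buffer).
# When shrinking, the call waits for the capacity update so container draining can finish.
fleet.update_desired_capacity(4)
```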
data/lib/ecs_deploy/auto_scaler/trigger_config.rb
@@ -0,0 +1,42 @@
+require "aws-sdk-cloudwatch"
+require "ecs_deploy"
+require "ecs_deploy/auto_scaler"
+require "ecs_deploy/auto_scaler/config_base"
+
+module EcsDeploy
+  module AutoScaler
+    TriggerConfig = Struct.new(:alarm_name, :region, :state, :step, :prioritized_over_upscale_triggers) do
+      include ConfigBase
+
+      def match?
+        fetch_alarm.state_value == state
+      end
+
+      def prioritized_over_upscale_triggers?
+        !!prioritized_over_upscale_triggers
+      end
+
+      private
+
+      def client
+        Aws::CloudWatch::Client.new(
+          access_key_id: EcsDeploy.config.access_key_id,
+          secret_access_key: EcsDeploy.config.secret_access_key,
+          region: region,
+          logger: logger
+        )
+      end
+
+      def fetch_alarm
+        res = client.describe_alarms(alarm_names: [alarm_name])
+
+        raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
+        res.metric_alarms[0].tap do |alarm|
+          AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
+        end
+      rescue => e
+        AutoScaler.error_logger.error(e)
+      end
+    end
+  end
+end
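Finally, a hedged sketch of `TriggerConfig` in isolation. It is not from the diff: the alarm name is a placeholder, `ConfigBase#initialize` is again assumed to map the string keys onto the struct members, and the module-level `AutoScaler.logger` used by `fetch_alarm` is assumed to be initialised.

```ruby
require "logger"
require "ecs_deploy"
require "ecs_deploy/auto_scaler/trigger_config"

logger = Logger.new($stdout)

trigger = EcsDeploy::AutoScaler::TriggerConfig.new(
  {
    "alarm_name" => "web-cpu-high",    # placeholder CloudWatch alarm
    "region"     => "ap-northeast-1",
    "state"      => "ALARM",           # the alarm state that should fire the trigger
    "step"       => 2,                 # how many tasks ServiceConfig adds (or removes) when it fires
    "prioritized_over_upscale_triggers" => false,
  },
  logger
)

# match? calls DescribeAlarms and compares the alarm's state_value with "state".
puts trigger.match?
```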