ecs_deploy 0.3.2 → 1.0.0

data/Rakefile CHANGED
@@ -1,2 +1,6 @@
  require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
  task :default => :spec

data/ecs_deploy.gemspec CHANGED
@@ -18,10 +18,16 @@ Gem::Specification.new do |spec|
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

- spec.add_runtime_dependency "aws-sdk", "~> 2.9"
+ spec.add_runtime_dependency "aws-sdk-autoscaling", "~> 1"
+ spec.add_runtime_dependency "aws-sdk-cloudwatch", "~> 1"
+ spec.add_runtime_dependency "aws-sdk-cloudwatchevents", "~> 1"
+ spec.add_runtime_dependency "aws-sdk-ec2", "~> 1"
+ spec.add_runtime_dependency "aws-sdk-ecs", "~> 1"
+ spec.add_runtime_dependency "aws-sdk-sqs", "~> 1"
  spec.add_runtime_dependency "terminal-table"
  spec.add_runtime_dependency "paint"

- spec.add_development_dependency "bundler", "~> 1.11"
- spec.add_development_dependency "rake", "~> 10.0"
+ spec.add_development_dependency "bundler", ">= 1.11", "< 3"
+ spec.add_development_dependency "rake", ">= 10.0"
+ spec.add_development_dependency "rspec", "~> 3.0"
  end

data/lib/ecs_deploy.rb CHANGED
@@ -1,7 +1,7 @@
  require "ecs_deploy/version"
  require "ecs_deploy/configuration"

- require 'aws-sdk'
+ require 'aws-sdk-ecs'
  require 'logger'
  require 'terminal-table'
  require 'paint'
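
The move from the monolithic aws-sdk gem to the modular v3 gems means each AWS client now comes from its own gem and is required individually. A minimal sketch of what that looks like for the clients this gem touches; the region and cluster/service names below are placeholders for illustration, not values from the diff:

require "aws-sdk-ecs"              # provides Aws::ECS::Client
require "aws-sdk-autoscaling"      # provides Aws::AutoScaling::Client
require "aws-sdk-cloudwatch"       # provides Aws::CloudWatch::Client
require "aws-sdk-cloudwatchevents" # provides Aws::CloudWatchEvents::Client
require "aws-sdk-ec2"              # provides Aws::EC2::Client
require "aws-sdk-sqs"              # provides Aws::SQS::Client

# "ap-northeast-1", "my-cluster" and "my-service" are placeholders.
ecs = Aws::ECS::Client.new(region: "ap-northeast-1")
ecs.describe_services(cluster: "my-cluster", services: ["my-service"])
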

data/lib/ecs_deploy/auto_scaler.rb CHANGED
@@ -1,6 +1,11 @@
- require 'yaml'
- require 'logger'
- require 'time'
+ require "logger"
+ require "time"
+ require "yaml"
+
+ require "ecs_deploy/auto_scaler/auto_scaling_group_config"
+ require "ecs_deploy/auto_scaler/instance_drainer"
+ require "ecs_deploy/auto_scaler/service_config"
+ require "ecs_deploy/auto_scaler/spot_fleet_request_config"

  module EcsDeploy
  module AutoScaler
@@ -8,8 +13,8 @@ module EcsDeploy
  attr_reader :logger, :error_logger

  def run(yaml_path, log_file = nil, error_log_file = nil)
- trap(:TERM) { @stop = true }
- trap(:INT) { @stop = true }
+ @enable_auto_scaling = true
+ setup_signal_handlers
  @logger = Logger.new(log_file || STDOUT)
  @logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
  STDOUT.sync = true unless log_file
@@ -17,90 +22,129 @@ module EcsDeploy
  @error_logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
  STDERR.sync = true unless error_log_file
  load_config(yaml_path)
- service_configs
- auto_scaling_group_configs

- config_groups = service_configs.group_by { |s| [s.region, s.auto_scaling_group_name] }
- ths = config_groups.map do |(region, auto_scaling_group_name), configs|
- asg_config = auto_scaling_group_configs.find { |c| c.name == auto_scaling_group_name && c.region == region }
- Thread.new(asg_config, configs, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
+ ths = (auto_scaling_group_configs + spot_fleet_request_configs).map do |cluster_scaling_config|
+ Thread.new(cluster_scaling_config, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
+ end
+
+ if @config["spot_instance_intrp_warns_queue_urls"]
+ drainer = EcsDeploy::AutoScaler::InstanceDrainer.new(
+ auto_scaling_group_configs: auto_scaling_group_configs,
+ spot_fleet_request_configs: spot_fleet_request_configs,
+ logger: logger,
+ )
+ polling_ths = @config["spot_instance_intrp_warns_queue_urls"].map do |queue_url|
+ Thread.new(queue_url) do |url|
+ drainer.poll_spot_instance_interruption_warnings(url)
+ end.tap { |th| th.abort_on_exception = true }
+ end
  end

  ths.each(&:join)
+
+ drainer&.stop
+ polling_ths&.each(&:join)
  end

- def main_loop(asg_config, configs)
- loop_with_polling_interval("loop of #{asg_config.name}") do
- ths = configs.map do |service_config|
+ def main_loop(cluster_scaling_config)
+ loop_with_polling_interval("loop of #{cluster_scaling_config.name}") do
+ ths = cluster_scaling_config.service_configs.map do |service_config|
  Thread.new(service_config) do |s|
  @logger.debug "Start service scaling of #{s.name}"
-
- if s.idle?
- @logger.debug "#{s.name} is idling"
- next
- end
-
- difference = 0
- s.upscale_triggers.each do |trigger|
- step = trigger.step || s.step
- next if difference >= step
-
- if trigger.match?
- logger.info "Fire upscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
- difference = step
- end
- end
-
- if difference == 0 && s.desired_count > s.current_min_task_count
- s.downscale_triggers.each do |trigger|
- next unless trigger.match?
-
- logger.info "Fire downscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
- step = trigger.step || s.step
- difference = [difference, -step].min
- end
- end
-
- if s.current_min_task_count > s.desired_count + difference
- difference = s.current_min_task_count - s.desired_count
- end
-
- if difference >= 0 && s.desired_count > s.max_task_count.max
- difference = s.max_task_count.max - s.desired_count
- end
-
- if difference != 0
- s.update_service(difference)
- end
+ s.adjust_desired_count(cluster_scaling_config.cluster_resource_manager)
  end
  end
  ths.each { |th| th.abort_on_exception = true }

  ths.each(&:join)

- @logger.debug "Start asg scaling of #{asg_config.name}"
+ @logger.debug "Start cluster scaling of #{cluster_scaling_config.name}"
+
+ required_capacity = cluster_scaling_config.service_configs.sum { |s| s.desired_count * s.required_capacity }
+ cluster_scaling_config.update_desired_capacity(required_capacity)

- total_service_count = configs.inject(0) { |sum, s| sum + s.desired_count }
- asg_config.update_auto_scaling_group(total_service_count, configs[0])
- asg_config.detach_and_terminate_orphan_instances(configs[0])
+ cluster_scaling_config.service_configs.each(&:wait_until_desired_count_updated)
  end
  end

  def load_config(yaml_path)
  @config = YAML.load_file(yaml_path)
  @polling_interval = @config["polling_interval"] || 30
- end
+ if @config["services"]
+ @error_logger&.warn('"services" property in root-level is deprecated. Please define it in "auto_scaling_groups" property or "spot_fleet_requests" property.')
+ @config.delete("services").each do |svc|
+ if svc["auto_scaling_group_name"] && svc["spot_fleet_request_id"]
+ raise "You can specify only one of 'auto_scaling_group_name' or 'spot_fleet_request_name'"
+ end
+
+ svc_region = svc.delete("region")
+ if svc["auto_scaling_group_name"]
+ asg_name = svc.delete("auto_scaling_group_name")
+ asg = @config["auto_scaling_groups"].find { |g| g["region"] == svc_region && g["name"] == asg_name }
+ asg["services"] ||= []
+ asg["services"] << svc
+ asg["cluster"] = svc.delete("cluster")
+ end

- def service_configs
- @service_configs ||= @config["services"].map(&ServiceConfig.method(:new))
+ if svc["spot_fleet_request_id"]
+ sfr_id = svc.delete("spot_fleet_request_id")
+ sfr = @config["spot_fleet_requests"].find { |r| r["region"] == svc_region && r["id"] == sfr_id }
+ sfr["services"] ||= []
+ sfr["services"] << svc
+ sfr["cluster"] = svc.delete("cluster")
+ end
+ end
+ end
  end

  def auto_scaling_group_configs
- @auto_scaling_group_configs ||= @config["auto_scaling_groups"].map(&AutoScalingConfig.method(:new))
+ @auto_scaling_group_configs ||= (@config["auto_scaling_groups"] || []).each.with_object({}) do |c, configs|
+ configs[c["name"]] ||= {}
+ if configs[c["name"]][c["region"]]
+ raise "Duplicate entry in auto_scaling_groups (name: #{c["name"]}, region: #{c["region"]})"
+ end
+ configs[c["name"]][c["region"]] = AutoScalingGroupConfig.new(c, @logger)
+ end.values.flat_map(&:values)
+ end
+
+ def spot_fleet_request_configs
+ @spot_fleet_request_configs ||= (@config["spot_fleet_requests"] || []).each.with_object({}) do |c, configs|
+ configs[c["id"]] ||= {}
+ if configs[c["id"]][c["region"]]
+ raise "Duplicate entry in spot_fleet_requests (id: #{c["id"]}, region: #{c["region"]})"
+ end
+ configs[c["id"]][c["region"]] = SpotFleetRequestConfig.new(c, @logger)
+ end.values.flat_map(&:values)
  end

  private

+ def setup_signal_handlers
+ # Use a thread and a queue to avoid "log writing failed. can't be called from trap context"
+ # cf. https://bugs.ruby-lang.org/issues/14222#note-3
+ signals = Queue.new
+ %i(TERM INT CONT TSTP).each do |sig|
+ trap(sig) { signals << sig }
+ end
+
+ Thread.new do
+ loop do
+ sig = signals.pop
+ case sig
+ when :INT, :TERM
+ @logger.info "Received SIG#{sig}, shutting down gracefully"
+ @stop = true
+ when :CONT
+ @logger.info "Received SIGCONT, resume auto scaling"
+ @enable_auto_scaling = true
+ when :TSTP
+ @logger.info "Received SIGTSTP, pause auto scaling. Send SIGCONT to resume it."
+ @enable_auto_scaling = false
+ end
+ end
+ end
+ end
+
  def wait_polling_interval?(last_executed_at)
  current = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
  diff = current - last_executed_at
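
Taken together, these changes deprecate the root-level "services" key: services are now nested under each entry in "auto_scaling_groups" or "spot_fleet_requests", and the scaler can drain spot instances when interruption warnings arrive on the configured SQS queues. A rough sketch of the new YAML layout, using only the top-level keys read in this diff; every value and the per-service fields shown are placeholders, and the full per-service options live in ecs_deploy/auto_scaler/service_config, which is not part of this hunk:

polling_interval: 30
auto_scaling_groups:
  - name: ecs-cluster-asg            # placeholder ASG name
    region: ap-northeast-1           # placeholder region
    cluster: my-ecs-cluster          # placeholder ECS cluster
    services:
      - name: my-web-service         # placeholder; other per-service keys come from service_config
spot_fleet_requests:
  - id: sfr-placeholder-id           # placeholder spot fleet request id
    region: ap-northeast-1
    cluster: my-ecs-cluster
    services:
      - name: my-worker-service
spot_instance_intrp_warns_queue_urls:
  - https://sqs.ap-northeast-1.amazonaws.com/000000000000/placeholder-queue

Signal handling also changes with setup_signal_handlers: SIGTERM and SIGINT stop the loop gracefully, SIGTSTP pauses auto scaling, and SIGCONT resumes it.
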
@@ -114,6 +158,7 @@ module EcsDeploy
  loop do
  break if @stop
  sleep 1
+ next unless @enable_auto_scaling
  next if wait_polling_interval?(last_executed_at)
  yield
  last_executed_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
@@ -123,284 +168,5 @@ module EcsDeploy
  @logger.debug "Stop #{name}"
  end
  end
-
- module ConfigBase
- def initialize(attributes = {})
- attributes.each do |key, val|
- send("#{key}=", val)
- end
- end
- end
-
- SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region auto_scaling_group_name step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count)
- ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
- include ConfigBase
-
- def initialize(attributes = {})
- super(attributes)
- self.idle_time ||= 60
- self.max_task_count = Array(max_task_count)
- self.upscale_triggers = upscale_triggers.to_a.map do |t|
- TriggerConfig.new(t.merge(region: region))
- end
- self.downscale_triggers = downscale_triggers.to_a.map do |t|
- TriggerConfig.new(t.merge(region: region))
- end
- self.max_task_count.sort!
- self.desired_count = fetch_service.desired_count
- @reach_max_at = nil
- @last_updated_at = nil
- end
-
- def client
- Aws::ECS::Client.new(
- access_key_id: EcsDeploy.config.access_key_id,
- secret_access_key: EcsDeploy.config.secret_access_key,
- region: region
- )
- end
-
- def idle?
- return false unless @last_updated_at
-
- diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
- diff < idle_time
- end
-
- def current_min_task_count
- return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
-
- scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
- from = Time.parse(s["from"])
- to = Time.parse(s["to"])
- (from..to).cover?(Time.now)
- }["count"]
- end
-
- def overheat?
- return false unless @reach_max_at
- (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
- end
-
- def fetch_service
- res = client.describe_services(cluster: cluster, services: [name])
- raise "Service \"#{name}\" is not found" if res.services.empty?
- res.services[0]
- rescue => e
- AutoScaler.error_logger.error(e)
- end
-
- def update_service(difference)
- next_desired_count = desired_count + difference
- current_level = max_task_level(desired_count)
- next_level = max_task_level(next_desired_count)
- if current_level < next_level && overheat? # next max
- level = next_level
- @reach_max_at = nil
- AutoScaler.logger.info "Service \"#{name}\" is overheat, uses next max count"
- elsif current_level < next_level && !overheat? # wait cooldown
- level = current_level
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
- @reach_max_at ||= now
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
- elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
- level = current_level
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
- @reach_max_at ||= now
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
- elsif current_level == next_level && next_desired_count < max_task_count[current_level]
- level = current_level
- @reach_max_at = nil
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
- elsif current_level > next_level
- level = next_level
- @reach_max_at = nil
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
- end
-
- cl = client
- next_desired_count = [next_desired_count, max_task_count[level]].min
- cl.update_service(
- cluster: cluster,
- service: name,
- desired_count: next_desired_count,
- )
- cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
- w.before_wait do
- AutoScaler.logger.debug "wait service stable [#{name}]"
- end
- end if difference < 0
- @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
- self.desired_count = next_desired_count
- AutoScaler.logger.info "Update service \"#{name}\": desired_count -> #{next_desired_count}"
- rescue => e
- AutoScaler.error_logger.error(e)
- end
-
- def fetch_container_instances
- arns = []
- resp = nil
- cl = client
- loop do
- options = {cluster: cluster}
- options.merge(next_token: resp.next_token) if resp && resp.next_token
- resp = cl.list_container_instances(options)
- arns.concat(resp.container_instance_arns)
- break unless resp.next_token
- end
-
- chunk_size = 50
- container_instances = []
- arns.each_slice(chunk_size) do |arn_chunk|
- is = cl.describe_container_instances(cluster: cluster, container_instances: arn_chunk).container_instances
- container_instances.concat(is)
- end
-
- container_instances
- end
-
- private
-
- def max_task_level(count)
- max_task_count.index { |i| count <= i } || max_task_count.size - 1
- end
- end
-
- TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
- include ConfigBase
-
- def client
- Aws::CloudWatch::Client.new(
- access_key_id: EcsDeploy.config.access_key_id,
- secret_access_key: EcsDeploy.config.secret_access_key,
- region: region
- )
- end
-
- def match?
- fetch_alarm.state_value == state
- end
-
- def fetch_alarm
- res = client.describe_alarms(alarm_names: [alarm_name])
-
- raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
- res.metric_alarms[0].tap do |alarm|
- AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
- end
- rescue => e
- AutoScaler.error_logger.error(e)
- end
- end
-
- AutoScalingConfig = Struct.new(:name, :region, :buffer) do
- include ConfigBase
-
- def client
- Aws::AutoScaling::Client.new(
- access_key_id: EcsDeploy.config.access_key_id,
- secret_access_key: EcsDeploy.config.secret_access_key,
- region: region
- )
- end
-
- def ec2_client
- Aws::EC2::Client.new(
- access_key_id: EcsDeploy.config.access_key_id,
- secret_access_key: EcsDeploy.config.secret_access_key,
- region: region
- )
- end
-
- def instances(reload: false)
- if reload || @instances.nil?
- resp = client.describe_auto_scaling_groups({
- auto_scaling_group_names: [name],
- })
- @instances = resp.auto_scaling_groups[0].instances
- else
- @instances
- end
- end
-
- def update_auto_scaling_group(total_service_count, service_config)
- desired_capacity = total_service_count + buffer.to_i
-
- current_asg = client.describe_auto_scaling_groups({
- auto_scaling_group_names: [name],
- }).auto_scaling_groups[0]
-
- if current_asg.desired_capacity > desired_capacity
- diff = current_asg.desired_capacity - desired_capacity
- container_instances = service_config.fetch_container_instances
- deregisterable_instances = container_instances.select do |i|
- i.pending_tasks_count == 0 && i.running_tasks_count == 0
- end
-
- AutoScaler.logger.info "Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"
-
- deregistered_instance_ids = []
- deregisterable_instances.each do |i|
- break if deregistered_instance_ids.size >= diff
-
- begin
- service_config.client.deregister_container_instance(cluster: service_config.cluster, container_instance: i.container_instance_arn, force: false)
- deregistered_instance_ids << i.ec2_instance_id
- rescue Aws::ECS::Errors::InvalidParameterException
- end
- end
-
- AutoScaler.logger.info "Deregistered instances: #{deregistered_instance_ids.inspect}"
-
- detach_and_terminate_instances(deregistered_instance_ids)
-
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
- elsif current_asg.desired_capacity < desired_capacity
- client.update_auto_scaling_group(
- auto_scaling_group_name: name,
- min_size: 0,
- max_size: [current_asg.max_size, desired_capacity].max,
- desired_capacity: desired_capacity,
- )
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
- end
- rescue => e
- AutoScaler.error_logger.error(e)
- end
-
- def detach_and_terminate_instances(instance_ids)
- return if instance_ids.empty?
-
- client.detach_instances(
- auto_scaling_group_name: name,
- instance_ids: instance_ids,
- should_decrement_desired_capacity: true
- )
-
- AutoScaler.logger.info "Detach instances from ASG #{name}: #{instance_ids.inspect}"
- sleep 3
-
- ec2_client.terminate_instances(instance_ids: instance_ids)
-
- AutoScaler.logger.info "Terminated instances: #{instance_ids.inspect}"
- rescue => e
- AutoScaler.error_logger.error(e)
- end
-
- def detach_and_terminate_orphan_instances(service_config)
- container_instance_ids = service_config.fetch_container_instances.map(&:ec2_instance_id)
- orphans = instances(reload: true).reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
-
- return if orphans.empty?
-
- targets = ec2_client.describe_instances(instance_ids: orphans).reservations[0].instances.select do |i|
- (Time.now - i.launch_time) > 600
- end
-
- detach_and_terminate_instances(targets.map(&:instance_id))
- rescue => e
- AutoScaler.error_logger.error(e)
- end
- end
  end
  end