ecs_deploy 0.2.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,2 +1,6 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
2
6
  task :default => :spec
data/ecs_deploy.gemspec CHANGED
@@ -18,10 +18,16 @@ Gem::Specification.new do |spec|
18
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_runtime_dependency "aws-sdk", "~> 2.4"
21
+ spec.add_runtime_dependency "aws-sdk-autoscaling", "~> 1"
22
+ spec.add_runtime_dependency "aws-sdk-cloudwatch", "~> 1"
23
+ spec.add_runtime_dependency "aws-sdk-cloudwatchevents", "~> 1"
24
+ spec.add_runtime_dependency "aws-sdk-ec2", "~> 1"
25
+ spec.add_runtime_dependency "aws-sdk-ecs", "~> 1"
26
+ spec.add_runtime_dependency "aws-sdk-sqs", "~> 1"
22
27
  spec.add_runtime_dependency "terminal-table"
23
28
  spec.add_runtime_dependency "paint"
24
29
 
25
- spec.add_development_dependency "bundler", "~> 1.11"
26
- spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "bundler", ">= 1.11", "< 3"
31
+ spec.add_development_dependency "rake", ">= 10.0"
32
+ spec.add_development_dependency "rspec", "~> 3.0"
27
33
  end
data/lib/ecs_deploy.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require "ecs_deploy/version"
2
2
  require "ecs_deploy/configuration"
3
3
 
4
- require 'aws-sdk'
4
+ require 'aws-sdk-ecs'
5
5
  require 'logger'
6
6
  require 'terminal-table'
7
7
  require 'paint'
@@ -27,3 +27,4 @@ end
27
27
 
28
28
  require "ecs_deploy/task_definition"
29
29
  require "ecs_deploy/service"
30
+ require "ecs_deploy/scheduled_task"
data/lib/ecs_deploy/auto_scaler.rb CHANGED
@@ -1,6 +1,11 @@
1
- require 'yaml'
2
- require 'logger'
3
- require 'time'
1
+ require "logger"
2
+ require "time"
3
+ require "yaml"
4
+
5
+ require "ecs_deploy/auto_scaler/auto_scaling_group_config"
6
+ require "ecs_deploy/auto_scaler/instance_drainer"
7
+ require "ecs_deploy/auto_scaler/service_config"
8
+ require "ecs_deploy/auto_scaler/spot_fleet_request_config"
4
9
 
5
10
  module EcsDeploy
6
11
  module AutoScaler
@@ -8,8 +13,8 @@ module EcsDeploy
8
13
  attr_reader :logger, :error_logger
9
14
 
10
15
  def run(yaml_path, log_file = nil, error_log_file = nil)
11
- trap(:TERM) { @stop = true }
12
- trap(:INT) { @stop = true }
16
+ @enable_auto_scaling = true
17
+ setup_signal_handlers
13
18
  @logger = Logger.new(log_file || STDOUT)
14
19
  @logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
15
20
  STDOUT.sync = true unless log_file
@@ -17,86 +22,129 @@ module EcsDeploy
17
22
  @error_logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
18
23
  STDERR.sync = true unless error_log_file
19
24
  load_config(yaml_path)
20
- service_configs
21
- auto_scaling_group_configs
22
25
 
23
- config_groups = service_configs.group_by { |s| [s.region, s.auto_scaling_group_name] }
24
- ths = config_groups.map do |(region, auto_scaling_group_name), configs|
25
- asg_config = auto_scaling_group_configs.find { |c| c.name == auto_scaling_group_name && c.region == region }
26
- Thread.new(asg_config, configs, &method(:main_loop))
26
+ ths = (auto_scaling_group_configs + spot_fleet_request_configs).map do |cluster_scaling_config|
27
+ Thread.new(cluster_scaling_config, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
28
+ end
29
+
30
+ if @config["spot_instance_intrp_warns_queue_urls"]
31
+ drainer = EcsDeploy::AutoScaler::InstanceDrainer.new(
32
+ auto_scaling_group_configs: auto_scaling_group_configs,
33
+ spot_fleet_request_configs: spot_fleet_request_configs,
34
+ logger: logger,
35
+ )
36
+ polling_ths = @config["spot_instance_intrp_warns_queue_urls"].map do |queue_url|
37
+ Thread.new(queue_url) do |url|
38
+ drainer.poll_spot_instance_interruption_warnings(url)
39
+ end.tap { |th| th.abort_on_exception = true }
40
+ end
27
41
  end
28
42
 
29
43
  ths.each(&:join)
44
+
45
+ drainer&.stop
46
+ polling_ths&.each(&:join)
30
47
  end
31
48
 
32
- def main_loop(asg_config, configs)
33
- loop_with_polling_interval("loop of #{asg_config.name}") do
34
- ths = configs.map do |service_config|
49
+ def main_loop(cluster_scaling_config)
50
+ loop_with_polling_interval("loop of #{cluster_scaling_config.name}") do
51
+ ths = cluster_scaling_config.service_configs.map do |service_config|
35
52
  Thread.new(service_config) do |s|
36
- next if s.idle?
37
-
38
53
  @logger.debug "Start service scaling of #{s.name}"
39
-
40
- difference = 0
41
- s.upscale_triggers.each do |trigger|
42
- step = trigger.step || s.step
43
- next if difference >= step
44
-
45
- if trigger.match?
46
- logger.info "Fire upscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
47
- difference = step
48
- end
49
- end
50
-
51
- if difference == 0 && s.desired_count > s.current_min_task_count
52
- s.downscale_triggers.each do |trigger|
53
- next unless trigger.match?
54
-
55
- logger.info "Fire downscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
56
- step = trigger.step || s.step
57
- difference = [difference, -step].min
58
- end
59
- end
60
-
61
- if s.current_min_task_count > s.desired_count + difference
62
- difference = s.current_min_task_count - s.desired_count
63
- end
64
-
65
- if difference >= 0 && s.desired_count > s.max_task_count.max
66
- difference = s.max_task_count.max - s.desired_count
67
- end
68
-
69
- if difference != 0
70
- s.update_service(difference)
71
- end
54
+ s.adjust_desired_count(cluster_scaling_config.cluster_resource_manager)
72
55
  end
73
56
  end
57
+ ths.each { |th| th.abort_on_exception = true }
74
58
 
75
59
  ths.each(&:join)
76
60
 
77
- @logger.debug "Start asg scaling of #{asg_config.name}"
61
+ @logger.debug "Start cluster scaling of #{cluster_scaling_config.name}"
78
62
 
79
- total_service_count = configs.inject(0) { |sum, s| sum + s.desired_count }
80
- asg_config.update_auto_scaling_group(total_service_count, configs[0])
81
- asg_config.detach_and_terminate_orphan_instances(configs[0])
63
+ required_capacity = cluster_scaling_config.service_configs.sum { |s| s.desired_count * s.required_capacity }
64
+ cluster_scaling_config.update_desired_capacity(required_capacity)
65
+
66
+ cluster_scaling_config.service_configs.each(&:wait_until_desired_count_updated)
82
67
  end
83
68
  end
84
69
 
85
70
  def load_config(yaml_path)
86
71
  @config = YAML.load_file(yaml_path)
87
72
  @polling_interval = @config["polling_interval"] || 30
88
- end
73
+ if @config["services"]
74
+ @error_logger&.warn('"services" property in root-level is deprecated. Please define it in "auto_scaling_groups" property or "spot_fleet_requests" property.')
75
+ @config.delete("services").each do |svc|
76
+ if svc["auto_scaling_group_name"] && svc["spot_fleet_request_id"]
77
+ raise "You can specify only one of 'auto_scaling_group_name' or 'spot_fleet_request_name'"
78
+ end
79
+
80
+ svc_region = svc.delete("region")
81
+ if svc["auto_scaling_group_name"]
82
+ asg_name = svc.delete("auto_scaling_group_name")
83
+ asg = @config["auto_scaling_groups"].find { |g| g["region"] == svc_region && g["name"] == asg_name }
84
+ asg["services"] ||= []
85
+ asg["services"] << svc
86
+ asg["cluster"] = svc.delete("cluster")
87
+ end
89
88
 
90
- def service_configs
91
- @service_configs ||= @config["services"].map(&ServiceConfig.method(:new))
89
+ if svc["spot_fleet_request_id"]
90
+ sfr_id = svc.delete("spot_fleet_request_id")
91
+ sfr = @config["spot_fleet_requests"].find { |r| r["region"] == svc_region && r["id"] == sfr_id }
92
+ sfr["services"] ||= []
93
+ sfr["services"] << svc
94
+ sfr["cluster"] = svc.delete("cluster")
95
+ end
96
+ end
97
+ end
92
98
  end
93
99
 
94
100
  def auto_scaling_group_configs
95
- @auto_scaling_group_configs ||= @config["auto_scaling_groups"].map(&AutoScalingConfig.method(:new))
101
+ @auto_scaling_group_configs ||= (@config["auto_scaling_groups"] || []).each.with_object({}) do |c, configs|
102
+ configs[c["name"]] ||= {}
103
+ if configs[c["name"]][c["region"]]
104
+ raise "Duplicate entry in auto_scaling_groups (name: #{c["name"]}, region: #{c["region"]})"
105
+ end
106
+ configs[c["name"]][c["region"]] = AutoScalingGroupConfig.new(c, @logger)
107
+ end.values.flat_map(&:values)
108
+ end
109
+
110
+ def spot_fleet_request_configs
111
+ @spot_fleet_request_configs ||= (@config["spot_fleet_requests"] || []).each.with_object({}) do |c, configs|
112
+ configs[c["id"]] ||= {}
113
+ if configs[c["id"]][c["region"]]
114
+ raise "Duplicate entry in spot_fleet_requests (id: #{c["id"]}, region: #{c["region"]})"
115
+ end
116
+ configs[c["id"]][c["region"]] = SpotFleetRequestConfig.new(c, @logger)
117
+ end.values.flat_map(&:values)
96
118
  end
97
119
 
98
120
  private
99
121
 
122
+ def setup_signal_handlers
123
+ # Use a thread and a queue to avoid "log writing failed. can't be called from trap context"
124
+ # cf. https://bugs.ruby-lang.org/issues/14222#note-3
125
+ signals = Queue.new
126
+ %i(TERM INT CONT TSTP).each do |sig|
127
+ trap(sig) { signals << sig }
128
+ end
129
+
130
+ Thread.new do
131
+ loop do
132
+ sig = signals.pop
133
+ case sig
134
+ when :INT, :TERM
135
+ @logger.info "Received SIG#{sig}, shutting down gracefully"
136
+ @stop = true
137
+ when :CONT
138
+ @logger.info "Received SIGCONT, resume auto scaling"
139
+ @enable_auto_scaling = true
140
+ when :TSTP
141
+ @logger.info "Received SIGTSTP, pause auto scaling. Send SIGCONT to resume it."
142
+ @enable_auto_scaling = false
143
+ end
144
+ end
145
+ end
146
+ end
147
+
100
148
  def wait_polling_interval?(last_executed_at)
101
149
  current = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
102
150
  diff = current - last_executed_at
@@ -110,314 +158,15 @@ module EcsDeploy
110
158
  loop do
111
159
  break if @stop
112
160
  sleep 1
161
+ next unless @enable_auto_scaling
113
162
  next if wait_polling_interval?(last_executed_at)
114
163
  yield
115
164
  last_executed_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
165
+ @logger.debug "#{name} is last executed at #{last_executed_at}"
116
166
  end
117
167
 
118
168
  @logger.debug "Stop #{name}"
119
169
  end
120
170
  end
121
-
122
- module ConfigBase
123
- def initialize(attributes = {})
124
- attributes.each do |key, val|
125
- send("#{key}=", val)
126
- end
127
- end
128
- end
129
-
130
- SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region auto_scaling_group_name step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count)
131
- ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
132
- include ConfigBase
133
-
134
- def initialize(attributes = {})
135
- super(attributes)
136
- self.idle_time ||= 60
137
- self.max_task_count = Array(max_task_count)
138
- self.upscale_triggers = upscale_triggers.to_a.map do |t|
139
- TriggerConfig.new(t.merge(region: region))
140
- end
141
- self.downscale_triggers = downscale_triggers.to_a.map do |t|
142
- TriggerConfig.new(t.merge(region: region))
143
- end
144
- self.max_task_count.sort!
145
- self.desired_count = fetch_service.desired_count
146
- @reach_max_at = nil
147
- @last_updated_at = nil
148
- end
149
-
150
- def client
151
- Thread.current["ecs_auto_scaler_ecs_#{region}"] ||= Aws::ECS::Client.new(
152
- access_key_id: EcsDeploy.config.access_key_id,
153
- secret_access_key: EcsDeploy.config.secret_access_key,
154
- region: region
155
- )
156
- end
157
-
158
- def clear_client
159
- Thread.current["ecs_auto_scaler_ecs_#{region}"] = nil
160
- end
161
-
162
- def idle?
163
- return false unless @last_updated_at
164
-
165
- diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
166
- diff < idle_time
167
- end
168
-
169
- def current_min_task_count
170
- return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
171
-
172
- scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
173
- from = Time.parse(s["from"])
174
- to = Time.parse(s["to"])
175
- (from..to).cover?(Time.now)
176
- }["count"]
177
- end
178
-
179
- def overheat?
180
- return false unless @reach_max_at
181
- (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
182
- end
183
-
184
- def fetch_service
185
- res = client.describe_services(cluster: cluster, services: [name])
186
- raise "Service \"#{name}\" is not found" if res.services.empty?
187
- res.services[0]
188
- rescue => e
189
- AutoScaler.error_logger.error(e)
190
- clear_client
191
- end
192
-
193
- def update_service(difference)
194
- next_desired_count = desired_count + difference
195
- current_level = max_task_level(desired_count)
196
- next_level = max_task_level(next_desired_count)
197
- if current_level < next_level && overheat? # next max
198
- level = next_level
199
- @reach_max_at = nil
200
- AutoScaler.logger.info "Service \"#{name}\" is overheat, uses next max count"
201
- elsif current_level < next_level && !overheat? # wait cooldown
202
- level = current_level
203
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
204
- @reach_max_at ||= now
205
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
206
- elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
207
- level = current_level
208
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
209
- @reach_max_at ||= now
210
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
211
- elsif current_level == next_level && next_desired_count < max_task_count[current_level]
212
- level = current_level
213
- @reach_max_at = nil
214
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
215
- elsif current_level > next_level
216
- level = next_level
217
- @reach_max_at = nil
218
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
219
- end
220
-
221
- next_desired_count = [next_desired_count, max_task_count[level]].min
222
- client.update_service(
223
- cluster: cluster,
224
- service: name,
225
- desired_count: next_desired_count,
226
- )
227
- client.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
228
- w.before_wait do
229
- AutoScaler.logger.debug "wait service stable [#{name}]"
230
- end
231
- end if difference < 0
232
- @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
233
- self.desired_count = next_desired_count
234
- AutoScaler.logger.info "Update service \"#{name}\": desired_count -> #{next_desired_count}"
235
- rescue => e
236
- AutoScaler.error_logger.error(e)
237
- clear_client
238
- end
239
-
240
- def fetch_container_instances
241
- arns = []
242
- resp = nil
243
- loop do
244
- options = {cluster: cluster}
245
- options.merge(next_token: resp.next_token) if resp && resp.next_token
246
- resp = client.list_container_instances(options)
247
- arns.concat(resp.container_instance_arns)
248
- break unless resp.next_token
249
- end
250
-
251
- chunk_size = 50
252
- container_instances = []
253
- arns.each_slice(chunk_size) do |arn_chunk|
254
- is = client.describe_container_instances(cluster: cluster, container_instances: arn_chunk).container_instances
255
- container_instances.concat(is)
256
- end
257
-
258
- container_instances
259
- end
260
-
261
- private
262
-
263
- def max_task_level(count)
264
- max_task_count.index { |i| count <= i } || max_task_count.size - 1
265
- end
266
- end
267
-
268
- TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
269
- include ConfigBase
270
-
271
- def client
272
- Thread.current["ecs_auto_scaler_cloud_watch_#{region}"] ||= Aws::CloudWatch::Client.new(
273
- access_key_id: EcsDeploy.config.access_key_id,
274
- secret_access_key: EcsDeploy.config.secret_access_key,
275
- region: region
276
- )
277
- end
278
-
279
- def clear_client
280
- Thread.current["ecs_auto_scaler_cloud_watch_#{region}"] = nil
281
- end
282
-
283
- def match?
284
- fetch_alarm.state_value == state
285
- end
286
-
287
- def fetch_alarm
288
- res = client.describe_alarms(alarm_names: [alarm_name])
289
-
290
- raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
291
- res.metric_alarms[0].tap do |alarm|
292
- AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
293
- end
294
- rescue => e
295
- AutoScaler.error_logger.error(e)
296
- clear_client
297
- end
298
- end
299
-
300
- AutoScalingConfig = Struct.new(:name, :region, :buffer) do
301
- include ConfigBase
302
-
303
- def client
304
- Thread.current["ecs_auto_scaler_auto_scaling_#{region}"] ||= Aws::AutoScaling::Client.new(
305
- access_key_id: EcsDeploy.config.access_key_id,
306
- secret_access_key: EcsDeploy.config.secret_access_key,
307
- region: region
308
- )
309
- end
310
-
311
- def clear_client
312
- Thread.current["ecs_auto_scaler_auto_scaling_#{region}"] = nil
313
- end
314
-
315
- def ec2_client
316
- Thread.current["ecs_auto_scaler_ec2_#{region}"] ||= Aws::EC2::Client.new(
317
- access_key_id: EcsDeploy.config.access_key_id,
318
- secret_access_key: EcsDeploy.config.secret_access_key,
319
- region: region
320
- )
321
- end
322
-
323
- def clear_ec2_client
324
- Thread.current["ecs_auto_scaler_ec2_#{region}"] = nil
325
- end
326
-
327
- def instances(reload: false)
328
- if reload || @instances.nil?
329
- resp = client.describe_auto_scaling_groups({
330
- auto_scaling_group_names: [name],
331
- })
332
- @instances = resp.auto_scaling_groups[0].instances
333
- else
334
- @instances
335
- end
336
- end
337
-
338
- def update_auto_scaling_group(total_service_count, service_config)
339
- desired_capacity = total_service_count + buffer.to_i
340
-
341
- current_asg = client.describe_auto_scaling_groups({
342
- auto_scaling_group_names: [name],
343
- }).auto_scaling_groups[0]
344
-
345
- if current_asg.desired_capacity > desired_capacity
346
- diff = current_asg.desired_capacity - desired_capacity
347
- container_instances = service_config.fetch_container_instances
348
- deregisterable_instances = container_instances.select do |i|
349
- i.pending_tasks_count == 0 && i.running_tasks_count == 0
350
- end
351
-
352
- AutoScaler.logger.info "Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"
353
-
354
- deregistered_instance_ids = []
355
- deregisterable_instances.each do |i|
356
- break if deregistered_instance_ids.size >= diff
357
-
358
- begin
359
- service_config.client.deregister_container_instance(cluster: service_config.cluster, container_instance: i.container_instance_arn, force: false)
360
- deregistered_instance_ids << i.ec2_instance_id
361
- rescue Aws::ECS::Errors::InvalidParameterException
362
- end
363
- end
364
-
365
- AutoScaler.logger.info "Deregistered instances: #{deregistered_instance_ids.inspect}"
366
-
367
- detach_and_terminate_instances(deregistered_instance_ids)
368
-
369
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
370
- elsif current_asg.desired_capacity < desired_capacity
371
- client.update_auto_scaling_group(
372
- auto_scaling_group_name: name,
373
- min_size: 0,
374
- max_size: [current_asg.max_size, desired_capacity].max,
375
- desired_capacity: desired_capacity,
376
- )
377
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
378
- end
379
- rescue => e
380
- AutoScaler.error_logger.error(e)
381
- clear_client
382
- end
383
-
384
- def detach_and_terminate_instances(instance_ids)
385
- return if instance_ids.empty?
386
-
387
- client.detach_instances(
388
- auto_scaling_group_name: name,
389
- instance_ids: instance_ids,
390
- should_decrement_desired_capacity: true
391
- )
392
-
393
- AutoScaler.logger.info "Detach instances from ASG #{name}: #{instance_ids.inspect}"
394
- sleep 3
395
-
396
- ec2_client.terminate_instances(instance_ids: instance_ids)
397
-
398
- AutoScaler.logger.info "Terminated instances: #{instance_ids.inspect}"
399
- rescue => e
400
- AutoScaler.error_logger.error(e)
401
- clear_client
402
- clear_ec2_client
403
- end
404
-
405
- def detach_and_terminate_orphan_instances(service_config)
406
- container_instance_ids = service_config.fetch_container_instances.map(&:ec2_instance_id)
407
- orphans = instances(reload: true).reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
408
-
409
- return if orphans.empty?
410
-
411
- targets = ec2_client.describe_instances(instance_ids: orphans).reservations[0].instances.select do |i|
412
- (Time.now - i.launch_time) > 600
413
- end
414
-
415
- detach_and_terminate_instances(targets.map(&:instance_id))
416
- rescue => e
417
- AutoScaler.error_logger.error(e)
418
- clear_client
419
- clear_ec2_client
420
- end
421
- end
422
171
  end
423
172
  end