ecs_deploy 0.2.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,2 +1,6 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
2
6
  task :default => :spec
data/ecs_deploy.gemspec CHANGED
@@ -18,10 +18,16 @@ Gem::Specification.new do |spec|
18
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_runtime_dependency "aws-sdk", "~> 2.4"
21
+ spec.add_runtime_dependency "aws-sdk-autoscaling", "~> 1"
22
+ spec.add_runtime_dependency "aws-sdk-cloudwatch", "~> 1"
23
+ spec.add_runtime_dependency "aws-sdk-cloudwatchevents", "~> 1"
24
+ spec.add_runtime_dependency "aws-sdk-ec2", "~> 1"
25
+ spec.add_runtime_dependency "aws-sdk-ecs", "~> 1"
26
+ spec.add_runtime_dependency "aws-sdk-sqs", "~> 1"
22
27
  spec.add_runtime_dependency "terminal-table"
23
28
  spec.add_runtime_dependency "paint"
24
29
 
25
- spec.add_development_dependency "bundler", "~> 1.11"
26
- spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "bundler", ">= 1.11", "< 3"
31
+ spec.add_development_dependency "rake", ">= 10.0"
32
+ spec.add_development_dependency "rspec", "~> 3.0"
27
33
  end
data/lib/ecs_deploy.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require "ecs_deploy/version"
2
2
  require "ecs_deploy/configuration"
3
3
 
4
- require 'aws-sdk'
4
+ require 'aws-sdk-ecs'
5
5
  require 'logger'
6
6
  require 'terminal-table'
7
7
  require 'paint'
@@ -27,3 +27,4 @@ end
27
27
 
28
28
  require "ecs_deploy/task_definition"
29
29
  require "ecs_deploy/service"
30
+ require "ecs_deploy/scheduled_task"
@@ -1,6 +1,11 @@
1
- require 'yaml'
2
- require 'logger'
3
- require 'time'
1
+ require "logger"
2
+ require "time"
3
+ require "yaml"
4
+
5
+ require "ecs_deploy/auto_scaler/auto_scaling_group_config"
6
+ require "ecs_deploy/auto_scaler/instance_drainer"
7
+ require "ecs_deploy/auto_scaler/service_config"
8
+ require "ecs_deploy/auto_scaler/spot_fleet_request_config"
4
9
 
5
10
  module EcsDeploy
6
11
  module AutoScaler
@@ -8,8 +13,8 @@ module EcsDeploy
8
13
  attr_reader :logger, :error_logger
9
14
 
10
15
  def run(yaml_path, log_file = nil, error_log_file = nil)
11
- trap(:TERM) { @stop = true }
12
- trap(:INT) { @stop = true }
16
+ @enable_auto_scaling = true
17
+ setup_signal_handlers
13
18
  @logger = Logger.new(log_file || STDOUT)
14
19
  @logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
15
20
  STDOUT.sync = true unless log_file
@@ -17,86 +22,129 @@ module EcsDeploy
17
22
  @error_logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
18
23
  STDERR.sync = true unless error_log_file
19
24
  load_config(yaml_path)
20
- service_configs
21
- auto_scaling_group_configs
22
25
 
23
- config_groups = service_configs.group_by { |s| [s.region, s.auto_scaling_group_name] }
24
- ths = config_groups.map do |(region, auto_scaling_group_name), configs|
25
- asg_config = auto_scaling_group_configs.find { |c| c.name == auto_scaling_group_name && c.region == region }
26
- Thread.new(asg_config, configs, &method(:main_loop))
26
+ ths = (auto_scaling_group_configs + spot_fleet_request_configs).map do |cluster_scaling_config|
27
+ Thread.new(cluster_scaling_config, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
28
+ end
29
+
30
+ if @config["spot_instance_intrp_warns_queue_urls"]
31
+ drainer = EcsDeploy::AutoScaler::InstanceDrainer.new(
32
+ auto_scaling_group_configs: auto_scaling_group_configs,
33
+ spot_fleet_request_configs: spot_fleet_request_configs,
34
+ logger: logger,
35
+ )
36
+ polling_ths = @config["spot_instance_intrp_warns_queue_urls"].map do |queue_url|
37
+ Thread.new(queue_url) do |url|
38
+ drainer.poll_spot_instance_interruption_warnings(url)
39
+ end.tap { |th| th.abort_on_exception = true }
40
+ end
27
41
  end
28
42
 
29
43
  ths.each(&:join)
44
+
45
+ drainer&.stop
46
+ polling_ths&.each(&:join)
30
47
  end
31
48
 
32
- def main_loop(asg_config, configs)
33
- loop_with_polling_interval("loop of #{asg_config.name}") do
34
- ths = configs.map do |service_config|
49
+ def main_loop(cluster_scaling_config)
50
+ loop_with_polling_interval("loop of #{cluster_scaling_config.name}") do
51
+ ths = cluster_scaling_config.service_configs.map do |service_config|
35
52
  Thread.new(service_config) do |s|
36
- next if s.idle?
37
-
38
53
  @logger.debug "Start service scaling of #{s.name}"
39
-
40
- difference = 0
41
- s.upscale_triggers.each do |trigger|
42
- step = trigger.step || s.step
43
- next if difference >= step
44
-
45
- if trigger.match?
46
- logger.info "Fire upscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
47
- difference = step
48
- end
49
- end
50
-
51
- if difference == 0 && s.desired_count > s.current_min_task_count
52
- s.downscale_triggers.each do |trigger|
53
- next unless trigger.match?
54
-
55
- logger.info "Fire downscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
56
- step = trigger.step || s.step
57
- difference = [difference, -step].min
58
- end
59
- end
60
-
61
- if s.current_min_task_count > s.desired_count + difference
62
- difference = s.current_min_task_count - s.desired_count
63
- end
64
-
65
- if difference >= 0 && s.desired_count > s.max_task_count.max
66
- difference = s.max_task_count.max - s.desired_count
67
- end
68
-
69
- if difference != 0
70
- s.update_service(difference)
71
- end
54
+ s.adjust_desired_count(cluster_scaling_config.cluster_resource_manager)
72
55
  end
73
56
  end
57
+ ths.each { |th| th.abort_on_exception = true }
74
58
 
75
59
  ths.each(&:join)
76
60
 
77
- @logger.debug "Start asg scaling of #{asg_config.name}"
61
+ @logger.debug "Start cluster scaling of #{cluster_scaling_config.name}"
78
62
 
79
- total_service_count = configs.inject(0) { |sum, s| sum + s.desired_count }
80
- asg_config.update_auto_scaling_group(total_service_count, configs[0])
81
- asg_config.detach_and_terminate_orphan_instances(configs[0])
63
+ required_capacity = cluster_scaling_config.service_configs.sum { |s| s.desired_count * s.required_capacity }
64
+ cluster_scaling_config.update_desired_capacity(required_capacity)
65
+
66
+ cluster_scaling_config.service_configs.each(&:wait_until_desired_count_updated)
82
67
  end
83
68
  end
84
69
 
85
70
  def load_config(yaml_path)
86
71
  @config = YAML.load_file(yaml_path)
87
72
  @polling_interval = @config["polling_interval"] || 30
88
- end
73
+ if @config["services"]
74
+ @error_logger&.warn('"services" property in root-level is deprecated. Please define it in "auto_scaling_groups" property or "spot_fleet_requests" property.')
75
+ @config.delete("services").each do |svc|
76
+ if svc["auto_scaling_group_name"] && svc["spot_fleet_request_id"]
77
+ raise "You can specify only one of 'auto_scaling_group_name' or 'spot_fleet_request_name'"
78
+ end
79
+
80
+ svc_region = svc.delete("region")
81
+ if svc["auto_scaling_group_name"]
82
+ asg_name = svc.delete("auto_scaling_group_name")
83
+ asg = @config["auto_scaling_groups"].find { |g| g["region"] == svc_region && g["name"] == asg_name }
84
+ asg["services"] ||= []
85
+ asg["services"] << svc
86
+ asg["cluster"] = svc.delete("cluster")
87
+ end
89
88
 
90
- def service_configs
91
- @service_configs ||= @config["services"].map(&ServiceConfig.method(:new))
89
+ if svc["spot_fleet_request_id"]
90
+ sfr_id = svc.delete("spot_fleet_request_id")
91
+ sfr = @config["spot_fleet_requests"].find { |r| r["region"] == svc_region && r["id"] == sfr_id }
92
+ sfr["services"] ||= []
93
+ sfr["services"] << svc
94
+ sfr["cluster"] = svc.delete("cluster")
95
+ end
96
+ end
97
+ end
92
98
  end
93
99
 
94
100
  def auto_scaling_group_configs
95
- @auto_scaling_group_configs ||= @config["auto_scaling_groups"].map(&AutoScalingConfig.method(:new))
101
+ @auto_scaling_group_configs ||= (@config["auto_scaling_groups"] || []).each.with_object({}) do |c, configs|
102
+ configs[c["name"]] ||= {}
103
+ if configs[c["name"]][c["region"]]
104
+ raise "Duplicate entry in auto_scaling_groups (name: #{c["name"]}, region: #{c["region"]})"
105
+ end
106
+ configs[c["name"]][c["region"]] = AutoScalingGroupConfig.new(c, @logger)
107
+ end.values.flat_map(&:values)
108
+ end
109
+
110
+ def spot_fleet_request_configs
111
+ @spot_fleet_request_configs ||= (@config["spot_fleet_requests"] || []).each.with_object({}) do |c, configs|
112
+ configs[c["id"]] ||= {}
113
+ if configs[c["id"]][c["region"]]
114
+ raise "Duplicate entry in spot_fleet_requests (id: #{c["id"]}, region: #{c["region"]})"
115
+ end
116
+ configs[c["id"]][c["region"]] = SpotFleetRequestConfig.new(c, @logger)
117
+ end.values.flat_map(&:values)
96
118
  end
97
119
 
98
120
  private
99
121
 
122
+ def setup_signal_handlers
123
+ # Use a thread and a queue to avoid "log writing failed. can't be called from trap context"
124
+ # cf. https://bugs.ruby-lang.org/issues/14222#note-3
125
+ signals = Queue.new
126
+ %i(TERM INT CONT TSTP).each do |sig|
127
+ trap(sig) { signals << sig }
128
+ end
129
+
130
+ Thread.new do
131
+ loop do
132
+ sig = signals.pop
133
+ case sig
134
+ when :INT, :TERM
135
+ @logger.info "Received SIG#{sig}, shutting down gracefully"
136
+ @stop = true
137
+ when :CONT
138
+ @logger.info "Received SIGCONT, resume auto scaling"
139
+ @enable_auto_scaling = true
140
+ when :TSTP
141
+ @logger.info "Received SIGTSTP, pause auto scaling. Send SIGCONT to resume it."
142
+ @enable_auto_scaling = false
143
+ end
144
+ end
145
+ end
146
+ end
147
+
100
148
  def wait_polling_interval?(last_executed_at)
101
149
  current = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
102
150
  diff = current - last_executed_at
@@ -110,314 +158,15 @@ module EcsDeploy
110
158
  loop do
111
159
  break if @stop
112
160
  sleep 1
161
+ next unless @enable_auto_scaling
113
162
  next if wait_polling_interval?(last_executed_at)
114
163
  yield
115
164
  last_executed_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
165
+ @logger.debug "#{name} is last executed at #{last_executed_at}"
116
166
  end
117
167
 
118
168
  @logger.debug "Stop #{name}"
119
169
  end
120
170
  end
121
-
122
- module ConfigBase
123
- def initialize(attributes = {})
124
- attributes.each do |key, val|
125
- send("#{key}=", val)
126
- end
127
- end
128
- end
129
-
130
- SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region auto_scaling_group_name step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count)
131
- ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
132
- include ConfigBase
133
-
134
- def initialize(attributes = {})
135
- super(attributes)
136
- self.idle_time ||= 60
137
- self.max_task_count = Array(max_task_count)
138
- self.upscale_triggers = upscale_triggers.to_a.map do |t|
139
- TriggerConfig.new(t.merge(region: region))
140
- end
141
- self.downscale_triggers = downscale_triggers.to_a.map do |t|
142
- TriggerConfig.new(t.merge(region: region))
143
- end
144
- self.max_task_count.sort!
145
- self.desired_count = fetch_service.desired_count
146
- @reach_max_at = nil
147
- @last_updated_at = nil
148
- end
149
-
150
- def client
151
- Thread.current["ecs_auto_scaler_ecs_#{region}"] ||= Aws::ECS::Client.new(
152
- access_key_id: EcsDeploy.config.access_key_id,
153
- secret_access_key: EcsDeploy.config.secret_access_key,
154
- region: region
155
- )
156
- end
157
-
158
- def clear_client
159
- Thread.current["ecs_auto_scaler_ecs_#{region}"] = nil
160
- end
161
-
162
- def idle?
163
- return false unless @last_updated_at
164
-
165
- diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
166
- diff < idle_time
167
- end
168
-
169
- def current_min_task_count
170
- return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
171
-
172
- scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
173
- from = Time.parse(s["from"])
174
- to = Time.parse(s["to"])
175
- (from..to).cover?(Time.now)
176
- }["count"]
177
- end
178
-
179
- def overheat?
180
- return false unless @reach_max_at
181
- (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
182
- end
183
-
184
- def fetch_service
185
- res = client.describe_services(cluster: cluster, services: [name])
186
- raise "Service \"#{name}\" is not found" if res.services.empty?
187
- res.services[0]
188
- rescue => e
189
- AutoScaler.error_logger.error(e)
190
- clear_client
191
- end
192
-
193
- def update_service(difference)
194
- next_desired_count = desired_count + difference
195
- current_level = max_task_level(desired_count)
196
- next_level = max_task_level(next_desired_count)
197
- if current_level < next_level && overheat? # next max
198
- level = next_level
199
- @reach_max_at = nil
200
- AutoScaler.logger.info "Service \"#{name}\" is overheat, uses next max count"
201
- elsif current_level < next_level && !overheat? # wait cooldown
202
- level = current_level
203
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
204
- @reach_max_at ||= now
205
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
206
- elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
207
- level = current_level
208
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
209
- @reach_max_at ||= now
210
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
211
- elsif current_level == next_level && next_desired_count < max_task_count[current_level]
212
- level = current_level
213
- @reach_max_at = nil
214
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
215
- elsif current_level > next_level
216
- level = next_level
217
- @reach_max_at = nil
218
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
219
- end
220
-
221
- next_desired_count = [next_desired_count, max_task_count[level]].min
222
- client.update_service(
223
- cluster: cluster,
224
- service: name,
225
- desired_count: next_desired_count,
226
- )
227
- client.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
228
- w.before_wait do
229
- AutoScaler.logger.debug "wait service stable [#{name}]"
230
- end
231
- end if difference < 0
232
- @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
233
- self.desired_count = next_desired_count
234
- AutoScaler.logger.info "Update service \"#{name}\": desired_count -> #{next_desired_count}"
235
- rescue => e
236
- AutoScaler.error_logger.error(e)
237
- clear_client
238
- end
239
-
240
- def fetch_container_instances
241
- arns = []
242
- resp = nil
243
- loop do
244
- options = {cluster: cluster}
245
- options.merge(next_token: resp.next_token) if resp && resp.next_token
246
- resp = client.list_container_instances(options)
247
- arns.concat(resp.container_instance_arns)
248
- break unless resp.next_token
249
- end
250
-
251
- chunk_size = 50
252
- container_instances = []
253
- arns.each_slice(chunk_size) do |arn_chunk|
254
- is = client.describe_container_instances(cluster: cluster, container_instances: arn_chunk).container_instances
255
- container_instances.concat(is)
256
- end
257
-
258
- container_instances
259
- end
260
-
261
- private
262
-
263
- def max_task_level(count)
264
- max_task_count.index { |i| count <= i } || max_task_count.size - 1
265
- end
266
- end
267
-
268
- TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
269
- include ConfigBase
270
-
271
- def client
272
- Thread.current["ecs_auto_scaler_cloud_watch_#{region}"] ||= Aws::CloudWatch::Client.new(
273
- access_key_id: EcsDeploy.config.access_key_id,
274
- secret_access_key: EcsDeploy.config.secret_access_key,
275
- region: region
276
- )
277
- end
278
-
279
- def clear_client
280
- Thread.current["ecs_auto_scaler_cloud_watch_#{region}"] = nil
281
- end
282
-
283
- def match?
284
- fetch_alarm.state_value == state
285
- end
286
-
287
- def fetch_alarm
288
- res = client.describe_alarms(alarm_names: [alarm_name])
289
-
290
- raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
291
- res.metric_alarms[0].tap do |alarm|
292
- AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
293
- end
294
- rescue => e
295
- AutoScaler.error_logger.error(e)
296
- clear_client
297
- end
298
- end
299
-
300
- AutoScalingConfig = Struct.new(:name, :region, :buffer) do
301
- include ConfigBase
302
-
303
- def client
304
- Thread.current["ecs_auto_scaler_auto_scaling_#{region}"] ||= Aws::AutoScaling::Client.new(
305
- access_key_id: EcsDeploy.config.access_key_id,
306
- secret_access_key: EcsDeploy.config.secret_access_key,
307
- region: region
308
- )
309
- end
310
-
311
- def clear_client
312
- Thread.current["ecs_auto_scaler_auto_scaling_#{region}"] = nil
313
- end
314
-
315
- def ec2_client
316
- Thread.current["ecs_auto_scaler_ec2_#{region}"] ||= Aws::EC2::Client.new(
317
- access_key_id: EcsDeploy.config.access_key_id,
318
- secret_access_key: EcsDeploy.config.secret_access_key,
319
- region: region
320
- )
321
- end
322
-
323
- def clear_ec2_client
324
- Thread.current["ecs_auto_scaler_ec2_#{region}"] = nil
325
- end
326
-
327
- def instances(reload: false)
328
- if reload || @instances.nil?
329
- resp = client.describe_auto_scaling_groups({
330
- auto_scaling_group_names: [name],
331
- })
332
- @instances = resp.auto_scaling_groups[0].instances
333
- else
334
- @instances
335
- end
336
- end
337
-
338
- def update_auto_scaling_group(total_service_count, service_config)
339
- desired_capacity = total_service_count + buffer.to_i
340
-
341
- current_asg = client.describe_auto_scaling_groups({
342
- auto_scaling_group_names: [name],
343
- }).auto_scaling_groups[0]
344
-
345
- if current_asg.desired_capacity > desired_capacity
346
- diff = current_asg.desired_capacity - desired_capacity
347
- container_instances = service_config.fetch_container_instances
348
- deregisterable_instances = container_instances.select do |i|
349
- i.pending_tasks_count == 0 && i.running_tasks_count == 0
350
- end
351
-
352
- AutoScaler.logger.info "Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"
353
-
354
- deregistered_instance_ids = []
355
- deregisterable_instances.each do |i|
356
- break if deregistered_instance_ids.size >= diff
357
-
358
- begin
359
- service_config.client.deregister_container_instance(cluster: service_config.cluster, container_instance: i.container_instance_arn, force: false)
360
- deregistered_instance_ids << i.ec2_instance_id
361
- rescue Aws::ECS::Errors::InvalidParameterException
362
- end
363
- end
364
-
365
- AutoScaler.logger.info "Deregistered instances: #{deregistered_instance_ids.inspect}"
366
-
367
- detach_and_terminate_instances(deregistered_instance_ids)
368
-
369
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
370
- elsif current_asg.desired_capacity < desired_capacity
371
- client.update_auto_scaling_group(
372
- auto_scaling_group_name: name,
373
- min_size: 0,
374
- max_size: [current_asg.max_size, desired_capacity].max,
375
- desired_capacity: desired_capacity,
376
- )
377
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
378
- end
379
- rescue => e
380
- AutoScaler.error_logger.error(e)
381
- clear_client
382
- end
383
-
384
- def detach_and_terminate_instances(instance_ids)
385
- return if instance_ids.empty?
386
-
387
- client.detach_instances(
388
- auto_scaling_group_name: name,
389
- instance_ids: instance_ids,
390
- should_decrement_desired_capacity: true
391
- )
392
-
393
- AutoScaler.logger.info "Detach instances from ASG #{name}: #{instance_ids.inspect}"
394
- sleep 3
395
-
396
- ec2_client.terminate_instances(instance_ids: instance_ids)
397
-
398
- AutoScaler.logger.info "Terminated instances: #{instance_ids.inspect}"
399
- rescue => e
400
- AutoScaler.error_logger.error(e)
401
- clear_client
402
- clear_ec2_client
403
- end
404
-
405
- def detach_and_terminate_orphan_instances(service_config)
406
- container_instance_ids = service_config.fetch_container_instances.map(&:ec2_instance_id)
407
- orphans = instances(reload: true).reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
408
-
409
- return if orphans.empty?
410
-
411
- targets = ec2_client.describe_instances(instance_ids: orphans).reservations[0].instances.select do |i|
412
- (Time.now - i.launch_time) > 600
413
- end
414
-
415
- detach_and_terminate_instances(targets.map(&:instance_id))
416
- rescue => e
417
- AutoScaler.error_logger.error(e)
418
- clear_client
419
- clear_ec2_client
420
- end
421
- end
422
171
  end
423
172
  end