ecs_deploy 0.3.2 → 1.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,2 +1,6 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
2
6
  task :default => :spec
@@ -18,10 +18,16 @@ Gem::Specification.new do |spec|
18
18
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_runtime_dependency "aws-sdk", "~> 2.9"
21
+ spec.add_runtime_dependency "aws-sdk-autoscaling", "~> 1"
22
+ spec.add_runtime_dependency "aws-sdk-cloudwatch", "~> 1"
23
+ spec.add_runtime_dependency "aws-sdk-cloudwatchevents", "~> 1"
24
+ spec.add_runtime_dependency "aws-sdk-ec2", "~> 1"
25
+ spec.add_runtime_dependency "aws-sdk-ecs", "~> 1"
26
+ spec.add_runtime_dependency "aws-sdk-sqs", "~> 1"
22
27
  spec.add_runtime_dependency "terminal-table"
23
28
  spec.add_runtime_dependency "paint"
24
29
 
25
- spec.add_development_dependency "bundler", "~> 1.11"
26
- spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "bundler", ">= 1.11", "< 3"
31
+ spec.add_development_dependency "rake", ">= 10.0"
32
+ spec.add_development_dependency "rspec", "~> 3.0"
27
33
  end
@@ -1,7 +1,7 @@
1
1
  require "ecs_deploy/version"
2
2
  require "ecs_deploy/configuration"
3
3
 
4
- require 'aws-sdk'
4
+ require 'aws-sdk-ecs'
5
5
  require 'logger'
6
6
  require 'terminal-table'
7
7
  require 'paint'
@@ -1,6 +1,11 @@
1
- require 'yaml'
2
- require 'logger'
3
- require 'time'
1
+ require "logger"
2
+ require "time"
3
+ require "yaml"
4
+
5
+ require "ecs_deploy/auto_scaler/auto_scaling_group_config"
6
+ require "ecs_deploy/auto_scaler/instance_drainer"
7
+ require "ecs_deploy/auto_scaler/service_config"
8
+ require "ecs_deploy/auto_scaler/spot_fleet_request_config"
4
9
 
5
10
  module EcsDeploy
6
11
  module AutoScaler
@@ -8,8 +13,8 @@ module EcsDeploy
8
13
  attr_reader :logger, :error_logger
9
14
 
10
15
  def run(yaml_path, log_file = nil, error_log_file = nil)
11
- trap(:TERM) { @stop = true }
12
- trap(:INT) { @stop = true }
16
+ @enable_auto_scaling = true
17
+ setup_signal_handlers
13
18
  @logger = Logger.new(log_file || STDOUT)
14
19
  @logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
15
20
  STDOUT.sync = true unless log_file
@@ -17,90 +22,129 @@ module EcsDeploy
17
22
  @error_logger.level = Logger.const_get(ENV["ECS_AUTO_SCALER_LOG_LEVEL"].upcase) if ENV["ECS_AUTO_SCALER_LOG_LEVEL"]
18
23
  STDERR.sync = true unless error_log_file
19
24
  load_config(yaml_path)
20
- service_configs
21
- auto_scaling_group_configs
22
25
 
23
- config_groups = service_configs.group_by { |s| [s.region, s.auto_scaling_group_name] }
24
- ths = config_groups.map do |(region, auto_scaling_group_name), configs|
25
- asg_config = auto_scaling_group_configs.find { |c| c.name == auto_scaling_group_name && c.region == region }
26
- Thread.new(asg_config, configs, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
26
+ ths = (auto_scaling_group_configs + spot_fleet_request_configs).map do |cluster_scaling_config|
27
+ Thread.new(cluster_scaling_config, &method(:main_loop)).tap { |th| th.abort_on_exception = true }
28
+ end
29
+
30
+ if @config["spot_instance_intrp_warns_queue_urls"]
31
+ drainer = EcsDeploy::AutoScaler::InstanceDrainer.new(
32
+ auto_scaling_group_configs: auto_scaling_group_configs,
33
+ spot_fleet_request_configs: spot_fleet_request_configs,
34
+ logger: logger,
35
+ )
36
+ polling_ths = @config["spot_instance_intrp_warns_queue_urls"].map do |queue_url|
37
+ Thread.new(queue_url) do |url|
38
+ drainer.poll_spot_instance_interruption_warnings(url)
39
+ end.tap { |th| th.abort_on_exception = true }
40
+ end
27
41
  end
28
42
 
29
43
  ths.each(&:join)
44
+
45
+ drainer&.stop
46
+ polling_ths&.each(&:join)
30
47
  end
31
48
 
32
- def main_loop(asg_config, configs)
33
- loop_with_polling_interval("loop of #{asg_config.name}") do
34
- ths = configs.map do |service_config|
49
+ def main_loop(cluster_scaling_config)
50
+ loop_with_polling_interval("loop of #{cluster_scaling_config.name}") do
51
+ ths = cluster_scaling_config.service_configs.map do |service_config|
35
52
  Thread.new(service_config) do |s|
36
53
  @logger.debug "Start service scaling of #{s.name}"
37
-
38
- if s.idle?
39
- @logger.debug "#{s.name} is idling"
40
- next
41
- end
42
-
43
- difference = 0
44
- s.upscale_triggers.each do |trigger|
45
- step = trigger.step || s.step
46
- next if difference >= step
47
-
48
- if trigger.match?
49
- logger.info "Fire upscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
50
- difference = step
51
- end
52
- end
53
-
54
- if difference == 0 && s.desired_count > s.current_min_task_count
55
- s.downscale_triggers.each do |trigger|
56
- next unless trigger.match?
57
-
58
- logger.info "Fire downscale trigger of #{s.name} by #{trigger.alarm_name} #{trigger.state}"
59
- step = trigger.step || s.step
60
- difference = [difference, -step].min
61
- end
62
- end
63
-
64
- if s.current_min_task_count > s.desired_count + difference
65
- difference = s.current_min_task_count - s.desired_count
66
- end
67
-
68
- if difference >= 0 && s.desired_count > s.max_task_count.max
69
- difference = s.max_task_count.max - s.desired_count
70
- end
71
-
72
- if difference != 0
73
- s.update_service(difference)
74
- end
54
+ s.adjust_desired_count(cluster_scaling_config.cluster_resource_manager)
75
55
  end
76
56
  end
77
57
  ths.each { |th| th.abort_on_exception = true }
78
58
 
79
59
  ths.each(&:join)
80
60
 
81
- @logger.debug "Start asg scaling of #{asg_config.name}"
61
+ @logger.debug "Start cluster scaling of #{cluster_scaling_config.name}"
62
+
63
+ required_capacity = cluster_scaling_config.service_configs.sum { |s| s.desired_count * s.required_capacity }
64
+ cluster_scaling_config.update_desired_capacity(required_capacity)
82
65
 
83
- total_service_count = configs.inject(0) { |sum, s| sum + s.desired_count }
84
- asg_config.update_auto_scaling_group(total_service_count, configs[0])
85
- asg_config.detach_and_terminate_orphan_instances(configs[0])
66
+ cluster_scaling_config.service_configs.each(&:wait_until_desired_count_updated)
86
67
  end
87
68
  end
88
69
 
89
70
  def load_config(yaml_path)
90
71
  @config = YAML.load_file(yaml_path)
91
72
  @polling_interval = @config["polling_interval"] || 30
92
- end
73
+ if @config["services"]
74
+ @error_logger&.warn('"services" property in root-level is deprecated. Please define it in "auto_scaling_groups" property or "spot_fleet_requests" property.')
75
+ @config.delete("services").each do |svc|
76
+ if svc["auto_scaling_group_name"] && svc["spot_fleet_request_id"]
77
+ raise "You can specify only one of 'auto_scaling_group_name' or 'spot_fleet_request_name'"
78
+ end
79
+
80
+ svc_region = svc.delete("region")
81
+ if svc["auto_scaling_group_name"]
82
+ asg_name = svc.delete("auto_scaling_group_name")
83
+ asg = @config["auto_scaling_groups"].find { |g| g["region"] == svc_region && g["name"] == asg_name }
84
+ asg["services"] ||= []
85
+ asg["services"] << svc
86
+ asg["cluster"] = svc.delete("cluster")
87
+ end
93
88
 
94
- def service_configs
95
- @service_configs ||= @config["services"].map(&ServiceConfig.method(:new))
89
+ if svc["spot_fleet_request_id"]
90
+ sfr_id = svc.delete("spot_fleet_request_id")
91
+ sfr = @config["spot_fleet_requests"].find { |r| r["region"] == svc_region && r["id"] == sfr_id }
92
+ sfr["services"] ||= []
93
+ sfr["services"] << svc
94
+ sfr["cluster"] = svc.delete("cluster")
95
+ end
96
+ end
97
+ end
96
98
  end
97
99
 
98
100
  def auto_scaling_group_configs
99
- @auto_scaling_group_configs ||= @config["auto_scaling_groups"].map(&AutoScalingConfig.method(:new))
101
+ @auto_scaling_group_configs ||= (@config["auto_scaling_groups"] || []).each.with_object({}) do |c, configs|
102
+ configs[c["name"]] ||= {}
103
+ if configs[c["name"]][c["region"]]
104
+ raise "Duplicate entry in auto_scaling_groups (name: #{c["name"]}, region: #{c["region"]})"
105
+ end
106
+ configs[c["name"]][c["region"]] = AutoScalingGroupConfig.new(c, @logger)
107
+ end.values.flat_map(&:values)
108
+ end
109
+
110
+ def spot_fleet_request_configs
111
+ @spot_fleet_request_configs ||= (@config["spot_fleet_requests"] || []).each.with_object({}) do |c, configs|
112
+ configs[c["id"]] ||= {}
113
+ if configs[c["id"]][c["region"]]
114
+ raise "Duplicate entry in spot_fleet_requests (id: #{c["id"]}, region: #{c["region"]})"
115
+ end
116
+ configs[c["id"]][c["region"]] = SpotFleetRequestConfig.new(c, @logger)
117
+ end.values.flat_map(&:values)
100
118
  end
101
119
 
102
120
  private
103
121
 
122
+ def setup_signal_handlers
123
+ # Use a thread and a queue to avoid "log writing failed. can't be called from trap context"
124
+ # cf. https://bugs.ruby-lang.org/issues/14222#note-3
125
+ signals = Queue.new
126
+ %i(TERM INT CONT TSTP).each do |sig|
127
+ trap(sig) { signals << sig }
128
+ end
129
+
130
+ Thread.new do
131
+ loop do
132
+ sig = signals.pop
133
+ case sig
134
+ when :INT, :TERM
135
+ @logger.info "Received SIG#{sig}, shutting down gracefully"
136
+ @stop = true
137
+ when :CONT
138
+ @logger.info "Received SIGCONT, resume auto scaling"
139
+ @enable_auto_scaling = true
140
+ when :TSTP
141
+ @logger.info "Received SIGTSTP, pause auto scaling. Send SIGCONT to resume it."
142
+ @enable_auto_scaling = false
143
+ end
144
+ end
145
+ end
146
+ end
147
+
104
148
  def wait_polling_interval?(last_executed_at)
105
149
  current = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
106
150
  diff = current - last_executed_at
@@ -114,6 +158,7 @@ module EcsDeploy
114
158
  loop do
115
159
  break if @stop
116
160
  sleep 1
161
+ next unless @enable_auto_scaling
117
162
  next if wait_polling_interval?(last_executed_at)
118
163
  yield
119
164
  last_executed_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
@@ -123,284 +168,5 @@ module EcsDeploy
123
168
  @logger.debug "Stop #{name}"
124
169
  end
125
170
  end
126
-
127
- module ConfigBase
128
- def initialize(attributes = {})
129
- attributes.each do |key, val|
130
- send("#{key}=", val)
131
- end
132
- end
133
- end
134
-
135
- SERVICE_CONFIG_ATTRIBUTES = %i(name cluster region auto_scaling_group_name step max_task_count min_task_count idle_time scheduled_min_task_count cooldown_time_for_reach_max upscale_triggers downscale_triggers desired_count)
136
- ServiceConfig = Struct.new(*SERVICE_CONFIG_ATTRIBUTES) do
137
- include ConfigBase
138
-
139
- def initialize(attributes = {})
140
- super(attributes)
141
- self.idle_time ||= 60
142
- self.max_task_count = Array(max_task_count)
143
- self.upscale_triggers = upscale_triggers.to_a.map do |t|
144
- TriggerConfig.new(t.merge(region: region))
145
- end
146
- self.downscale_triggers = downscale_triggers.to_a.map do |t|
147
- TriggerConfig.new(t.merge(region: region))
148
- end
149
- self.max_task_count.sort!
150
- self.desired_count = fetch_service.desired_count
151
- @reach_max_at = nil
152
- @last_updated_at = nil
153
- end
154
-
155
- def client
156
- Aws::ECS::Client.new(
157
- access_key_id: EcsDeploy.config.access_key_id,
158
- secret_access_key: EcsDeploy.config.secret_access_key,
159
- region: region
160
- )
161
- end
162
-
163
- def idle?
164
- return false unless @last_updated_at
165
-
166
- diff = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @last_updated_at
167
- diff < idle_time
168
- end
169
-
170
- def current_min_task_count
171
- return min_task_count if scheduled_min_task_count.nil? || scheduled_min_task_count.empty?
172
-
173
- scheduled_min_task_count.find(-> { {"count" => min_task_count} }) { |s|
174
- from = Time.parse(s["from"])
175
- to = Time.parse(s["to"])
176
- (from..to).cover?(Time.now)
177
- }["count"]
178
- end
179
-
180
- def overheat?
181
- return false unless @reach_max_at
182
- (Process.clock_gettime(Process::CLOCK_MONOTONIC, :second) - @reach_max_at) > cooldown_time_for_reach_max
183
- end
184
-
185
- def fetch_service
186
- res = client.describe_services(cluster: cluster, services: [name])
187
- raise "Service \"#{name}\" is not found" if res.services.empty?
188
- res.services[0]
189
- rescue => e
190
- AutoScaler.error_logger.error(e)
191
- end
192
-
193
- def update_service(difference)
194
- next_desired_count = desired_count + difference
195
- current_level = max_task_level(desired_count)
196
- next_level = max_task_level(next_desired_count)
197
- if current_level < next_level && overheat? # next max
198
- level = next_level
199
- @reach_max_at = nil
200
- AutoScaler.logger.info "Service \"#{name}\" is overheat, uses next max count"
201
- elsif current_level < next_level && !overheat? # wait cooldown
202
- level = current_level
203
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
204
- @reach_max_at ||= now
205
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
206
- elsif current_level == next_level && next_desired_count >= max_task_count[current_level] # reach current max
207
- level = current_level
208
- now = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
209
- @reach_max_at ||= now
210
- AutoScaler.logger.info "Service \"#{name}\" waits cooldown elapsed #{(now - @reach_max_at).to_i}sec"
211
- elsif current_level == next_level && next_desired_count < max_task_count[current_level]
212
- level = current_level
213
- @reach_max_at = nil
214
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
215
- elsif current_level > next_level
216
- level = next_level
217
- @reach_max_at = nil
218
- AutoScaler.logger.info "Service \"#{name}\" clears cooldown state"
219
- end
220
-
221
- cl = client
222
- next_desired_count = [next_desired_count, max_task_count[level]].min
223
- cl.update_service(
224
- cluster: cluster,
225
- service: name,
226
- desired_count: next_desired_count,
227
- )
228
- cl.wait_until(:services_stable, cluster: cluster, services: [name]) do |w|
229
- w.before_wait do
230
- AutoScaler.logger.debug "wait service stable [#{name}]"
231
- end
232
- end if difference < 0
233
- @last_updated_at = Process.clock_gettime(Process::CLOCK_MONOTONIC, :second)
234
- self.desired_count = next_desired_count
235
- AutoScaler.logger.info "Update service \"#{name}\": desired_count -> #{next_desired_count}"
236
- rescue => e
237
- AutoScaler.error_logger.error(e)
238
- end
239
-
240
- def fetch_container_instances
241
- arns = []
242
- resp = nil
243
- cl = client
244
- loop do
245
- options = {cluster: cluster}
246
- options.merge(next_token: resp.next_token) if resp && resp.next_token
247
- resp = cl.list_container_instances(options)
248
- arns.concat(resp.container_instance_arns)
249
- break unless resp.next_token
250
- end
251
-
252
- chunk_size = 50
253
- container_instances = []
254
- arns.each_slice(chunk_size) do |arn_chunk|
255
- is = cl.describe_container_instances(cluster: cluster, container_instances: arn_chunk).container_instances
256
- container_instances.concat(is)
257
- end
258
-
259
- container_instances
260
- end
261
-
262
- private
263
-
264
- def max_task_level(count)
265
- max_task_count.index { |i| count <= i } || max_task_count.size - 1
266
- end
267
- end
268
-
269
- TriggerConfig = Struct.new(:alarm_name, :region, :state, :step) do
270
- include ConfigBase
271
-
272
- def client
273
- Aws::CloudWatch::Client.new(
274
- access_key_id: EcsDeploy.config.access_key_id,
275
- secret_access_key: EcsDeploy.config.secret_access_key,
276
- region: region
277
- )
278
- end
279
-
280
- def match?
281
- fetch_alarm.state_value == state
282
- end
283
-
284
- def fetch_alarm
285
- res = client.describe_alarms(alarm_names: [alarm_name])
286
-
287
- raise "Alarm \"#{alarm_name}\" is not found" if res.metric_alarms.empty?
288
- res.metric_alarms[0].tap do |alarm|
289
- AutoScaler.logger.debug("#{alarm.alarm_name} state is #{alarm.state_value}")
290
- end
291
- rescue => e
292
- AutoScaler.error_logger.error(e)
293
- end
294
- end
295
-
296
- AutoScalingConfig = Struct.new(:name, :region, :buffer) do
297
- include ConfigBase
298
-
299
- def client
300
- Aws::AutoScaling::Client.new(
301
- access_key_id: EcsDeploy.config.access_key_id,
302
- secret_access_key: EcsDeploy.config.secret_access_key,
303
- region: region
304
- )
305
- end
306
-
307
- def ec2_client
308
- Aws::EC2::Client.new(
309
- access_key_id: EcsDeploy.config.access_key_id,
310
- secret_access_key: EcsDeploy.config.secret_access_key,
311
- region: region
312
- )
313
- end
314
-
315
- def instances(reload: false)
316
- if reload || @instances.nil?
317
- resp = client.describe_auto_scaling_groups({
318
- auto_scaling_group_names: [name],
319
- })
320
- @instances = resp.auto_scaling_groups[0].instances
321
- else
322
- @instances
323
- end
324
- end
325
-
326
- def update_auto_scaling_group(total_service_count, service_config)
327
- desired_capacity = total_service_count + buffer.to_i
328
-
329
- current_asg = client.describe_auto_scaling_groups({
330
- auto_scaling_group_names: [name],
331
- }).auto_scaling_groups[0]
332
-
333
- if current_asg.desired_capacity > desired_capacity
334
- diff = current_asg.desired_capacity - desired_capacity
335
- container_instances = service_config.fetch_container_instances
336
- deregisterable_instances = container_instances.select do |i|
337
- i.pending_tasks_count == 0 && i.running_tasks_count == 0
338
- end
339
-
340
- AutoScaler.logger.info "Fetch deregisterable instances: #{deregisterable_instances.map(&:ec2_instance_id).inspect}"
341
-
342
- deregistered_instance_ids = []
343
- deregisterable_instances.each do |i|
344
- break if deregistered_instance_ids.size >= diff
345
-
346
- begin
347
- service_config.client.deregister_container_instance(cluster: service_config.cluster, container_instance: i.container_instance_arn, force: false)
348
- deregistered_instance_ids << i.ec2_instance_id
349
- rescue Aws::ECS::Errors::InvalidParameterException
350
- end
351
- end
352
-
353
- AutoScaler.logger.info "Deregistered instances: #{deregistered_instance_ids.inspect}"
354
-
355
- detach_and_terminate_instances(deregistered_instance_ids)
356
-
357
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
358
- elsif current_asg.desired_capacity < desired_capacity
359
- client.update_auto_scaling_group(
360
- auto_scaling_group_name: name,
361
- min_size: 0,
362
- max_size: [current_asg.max_size, desired_capacity].max,
363
- desired_capacity: desired_capacity,
364
- )
365
- AutoScaler.logger.info "Update auto scaling group \"#{name}\": desired_capacity -> #{desired_capacity}"
366
- end
367
- rescue => e
368
- AutoScaler.error_logger.error(e)
369
- end
370
-
371
- def detach_and_terminate_instances(instance_ids)
372
- return if instance_ids.empty?
373
-
374
- client.detach_instances(
375
- auto_scaling_group_name: name,
376
- instance_ids: instance_ids,
377
- should_decrement_desired_capacity: true
378
- )
379
-
380
- AutoScaler.logger.info "Detach instances from ASG #{name}: #{instance_ids.inspect}"
381
- sleep 3
382
-
383
- ec2_client.terminate_instances(instance_ids: instance_ids)
384
-
385
- AutoScaler.logger.info "Terminated instances: #{instance_ids.inspect}"
386
- rescue => e
387
- AutoScaler.error_logger.error(e)
388
- end
389
-
390
- def detach_and_terminate_orphan_instances(service_config)
391
- container_instance_ids = service_config.fetch_container_instances.map(&:ec2_instance_id)
392
- orphans = instances(reload: true).reject { |i| container_instance_ids.include?(i.instance_id) }.map(&:instance_id)
393
-
394
- return if orphans.empty?
395
-
396
- targets = ec2_client.describe_instances(instance_ids: orphans).reservations[0].instances.select do |i|
397
- (Time.now - i.launch_time) > 600
398
- end
399
-
400
- detach_and_terminate_instances(targets.map(&:instance_id))
401
- rescue => e
402
- AutoScaler.error_logger.error(e)
403
- end
404
- end
405
171
  end
406
172
  end