vector 0.0.1
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +177 -0
- data/Rakefile +1 -0
- data/bin/vector +8 -0
- data/lib/vector.rb +82 -0
- data/lib/vector/cli.rb +193 -0
- data/lib/vector/functions/flexible_down_scaling.rb +187 -0
- data/lib/vector/functions/predictive_scaling.rb +150 -0
- data/lib/vector/version.rb +3 -0
- data/vector.gemspec +26 -0
- metadata +140 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2013 Instructure, Inc

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,177 @@
# Vector

Vector is a tool that augments your auto-scaling groups. The two
features currently offered are Predictive Scaling and Flexible Down
Scaling.

## Predictive Scaling

Auto Scaling groups do a good job of responding to current load
conditions, but if you have a predictable load pattern, it can be nice
to scale up your servers a little bit *early*. Some reasons you might
want to do that are:

* It takes several minutes for an instance to fully boot and ready
  itself for requests.
* You have serious (but predictable) spikes, and it's nice to have the
  capacity in place before the spike starts.
* You want a buffer of time in case AWS APIs start throwing errors. If
  scaling up is going to fail, you'd rather it start failing a little
  before you actually need the capacity, so you can begin evasive
  maneuvers.

Vector examines your existing CloudWatch alarms tied to your Auto
Scaling groups and predicts whether they will be triggered in the
future, based on what happened in the past.

**Note:** This only works with metrics that are averaged across your
group - like CPUUtilization or Load. If you auto-scale based on
something like QueueLength, Predictive Scaling will not work right for
you.

For each lookback window you specify, Vector first compares the current
value of the metric * the current number of nodes against the past
value of the metric * the past number of nodes. If those numbers are
close enough (within the threshold specified by
`--ps-valid-threshold`), it continues.

Vector then goes back to the specified lookback window and moves
forward in time by the lookahead window (`--ps-lookahead-window`). It
computes the metric * number of nodes at that point to get a predicted
aggregate metric value for the near future, then divides that by the
current number of nodes to get a predicted average value for the
metric. That prediction is compared against the alarm's threshold.

For example:

> You have an alarm that checks CPUUtilization of your group, and will
> trigger the alarm if that goes above 70%. Vector is configured to use
> a 1 week lookback window, a 1 hour lookahead window, and a
> valid-threshold of 0.8.
>
> The current value of CPUUtilization is 49%, and there are 2 nodes in
> the group. CPUUtilization 1 week ago was 53%, and there were 2 nodes
> in the group. Therefore, total current CPUUtilization is 98%, and 1
> week ago it was 106%. Those are within 80% of each other
> (valid-threshold), so we can continue with the prediction.
>
> The value of CPUUtilization 1 week ago *plus* 1 hour was 45%, and
> there were 4 nodes in the group. We calculate total CPUUtilization
> for that time to be 180%. Assuming no new nodes are launched, the
> predicted average CPUUtilization for the group 1 hour from now is
> 180% / 2 = 90%. 90% is above the alarm's 70% threshold, so we trigger
> the scaleup policy.
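
The arithmetic from that example, written out as a minimal Ruby sketch
(the numbers and the 70% threshold come from the example above; the
variable names are illustrative, not part of Vector's API):

```ruby
valid_threshold = 0.8
now_avg,   now_nodes   = 49.0, 2  # current CPUUtilization and group size
then_avg,  then_nodes  = 53.0, 2  # same metrics one lookback window (1 week) ago
ahead_avg, ahead_nodes = 45.0, 4  # one week ago + 1 hour lookahead

now_load  = now_avg  * now_nodes   # => 98.0
then_load = then_avg * then_nodes  # => 106.0

# "close enough" check: each total must be within 80% of the other
valid = valid_threshold * now_load < then_load &&
        valid_threshold * then_load < now_load  # => true

predicted_avg = (ahead_avg * ahead_nodes) / now_nodes  # => 90.0
trigger_scaleup = valid && predicted_avg > 70.0        # => true
```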

If you use Predictive Scaling, you probably also want to use Flexible
Down Scaling (below), so that after scaling up in anticipation of load,
your scaledown policy doesn't quickly undo Vector's hard work. You
probably want to set `up-to-down-cooldown` to be close to the size of
your `lookahead-window`.

## Flexible Down Scaling

### Different Cooldown Periods

Auto Scaling Groups support the concept of "cooldown periods" - a
window of time after a scaling activity during which no other
activities should take place. This gives the group a chance to settle
into its new configuration before deciding whether another action is
required.

However, Auto Scaling Groups only let you specify the cooldown period
*after* a certain activity - you can say "After a scale up, wait 5
minutes before doing anything else, and after a scale down, wait 15
minutes." What you can't do is say "After a scale up, wait 5 minutes
for another scale up, and 40 minutes for a scale down."

Vector lets you add custom up-to-down and down-to-down cooldown
periods. You create your policies and alarms in your Auto Scaling
Groups as usual, and then *disable* the alarm you want a custom
cooldown period applied to. Then you tell Vector which cooldown periods
to use, and it does the rest. A sketch of that decision follows.
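
Below is a minimal sketch of how such a cooldown check could look,
assuming you already know the timestamps of the group's last scale-up
and scale-down activities (Vector derives these from the group's
GroupDesiredCapacity metric; the names here are illustrative):

```ruby
require 'active_support/time'

up_to_down_cooldown   = 40.minutes
down_to_down_cooldown = 15.minutes

def outside_cooldowns?(now, last_up, last_down, up_down, down_down)
  # only scale down if enough time has passed since the last scale-up...
  return false if last_up && now - last_up < up_down
  # ...and since the last scale-down
  return false if last_down && now - last_down < down_down
  true
end

outside_cooldowns?(Time.now, Time.now - 50.minutes, nil,
                   up_to_down_cooldown, down_to_down_cooldown)  # => true
```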

### Multiple Alarms

Another benefit of Flexible Down Scaling is the ability to attach
multiple alarms to a scale-down policy and require *all* of them to
trigger before scaling down. With Vector, you can add multiple
(disabled) alarms to a policy, and Vector will trigger the policy only
when *all* of those alarms are in the ALARM state. This lets you do
something like "only scale down when CPU utilization is < 30% and
there is not a backlog of requests on any instances".

### Max Sunk Cost

Vector also lets you specify a "max sunk cost" when scaling down a
node. Amazon bills in hourly increments, and you pay for a full hour
of every partial hour used, so you want your instances to terminate as
close to their hourly billing renewal as possible (without going past
it).

For example, if you specify `--fds-max-sunk-cost 15m` and the two
nodes in your group are 47 minutes and 32 minutes away from their
hourly billing renewals, the group will not be scaled down.

(You should make sure to run Vector on an interval smaller than this
duration, or else Vector may never find a node eligible for scaledown
and will never scale down.)
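
As a rough sketch of that eligibility test (Vector's own check lives in
`flexible_down_scaling.rb`; this just illustrates the arithmetic, which
approximates time-to-renewal from the minute-of-hour of the launch
time):

```ruby
require 'active_support/time'

max_sunk_cost = 15.minutes

# Minutes until the instance hits its next hourly billing renewal.
def minutes_to_renewal(launch_time, now = Time.now)
  ((launch_time.min - now.min) % 60).minutes
end

# An instance is eligible for scaledown when it is close to (but not
# within the last minute of) its billing renewal.
def eligible?(launch_time, max_sunk_cost)
  left = minutes_to_renewal(launch_time)
  left > 1.minute && left < max_sunk_cost
end

eligible?(Time.now - 50.minutes, max_sunk_cost)  # => true (about 10 minutes from renewal)
```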

## Requirements

* Auto Scaling groups must have the GroupInServiceInstances metric
  enabled.
* Auto Scaling groups must have at least one scaling policy with a
  positive adjustment, and that policy must have at least one
  CloudWatch alarm with a CPUUtilization metric.

## Installation

```bash
$ gem install vector
```

## Usage

Typically Vector is invoked periodically via cron (every 10 minutes is
a good choice); see the example crontab entry after the option listing.

```
Usage: vector [options]
    DURATION can look like 60s, 1m, 5h, 7d, 1w
        --timezone TIMEZONE          Timezone to use for date calculations (like America/Denver) (default: system timezone)
        --region REGION              AWS region to operate in (default: us-east-1)
        --groups group1,group2       A list of Auto Scaling Groups to evaluate
        --fleet fleet                An AWS ASG Fleet (instead of specifying --groups)
    -v, --[no-]verbose               Run verbosely

Predictive Scaling Options
        --[no-]ps                    Enable Predictive Scaling
        --ps-lookback-windows DURATION,DURATION
                                     List of lookback windows
        --ps-lookahead-window DURATION
                                     Lookahead window
        --ps-valid-threshold FLOAT   A number from 0.0 - 1.0 specifying how closely previous load must match current load for Predictive Scaling to take effect
        --ps-valid-period DURATION   The period to use when doing the threshold check

Flexible Down Scaling Options
        --[no-]fds                   Enable Flexible Down Scaling
        --fds-up-to-down DURATION    The cooldown period between up and down scale events
        --fds-down-to-down DURATION  The cooldown period between down and down scale events
        --fds-max-sunk-cost DURATION Only let a scaledown occur if there is an instance this close to its hourly billing point
```
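
For instance, a crontab entry along these lines runs Vector against one
group every 10 minutes (the group name, windows, and cooldowns are
placeholders; adjust them for your setup):

```
*/10 * * * * vector --region us-east-1 --groups my-asg --ps --ps-lookback-windows 1w --ps-lookahead-window 1h --fds --fds-up-to-down 1h --fds-down-to-down 10m
```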

# Questions

### Why not just predictively scale based on the past DesiredInstances?

If we didn't look at actual utilization and only looked at how many
instances we were running in the past, we would end up scaling earlier
and earlier, and we would never re-adjust (and stop scaling up early)
if load patterns changed and we no longer needed as much capacity.

### What about high availability? What if the box Vector is running on dies?

Luckily, Vector only provides optimizations - the critical job of
scaling up based on demand is still handled by the normal Auto Scaling
service. If Vector does not run, you just don't get predictive scaling
and flexible down scaling.
data/Rakefile
ADDED
@@ -0,0 +1 @@
require "bundler/gem_tasks"
data/bin/vector
ADDED
data/lib/vector.rb
ADDED
@@ -0,0 +1,82 @@
require 'aws-sdk'
require 'active_support/time'

require 'vector/version'

module Vector
  def self.time_string_to_seconds(string)
    if string =~ /^(\d+)([smhdw])?$/
      n = $1.to_i
      unit = $2 || 's'

      case unit
      when 's'
        n.seconds
      when 'm'
        n.minutes
      when 'h'
        n.hours
      when 'd'
        n.days
      when 'w'
        n.weeks
      end
    else
      nil
    end
  end

  def self.within_threshold(threshold, v1, v2)
    threshold * v1 < v2 && threshold * v2 < v1
  end

  # Minimal hierarchical logger: prints nested context names, indenting
  # each level and only re-printing contexts that changed since the
  # last message.
  module HLogger
    @@enabled = false
    def self.enable(bool)
      @@enabled = bool
    end

    def hlog_ctx(ctx, &block)
      @components ||= []
      @components << ctx
      yield
    ensure
      @components.pop
    end

    def hlog(string)
      return unless @@enabled

      tmp_components = @components.dup
      level = 0
      if @last_components
        @last_components.each do |last_c|
          break if tmp_components.empty?
          if last_c == tmp_components[0]
            level += 1
            tmp_components.shift
          else
            break
          end
        end
      end

      tmp_components.each do |component|
        name = if component.respond_to? :name
          component.name
        else
          component.to_s
        end
        puts "#{"  " * level}#{name}"
        level += 1
      end

      puts "#{"  " * level}#{string}"
      @last_components = @components.dup
    end
  end
end

require 'vector/cli'
require 'vector/functions/predictive_scaling'
require 'vector/functions/flexible_down_scaling'
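
(As a quick illustration of the two helpers above - not part of the
file itself - `time_string_to_seconds` turns the DURATION strings from
the CLI into ActiveSupport durations, and `within_threshold` is the
symmetric "close enough" check used by Predictive Scaling:)

```ruby
Vector.time_string_to_seconds("1w")    # => 604800 seconds (1 week)
Vector.time_string_to_seconds("90x")   # => nil (unknown unit)
Vector.within_threshold(0.8, 98, 106)  # => true: each value is within 80% of the other
Vector.within_threshold(0.8, 98, 180)  # => false
```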
data/lib/vector/cli.rb
ADDED
@@ -0,0 +1,193 @@
require 'optparse'
require 'aws-sdk'
require 'aws/auto_scaling/fleets'
require 'vector/functions/flexible_down_scaling'
require 'vector/functions/predictive_scaling'

module Vector
  class CLI
    def initialize(argv)
      @argv = argv
    end

    def run
      load_config

      auto_scaling = AWS::AutoScaling.new(:region => @config[:region])
      cloudwatch = AWS::CloudWatch.new(:region => @config[:region])

      # everything we do should be fine looking at a snapshot in time,
      # so memoizing should be fine when acting as a CLI.
      AWS.start_memoizing

      groups = if @config[:fleet]
        auto_scaling.fleets[@config[:fleet]].groups
      else
        @config[:groups].map do |group_name|
          auto_scaling.groups[group_name]
        end
      end

      if @config[:predictive_scaling][:enabled]
        psconf = @config[:predictive_scaling]

        ps = Vector::Function::PredictiveScaling.new(
          { :cloudwatch => cloudwatch }.merge(psconf))

        groups.each do |group|
          begin
            ps.run_for(group)
          rescue => e
            puts "error performing Predictive Scaling on #{group.name}: #{e.inspect}\n#{e.backtrace.join "\n"}"
          end
        end
      end

      if @config[:flexible_down_scaling][:enabled]
        fdsconf = @config[:flexible_down_scaling]

        fds = Vector::Function::FlexibleDownScaling.new(
          { :cloudwatch => cloudwatch }.merge(fdsconf))

        groups.each do |group|
          fds.run_for(group)
        end
      end
    end

    protected

    def load_config
      opts = {
        :quiet => false,
        :timezone => nil,
        :region => 'us-east-1',
        :groups => [],
        :fleet => nil,
        :predictive_scaling => {
          :enabled => false,
          :lookback_windows => [],
          :lookahead_window => nil,
          :valid_threshold => nil,
          :valid_period => 60 * 10
        },
        :flexible_down_scaling => {
          :enabled => false,
          :up_down_cooldown => nil,
          :down_down_cooldown => nil,
          :max_sunk_cost => nil
        }
      }

      optparser = OptionParser.new do |o|
        o.banner = "Usage: vector [options]"
        o.separator "DURATION can look like 60s, 1m, 5h, 7d, 1w"

        o.on("--timezone TIMEZONE", "Timezone to use for date calculations (like America/Denver) (default: system timezone)") do |v|
          opts[:timezone] = v
        end

        o.on("--region REGION", "AWS region to operate in (default: us-east-1)") do |v|
          opts[:region] = v
        end

        o.on("--groups group1,group2", Array, "A list of Auto Scaling Groups to evaluate") do |v|
          opts[:groups] = v
        end

        o.on("--fleet fleet", "An AWS ASG Fleet (instead of specifying --groups)") do |v|
          opts[:fleet] = v
        end

        o.on("-q", "--[no-]quiet", "Run quietly") do |v|
          opts[:quiet] = v
        end

        o.separator ""
        o.separator "Predictive Scaling Options"

        o.on("--[no-]ps", "Enable Predictive Scaling") do |v|
          opts[:predictive_scaling][:enabled] = v
        end

        o.on("--ps-lookback-windows DURATION,DURATION", Array, "List of lookback windows") do |v|
          opts[:predictive_scaling][:lookback_windows] =
            v.map {|w| Vector.time_string_to_seconds(w) }
        end

        o.on("--ps-lookahead-window DURATION", String, "Lookahead window") do |v|
          opts[:predictive_scaling][:lookahead_window] =
            Vector.time_string_to_seconds(v)
        end

        o.on("--ps-valid-threshold FLOAT", Float, "A number from 0.0 - 1.0 specifying how closely previous load must match current load for Predictive Scaling to take effect") do |v|
          opts[:predictive_scaling][:valid_threshold] = v
        end

        o.on("--ps-valid-period DURATION", String, "The period to use when doing the threshold check") do |v|
          opts[:predictive_scaling][:valid_period] =
            Vector.time_string_to_seconds v
        end

        o.separator ""
        o.separator "Flexible Down Scaling Options"

        o.on("--[no-]fds", "Enable Flexible Down Scaling") do |v|
          opts[:flexible_down_scaling][:enabled] = v
        end

        o.on("--fds-up-to-down DURATION", String, "The cooldown period between up and down scale events") do |v|
          opts[:flexible_down_scaling][:up_down_cooldown] =
            Vector.time_string_to_seconds v
        end

        o.on("--fds-down-to-down DURATION", String, "The cooldown period between down and down scale events") do |v|
          opts[:flexible_down_scaling][:down_down_cooldown] =
            Vector.time_string_to_seconds v
        end

        o.on("--fds-max-sunk-cost DURATION", String, "Only let a scaledown occur if there is an instance this close to its hourly billing point") do |v|
          time = Vector.time_string_to_seconds v
          if time > 1.hour
            puts "--fds-max-sunk-cost duration must be < 1 hour"
            exit 1
          end

          opts[:flexible_down_scaling][:max_sunk_cost] = time
        end

      end.parse!(@argv)

      if opts[:groups].empty? && opts[:fleet].nil?
        puts "No groups were specified."
        exit 1
      end

      if !opts[:groups].empty? && !opts[:fleet].nil?
        puts "You can't specify --groups and --fleet."
        exit 1
      end

      if opts[:predictive_scaling][:enabled]
        ps = opts[:predictive_scaling]
        if ps[:lookback_windows].empty? || ps[:lookahead_window].nil?
          puts "You must specify lookback windows and a lookahead window for Predictive Scaling."
          exit 1
        end
      end

      if opts[:flexible_down_scaling][:enabled]
        fds = opts[:flexible_down_scaling]
        if fds[:up_down_cooldown].nil? ||
           fds[:down_down_cooldown].nil?
          puts "You must specify both up-to-down and down-to-down cooldown periods for Flexible Down Scaling."
          exit 1
        end
      end

      Vector::HLogger.enable(!opts[:quiet])

      @config = opts
    end
  end
end
data/lib/vector/functions/flexible_down_scaling.rb
ADDED
@@ -0,0 +1,187 @@
require 'vector'

module Vector
  module Function
    class FlexibleDownScaling
      include Vector::HLogger

      def initialize(options)
        @cloudwatch = options[:cloudwatch]
        @up_down_cooldown = options[:up_down_cooldown]
        @down_down_cooldown = options[:down_down_cooldown]
        @max_sunk_cost = options[:max_sunk_cost]
      end

      def run_for(group)
        hlog_ctx("group: #{group.name}") do
          # don't check if no config was specified
          if @up_down_cooldown.nil? && @down_down_cooldown.nil?
            hlog("No cooldown periods specified, exiting")
            return nil
          end

          # don't bother checking for a scaledown if desired capacity is
          # already at the minimum size...
          if group.desired_capacity == group.min_size
            hlog("Group is already at minimum size, exiting")
            return nil
          end

          scaledown_policies = group.scaling_policies.select do |policy|
            policy.scaling_adjustment < 0
          end

          scaledown_policies.each do |policy|
            hlog_ctx("policy: #{policy.name}") do
              alarms = policy.alarms.keys.map do |alarm_name|
                @cloudwatch.alarms[alarm_name]
              end

              # only consider disabled alarms (enabled alarms will trigger
              # the policy automatically)
              disabled_alarms = alarms.select do |alarm|
                !alarm.enabled?
              end

              unless disabled_alarms.all? {|alarm| alarm.state_value == "ALARM" }
                hlog("Not all alarms are in ALARM state")
                next
              end

              unless outside_cooldown_period(group)
                hlog("Group is not outside the specified cooldown periods")
                next
              end

              unless has_eligible_scaledown_instance(group)
                hlog("Group does not have an instance eligible for scaledown due to max_sunk_cost")
                next
              end

              hlog("Executing policy")
              policy.execute(:honor_cooldown => true)

              # no need to evaluate other scaledown policies
              return
            end
          end
        end
      end

      protected

      def has_eligible_scaledown_instance(group)
        return true if @max_sunk_cost.nil?

        group.ec2_instances.select {|i| i.status == :running }.each do |instance|
          # get amount of time until hitting the instance renewal time
          time_left = ((instance.launch_time.min - Time.now.min) % 60).minutes

          # if we're within 1 minute, assume we won't be able to terminate it
          # in time anyway and ignore it.
          if time_left > 1.minute and time_left < @max_sunk_cost
            # we only care if there is at least one instance within the window
            # where we can scale down
            return true
          end
        end

        false
      end

      def outside_cooldown_period(group)
        @cached_outside_cooldown ||= {}
        if @cached_outside_cooldown.has_key? group
          return @cached_outside_cooldown[group]
        end

        activities = previous_scaling_activities(group)
        return nil if activities.nil?

        if activities[:up]
          hlog "Last scale up #{(Time.now - activities[:up]).minutes.inspect} ago"
        end
        if activities[:down]
          hlog "Last scale down #{(Time.now - activities[:down]).minutes.inspect} ago"
        end
        result = true

        # check up-down
        if @up_down_cooldown && activities[:up] &&
           Time.now - activities[:up] < @up_down_cooldown
          result = false
        end

        # check down-down
        if @down_down_cooldown && activities[:down] &&
           Time.now - activities[:down] < @down_down_cooldown
          result = false
        end

        result
      end

      # Looks at the GroupDesiredCapacity metric for the specified
      # group, and finds the most recent change in value.
      #
      # @returns
      #   * nil if there was a problem getting data. There may have been
      #     scaling events or not, we don't know.
      #   * a hash with two keys, :up and :down, with values indicating
      #     when the last corresponding activity happened. If the
      #     activity was not seen in the examined time period, the value
      #     is nil.
      def previous_scaling_activities(group)
        metric = @cloudwatch.metrics.
          with_namespace("AWS/AutoScaling").
          with_metric_name("GroupDesiredCapacity").
          filter('dimensions', [{
            :name => "AutoScalingGroupName",
            :value => group.name
          }]).first

        return nil unless metric

        start_time = Time.now - [ @up_down_cooldown, @down_down_cooldown ].max
        end_time = Time.now

        stats = metric.statistics(
          :start_time => start_time,
          :end_time => end_time,
          :statistics => [ "Average" ],
          :period => 60)

        # check if we got enough datapoints... if we didn't, we need to
        # assume bad data and inform the caller. this code is basically
        # checking if the # of received datapoints is within 50% of the
        # expected datapoints.
        got_datapoints = stats.datapoints.length
        requested_datapoints = (end_time - start_time) / 60
        if !Vector.within_threshold(0.5, got_datapoints, requested_datapoints)
          return nil
        end

        # iterate over the datapoints in reverse, looking for the first
        # change in value, which should be the most recent scaling
        # activity
        activities = { :down => nil, :up => nil }
        last_value = nil
        stats.datapoints.sort {|a,b| b[:timestamp] <=> a[:timestamp] }.each do |dp|
          next if dp[:average].nil?

          unless last_value.nil?
            if dp[:average] != last_value
              direction = (last_value < dp[:average]) ? :down : :up
              activities[direction] ||= dp[:timestamp]
            end
          end

          last_value = dp[:average]
          break unless activities.values.any? {|v| v.nil? }
        end

        activities
      end
    end
  end
end
data/lib/vector/functions/predictive_scaling.rb
ADDED
@@ -0,0 +1,150 @@
module Vector
  module Function
    class PredictiveScaling
      include Vector::HLogger

      def initialize(options)
        @cloudwatch = options[:cloudwatch]
        @lookback_windows = options[:lookback_windows]
        @lookahead_window = options[:lookahead_window]
        @valid_threshold = options[:valid_threshold]
        @valid_period = options[:valid_period]
      end

      def run_for(group)
        hlog_ctx "group: #{group.name}" do
          return if @lookback_windows.length == 0

          scaleup_policies = group.scaling_policies.select do |policy|
            policy.scaling_adjustment > 0
          end

          scaleup_policies.each do |policy|
            hlog_ctx "policy: #{policy.name}" do

              policy.alarms.keys.each do |alarm_name|
                alarm = @cloudwatch.alarms[alarm_name]
                hlog_ctx "alarm: #{alarm.name} (metric #{alarm.metric.name})" do

                  unless alarm.enabled?
                    hlog "Skipping disabled alarm"
                    next
                  end

                  # Note that everywhere we say "load" what we mean is
                  # "metric value * number of nodes"
                  now_load, now_num = load_for(group, alarm.metric,
                    Time.now, @valid_period)

                  if now_load.nil?
                    hlog "Could not get current total for metric"
                    next
                  end

                  @lookback_windows.each do |window|
                    hlog_ctx "window: #{window.inspect}" do
                      then_load, _ = load_for(group, alarm.metric,
                        Time.now - window, @valid_period)

                      if then_load.nil?
                        hlog "Could not get past total value for metric"
                        next
                      end

                      # check that the past total utilization is within
                      # threshold% of the current total utilization
                      if @valid_threshold &&
                         !Vector.within_threshold(@valid_threshold, now_load, then_load)
                        hlog "Past metric total value not within threshold (current #{now_load}, then #{then_load})"
                        next
                      end

                      past_load, _ = load_for(group, alarm.metric,
                        Time.now - window + @lookahead_window,
                        alarm.period)

                      if past_load.nil?
                        hlog "Could not get past + #{@lookahead_window.inspect} total value for metric"
                        next
                      end

                      # now take the past total load and divide it by the
                      # current number of instances to get the predicted value
                      predicted_value = past_load.to_f / now_num
                      hlog "Predicted #{alarm.metric.name}: #{predicted_value}"

                      if check_alarm_threshold(alarm, predicted_value)
                        hlog "Executing policy"
                        policy.execute(honor_cooldown: true)

                        # don't need to evaluate further windows or policies on this group
                        return
                      end
                    end
                  end
                end
              end
            end
          end
        end
      end

      protected

      def check_alarm_threshold(alarm, value)
        case alarm.comparison_operator
        when "GreaterThanOrEqualToThreshold"
          value >= alarm.threshold
        when "GreaterThanThreshold"
          value > alarm.threshold
        when "LessThanThreshold"
          value < alarm.threshold
        when "LessThanOrEqualToThreshold"
          value <= alarm.threshold
        end
      end

      def load_for(group, metric, time, window)
        num_instances_metric = @cloudwatch.metrics.
          with_namespace("AWS/AutoScaling").
          with_metric_name("GroupInServiceInstances").
          filter('dimensions', [{
            :name => 'AutoScalingGroupName',
            :value => group.name
          }]).first

        unless num_instances_metric
          raise "Could not find GroupInServiceInstances metric for #{group.name}"
        end

        start_time = time - (window / 2)
        end_time = time + (window / 2)

        avg = average_for_metric(metric, start_time, end_time)
        num = average_for_metric(num_instances_metric, start_time, end_time)

        if avg.nil? || num.nil?
          return [ nil, nil ]
        end

        [ avg * num, num ]
      end

      def average_for_metric(metric, start_time, end_time)
        stats = metric.statistics(
          :start_time => start_time,
          :end_time => end_time,
          :statistics => [ "Average" ],
          :period => 60)

        return nil if stats.datapoints.length == 0

        sum = stats.datapoints.inject(0) do |r, dp|
          r + dp[:average]
        end

        sum.to_f / stats.datapoints.length
      end
    end
  end
end
data/vector.gemspec
ADDED
@@ -0,0 +1,26 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'vector/version'

Gem::Specification.new do |spec|
  spec.name          = "vector"
  spec.version       = Vector::VERSION
  spec.authors       = ["Zach Wily"]
  spec.email         = ["zach@zwily.com"]
  spec.summary       = %q{AWS Auto-Scaling Assistant}
  spec.homepage      = "http://github.com/instructure/vector"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "aws-sdk"
  spec.add_dependency "aws-asg-fleet"
  spec.add_dependency "activesupport"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
metadata
ADDED
@@ -0,0 +1,140 @@
--- !ruby/object:Gem::Specification
name: vector
version: !ruby/object:Gem::Version
  version: 0.0.1
  prerelease:
platform: ruby
authors:
- Zach Wily
autorequire:
bindir: bin
cert_chain: []
date: 2013-07-09 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: aws-sdk
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: aws-asg-fleet
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: activesupport
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: '1.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: '1.3'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
description:
email:
- zach@zwily.com
executables:
- vector
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- bin/vector
- lib/vector.rb
- lib/vector/cli.rb
- lib/vector/functions/flexible_down_scaling.rb
- lib/vector/functions/predictive_scaling.rb
- lib/vector/version.rb
- vector.gemspec
homepage: http://github.com/instructure/vector
licenses:
- MIT
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.23
signing_key:
specification_version: 3
summary: AWS Auto-Scaling Assistant
test_files: []
has_rdoc: