vector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +177 -0
- data/Rakefile +1 -0
- data/bin/vector +8 -0
- data/lib/vector.rb +82 -0
- data/lib/vector/cli.rb +193 -0
- data/lib/vector/functions/flexible_down_scaling.rb +187 -0
- data/lib/vector/functions/predictive_scaling.rb +150 -0
- data/lib/vector/version.rb +3 -0
- data/vector.gemspec +26 -0
- metadata +140 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2013 Instructure, Inc

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,177 @@
# Vector

Vector is a tool that augments your auto-scaling groups. The two
features currently offered are Predictive Scaling and Flexible Down
Scaling.

## Predictive scaling

Auto Scaling groups do a good job of responding to current
load conditions, but if you have a predictable load pattern,
it can be nice to scale up your servers a little bit *early*.
Some reasons you might want to do that are:

* If it takes several minutes for an instance to fully boot
  and ready itself for requests.
* If you have very serious (but predictable) spikes,
  it's nice to have the capacity in place before the spike
  starts.
* To give yourself a buffer of time if AWS APIs start
  throwing errors. If scaling up is going to fail, you'd
  rather it start failing with a little bit of time before
  you actually need the capacity so you can begin evasive maneuvers.

Vector examines your existing CloudWatch alarms tied to your Auto
Scaling groups, and predicts if they will be triggered in the future
based on what happened in the past.

**Note:** This only works with metrics that are averaged across your group -
like CPUUtilization or Load. If you auto-scale based on something
like QueueLength, Predictive Scaling will not work right for you.

For each lookback window you specify, Vector will first check the
current value of the metric * the number of nodes, and the past value of
the metric * the past number of nodes. If those numbers are close enough
(within the threshold specified by `--ps-valid-threshold`), then it will
continue.

Vector will then go back to the lookback window specified, and then
forward in time based on the lookahead window (`--ps-lookahead-window`).
It will compute the metric * number of nodes at that point to get a
predicted aggregate metric value for the near future. It then divides
that by the current number of nodes to get a predicted average value
for the metric. That is then compared against the alarm's threshold.

For example:

> You have an alarm that checks CPUUtilization of your group, and will
> trigger the alarm if that goes above 70%. Vector is configured to use a
> 1 week lookback window, a 1 hour lookahead window, and a valid-threshold
> of 0.8.
>
> The current value of CPUUtilization is 49%, and there are 2 nodes in the
> group. CPUUtilization 1 week ago was 53%, and there were 2 nodes in the
> group. Therefore, total current CPUUtilization is 98%, and 1 week ago was
> 106%. Those are within 80% of each other (valid-threshold), so we can
> continue with the prediction.
>
> The value of CPUUtilization 1 week ago, *plus* 1 hour was 45%, and
> there were 4 nodes in the group. We calculate total CPUUtilization for
> that time to be 180%. Assuming no new nodes are launched, the predicted
> average CPUUtilization for the group 1 hour from now is 180% / 2 = 90%.
> 90% is above the alarm's 70% threshold, so we trigger the scaleup
> policy.

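The same arithmetic, written out as a minimal Ruby sketch (this is only an illustration, not part of the gem; the variable names are invented, and the closeness check mirrors `Vector.within_threshold` from `lib/vector.rb`):

```ruby
# Numbers from the example above; all variable names are illustrative.
now_avg,   now_nodes   = 49.0, 2  # current CPUUtilization % and node count
then_avg,  then_nodes  = 53.0, 2  # one lookback window (1 week) ago
ahead_avg, ahead_nodes = 45.0, 4  # 1 week ago plus the 1 hour lookahead
alarm_threshold = 70.0
valid_threshold = 0.8

now_load  = now_avg  * now_nodes   # 98.0  (total "load" now)
then_load = then_avg * then_nodes  # 106.0 (total "load" 1 week ago)

# Mirrors Vector.within_threshold in lib/vector.rb: the two totals must
# be within 80% of each other for the prediction to be trusted.
valid = valid_threshold * now_load < then_load &&
        valid_threshold * then_load < now_load   # => true

if valid
  predicted_load  = ahead_avg * ahead_nodes     # 180.0
  predicted_value = predicted_load / now_nodes  # 90.0
  puts "trigger the scaleup policy" if predicted_value > alarm_threshold
end
```
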
If you use Predictive Scaling, you probably also want to use Flexible
Down Scaling (below) so that after scaling up in anticipation of load,
your scaledown policy doesn't quickly undo Vector's hard work. You
probably want to set `up-to-down-cooldown` to be close to the size of
your `lookahead-window`.

## Flexible Down Scaling

### Different Cooldown Periods

Auto Scaling Groups support the concept of "cooldown periods" - a window
of time after a scaling activity where no other activities should take
place. This is to give the group a chance to settle into the new
configuration before deciding whether another action is required.

However, Auto Scaling Groups only support specifying the cooldown period
*after* a certain activity - you can say "After a scale up, wait 5
minutes before doing anything else, and after a scale down, wait 15
minutes." What you can't do is say "After a scale up, wait 5 minutes for
another scale up, and 40 minutes for a scale down."

Vector lets you add custom up-to-down and down-to-down cooldown periods.
You create your policies and alarms in your Auto Scaling Groups like
normal, and then *disable* the alarm you want a custom cooldown period
applied to. Then you tell Vector what cooldown periods to use, and it
does the rest.

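To make the cooldown logic concrete, here is a minimal Ruby sketch of the decision Vector makes, paralleling `outside_cooldown_period` in `lib/vector/functions/flexible_down_scaling.rb` (the activity timestamps are invented for illustration):

```ruby
require 'active_support/time' # vector already depends on activesupport

up_down_cooldown   = 1.hour      # e.g. --fds-up-to-down 1h
down_down_cooldown = 15.minutes  # e.g. --fds-down-to-down 15m

# Invented timestamps for the most recent scaling activities of a group.
last_scale_up   = Time.now - 40.minutes
last_scale_down = Time.now - 2.hours

# A scaledown is only allowed once both cooldowns have elapsed
# (durations are compared as seconds).
outside_cooldowns =
  (Time.now - last_scale_up)   >= up_down_cooldown.to_i &&
  (Time.now - last_scale_down) >= down_down_cooldown.to_i

puts outside_cooldowns ? "scaledown allowed" : "still cooling down"
# => "still cooling down" (the last scale up was only 40 minutes ago)
```
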
### Multiple Alarms

Another benefit to Flexible Down Scaling is the ability to specify
multiple alarms for a scaling down policy and require *all* alarms to
trigger before scaling down. With Vector, you can add multiple
(disabled) alarms to a policy, and Vector will trigger the policy only
when *all* of those alarms are in ALARM state. This lets you do something like
"only scale down when CPU utilization is < 30% and there is not a
backlog of requests on any instances".

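The gating check itself is straightforward; a self-contained sketch of the idea (the `Alarm` struct here is a stand-in for the real CloudWatch alarm objects, not the gem's API):

```ruby
# Stand-ins for the CloudWatch alarm objects Vector inspects; the real
# code reads them through the aws-sdk CloudWatch API.
Alarm = Struct.new(:name, :enabled, :state_value) do
  def enabled?
    enabled
  end
end

alarms = [
  Alarm.new("cpu-below-30", false, "ALARM"),
  Alarm.new("no-request-backlog", false, "OK"),
]

# Mirrors the guard in flexible_down_scaling.rb: every *disabled* alarm
# attached to the policy must be firing before the policy is executed.
disabled_alarms = alarms.reject(&:enabled?)
ready = disabled_alarms.all? { |alarm| alarm.state_value == "ALARM" }

puts ready ? "execute scaledown policy" : "hold" # => "hold"
```
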
### Max Sunk Cost

Vector also lets you specify a "max sunk cost" when scaling down a node.
Amazon bills on hourly increments, and you pay a full hour for every
partial hour used, so you want your instances to terminate as close as
possible to their hourly billing renewal (without going past it).

For example, if you specify `--fds-max-sunk-cost 15m` and have two nodes
in your group - 47 minutes and 32 minutes away from their hourly billing
renewals - the group will not be scaled down.

(You should make sure to run Vector on an interval smaller than this
one, or else it's possible Vector may never find eligible nodes for
scaledown, and so never scale down.)

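A rough sketch of the eligibility test behind this, paralleling `has_eligible_scaledown_instance` in `flexible_down_scaling.rb` (the launch times are invented to match the example above):

```ruby
require 'active_support/time'

max_sunk_cost = 15.minutes # --fds-max-sunk-cost 15m

# Invented launch times for two running instances: 47 and 32 minutes
# away from their next hourly billing renewal, as in the example above.
launch_times = [Time.now - 193.minutes, Time.now - 148.minutes]

eligible = launch_times.any? do |launch_time|
  # Minutes until the instance crosses its next hourly billing boundary.
  time_left = ((launch_time.min - Time.now.min) % 60).minutes

  # Ignore instances within a minute of renewal; otherwise they are
  # eligible only if they renew within max_sunk_cost.
  time_left > 1.minute && time_left < max_sunk_cost
end

puts eligible ? "a node is eligible for scaledown" : "no eligible node"
# => "no eligible node" (47m and 32m are both further out than 15m)
```
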
## Requirements

* Auto Scaling groups must have the GroupInServiceInstances metric
  enabled.
* Auto Scaling groups must have at least one scaling policy with a
  positive adjustment, and that policy must have at least one
  CloudWatch alarm with a CPUUtilization metric.

## Installation

```bash
$ gem install vector
```

## Usage

Typically vector will be invoked via cron periodically (every 10 minutes
is a good choice).

```
Usage: vector [options]
DURATION can look like 60s, 1m, 5h, 7d, 1w
        --timezone TIMEZONE          Timezone to use for date calculations (like America/Denver) (default: system timezone)
        --region REGION              AWS region to operate in (default: us-east-1)
        --groups group1,group2       A list of Auto Scaling Groups to evaluate
        --fleet fleet                An AWS ASG Fleet (instead of specifying --groups)
    -q, --[no-]quiet                 Run quietly

Predictive Scaling Options
        --[no-]ps                    Enable Predictive Scaling
        --ps-lookback-windows DURATION,DURATION
                                     List of lookback windows
        --ps-lookahead-window DURATION
                                     Lookahead window
        --ps-valid-threshold FLOAT   A number from 0.0 - 1.0 specifying how closely previous load must match current load for Predictive Scaling to take effect
        --ps-valid-period DURATION   The period to use when doing the threshold check

Flexible Down Scaling Options
        --[no-]fds                   Enable Flexible Down Scaling
        --fds-up-to-down DURATION    The cooldown period between up and down scale events
        --fds-down-to-down DURATION  The cooldown period between down and down scale events
        --fds-max-sunk-cost DURATION Only let a scaledown occur if there is an instance this close to its hourly billing point
```

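Each DURATION argument is parsed by `Vector.time_string_to_seconds` (defined in `lib/vector.rb`), so a value is just a number plus an optional s/m/h/d/w unit. For example, assuming the gem and its dependencies are installed:

```ruby
require 'vector'

Vector.time_string_to_seconds("90")   # => 90 seconds (no unit defaults to seconds)
Vector.time_string_to_seconds("10m")  # => 10 minutes
Vector.time_string_to_seconds("1w")   # => 1 week
Vector.time_string_to_seconds("soon") # => nil (not a valid DURATION)
```
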
# Questions

### Why not just predictively scale based on the past DesiredInstances?

If we don't look at the actual utilization and just look at how many
instances we were running in the past, we will end up scaling earlier
and earlier, and we will never re-adjust if load patterns change and we
no longer need as much capacity.

### What about high availability? What if the box Vector is running on dies?

Luckily Vector is just providing optimizations - the critical component
of scaling up based on demand is still provided by the normal Auto
Scaling service. If Vector does not run, you just don't get the
predictive scaling and down scaling.
data/Rakefile
ADDED
@@ -0,0 +1 @@
require "bundler/gem_tasks"
data/bin/vector
ADDED
data/lib/vector.rb
ADDED
@@ -0,0 +1,82 @@
require 'aws-sdk'
require 'active_support/time'

require 'vector/version'

module Vector
  def self.time_string_to_seconds(string)
    if string =~ /^(\d+)([smhdw])?$/
      n = $1.to_i
      unit = $2 || 's'

      case unit
      when 's'
        n.seconds
      when 'm'
        n.minutes
      when 'h'
        n.hours
      when 'd'
        n.days
      when 'w'
        n.weeks
      end
    else
      nil
    end
  end

  def self.within_threshold(threshold, v1, v2)
    threshold * v1 < v2 && threshold * v2 < v1
  end

  module HLogger
    @@enabled = false
    def self.enable(bool)
      @@enabled = bool
    end

    def hlog_ctx(ctx, &block)
      @components ||= []
      @components << ctx
      yield
    ensure
      @components.pop
    end

    def hlog(string)
      return unless @@enabled

      tmp_components = @components.dup
      level = 0
      if @last_components
        @last_components.each do |last_c|
          break if tmp_components.empty?
          if last_c == tmp_components[0]
            level += 1
            tmp_components.shift
          else
            break
          end
        end
      end

      tmp_components.each do |component|
        name = if component.respond_to? :name
          component.name
        else
          component.to_s
        end
        puts "#{" " * level}#{name}"
        level += 1
      end

      puts "#{" " * level}#{string}"
      @last_components = @components.dup
    end
  end
end

require 'vector/cli'
require 'vector/functions/predictive_scaling'
require 'vector/functions/flexible_down_scaling'
data/lib/vector/cli.rb
ADDED
@@ -0,0 +1,193 @@
require 'optparse'
require 'aws-sdk'
require 'aws/auto_scaling/fleets'
require 'vector/functions/flexible_down_scaling'
require 'vector/functions/predictive_scaling'

module Vector
  class CLI
    def initialize(argv)
      @argv = argv
    end

    def run
      load_config

      auto_scaling = AWS::AutoScaling.new(:region => @config[:region])
      cloudwatch = AWS::CloudWatch.new(:region => @config[:region])

      # everything we do should be fine looking at a snapshot in time,
      # so memoizing should be fine when acting as a CLI.
      AWS.start_memoizing

      groups = if @config[:fleet]
        auto_scaling.fleets[@config[:fleet]].groups
      else
        @config[:groups].map do |group_name|
          auto_scaling.groups[group_name]
        end
      end

      if @config[:predictive_scaling][:enabled]
        psconf = @config[:predictive_scaling]

        ps = Vector::Function::PredictiveScaling.new(
          { :cloudwatch => cloudwatch }.merge(psconf))

        groups.each do |group|
          begin
            ps.run_for(group)
          rescue => e
            puts "error performing Predictive Scaling on #{group.name}: #{e.inspect}\n#{e.backtrace.join "\n"}"
          end
        end
      end

      if @config[:flexible_down_scaling][:enabled]
        fdsconf = @config[:flexible_down_scaling]

        fds = Vector::Function::FlexibleDownScaling.new(
          { :cloudwatch => cloudwatch }.merge(fdsconf))

        groups.each do |group|
          fds.run_for(group)
        end
      end
    end

    protected

    def load_config
      opts = {
        :quiet => false,
        :timezone => nil,
        :region => 'us-east-1',
        :groups => [],
        :fleet => nil,
        :predictive_scaling => {
          :enabled => false,
          :lookback_windows => [],
          :lookahead_window => nil,
          :valid_threshold => nil,
          :valid_period => 60 * 10
        },
        :flexible_down_scaling => {
          :enabled => false,
          :up_down_cooldown => nil,
          :down_down_cooldown => nil,
          :max_sunk_cost => nil
        }
      }

      optparser = OptionParser.new do |o|
        o.banner = "Usage: vector [options]"
        o.separator "DURATION can look like 60s, 1m, 5h, 7d, 1w"

        o.on("--timezone TIMEZONE", "Timezone to use for date calculations (like America/Denver) (default: system timezone)") do |v|
          opts[:timezone] = v
        end

        o.on("--region REGION", "AWS region to operate in (default: us-east-1)") do |v|
          opts[:region] = v
        end

        o.on("--groups group1,group2", Array, "A list of Auto Scaling Groups to evaluate") do |v|
          opts[:groups] = v
        end

        o.on("--fleet fleet", "An AWS ASG Fleet (instead of specifying --groups)") do |v|
          opts[:fleet] = v
        end

        o.on("-q", "--[no-]quiet", "Run quietly") do |v|
          opts[:quiet] = v
        end

        o.separator ""
        o.separator "Predictive Scaling Options"

        o.on("--[no-]ps", "Enable Predictive Scaling") do |v|
          opts[:predictive_scaling][:enabled] = v
        end

        o.on("--ps-lookback-windows DURATION,DURATION", Array, "List of lookback windows") do |v|
          opts[:predictive_scaling][:lookback_windows] =
            v.map {|w| Vector.time_string_to_seconds(w) }
        end

        o.on("--ps-lookahead-window DURATION", String, "Lookahead window") do |v|
          opts[:predictive_scaling][:lookahead_window] =
            Vector.time_string_to_seconds(v)
        end

        o.on("--ps-valid-threshold FLOAT", Float, "A number from 0.0 - 1.0 specifying how closely previous load must match current load for Predictive Scaling to take effect") do |v|
          opts[:predictive_scaling][:valid_threshold] = v
        end

        o.on("--ps-valid-period DURATION", String, "The period to use when doing the threshold check") do |v|
          opts[:predictive_scaling][:valid_period] =
            Vector.time_string_to_seconds v
        end

        o.separator ""
        o.separator "Flexible Down Scaling Options"

        o.on("--[no-]fds", "Enable Flexible Down Scaling") do |v|
          opts[:flexible_down_scaling][:enabled] = v
        end

        o.on("--fds-up-to-down DURATION", String, "The cooldown period between up and down scale events") do |v|
          opts[:flexible_down_scaling][:up_down_cooldown] =
            Vector.time_string_to_seconds v
        end

        o.on("--fds-down-to-down DURATION", String, "The cooldown period between down and down scale events") do |v|
          opts[:flexible_down_scaling][:down_down_cooldown] =
            Vector.time_string_to_seconds v
        end

        o.on("--fds-max-sunk-cost DURATION", String, "Only let a scaledown occur if there is an instance this close to its hourly billing point") do |v|
          time = Vector.time_string_to_seconds v
          if time > 1.hour
            puts "--fds-max-sunk-cost duration must be < 1 hour"
            exit 1
          end

          opts[:flexible_down_scaling][:max_sunk_cost] = time
        end

      end.parse!(@argv)

      if opts[:groups].empty? && opts[:fleet].nil?
        puts "No groups were specified."
        exit 1
      end

      if !opts[:groups].empty? && !opts[:fleet].nil?
        puts "You can't specify --groups and --fleet."
        exit 1
      end

      if opts[:predictive_scaling][:enabled]
        ps = opts[:predictive_scaling]
        if ps[:lookback_windows].empty? || ps[:lookahead_window].nil?
          puts "You must specify lookback windows and a lookahead window for Predictive Scaling."
          exit 1
        end
      end

      if opts[:flexible_down_scaling][:enabled]
        fds = opts[:flexible_down_scaling]
        if fds[:up_down_cooldown].nil? ||
           fds[:down_down_cooldown].nil?
          puts "You must specify both up-to-down and down-to-down cooldown periods for Flexible Down Scaling."
          exit 1
        end
      end

      Vector::HLogger.enable(!opts[:quiet])

      @config = opts
    end
  end
end
data/lib/vector/functions/flexible_down_scaling.rb
ADDED
@@ -0,0 +1,187 @@
require 'vector'

module Vector
  module Function
    class FlexibleDownScaling
      include Vector::HLogger

      def initialize(options)
        @cloudwatch = options[:cloudwatch]
        @up_down_cooldown = options[:up_down_cooldown]
        @down_down_cooldown = options[:down_down_cooldown]
        @max_sunk_cost = options[:max_sunk_cost]
      end

      def run_for(group)
        hlog_ctx("group: #{group.name}") do
          # don't check if no config was specified
          if @up_down_cooldown.nil? && @down_down_cooldown.nil?
            hlog("No cooldown periods specified, exiting")
            return nil
          end

          # don't bother checking for a scaledown if desired capacity is
          # already at the minimum size...
          if group.desired_capacity == group.min_size
            hlog("Group is already at minimum size, exiting")
            return nil
          end

          scaledown_policies = group.scaling_policies.select do |policy|
            policy.scaling_adjustment < 0
          end

          scaledown_policies.each do |policy|
            hlog_ctx("policy: #{policy.name}") do
              alarms = policy.alarms.keys.map do |alarm_name|
                @cloudwatch.alarms[alarm_name]
              end

              # only consider disabled alarms (enabled alarms will trigger
              # the policy automatically)
              disabled_alarms = alarms.select do |alarm|
                !alarm.enabled?
              end

              unless disabled_alarms.all? {|alarm| alarm.state_value == "ALARM" }
                hlog("Not all alarms are in ALARM state")
                next
              end

              unless outside_cooldown_period(group)
                hlog("Group is not outside the specified cooldown periods")
                next
              end

              unless has_eligible_scaledown_instance(group)
                hlog("Group does not have an instance eligible for scaledown due to max_sunk_cost")
                next
              end

              hlog("Executing policy")
              policy.execute(:honor_cooldown => true)

              # no need to evaluate other scaledown policies
              return
            end
          end
        end
      end

      protected

      def has_eligible_scaledown_instance(group)
        return true if @max_sunk_cost.nil?

        group.ec2_instances.select {|i| i.status == :running }.each do |instance|
          # get amount of time until hitting the instance renewal time
          time_left = ((instance.launch_time.min - Time.now.min) % 60).minutes

          # if we're within 1 minute, assume we won't be able to terminate it
          # in time anyway and ignore it.
          if time_left > 1.minute and time_left < @max_sunk_cost
            # we only care if there is at least one instance within the window
            # where we can scale down
            return true
          end
        end

        false
      end

      def outside_cooldown_period(group)
        @cached_outside_cooldown ||= {}
        if @cached_outside_cooldown.has_key? group
          return @cached_outside_cooldown[group]
        end

        activities = previous_scaling_activities(group)
        return nil if activities.nil?

        if activities[:up]
          hlog "Last scale up #{(Time.now - activities[:up]).minutes.inspect} ago"
        end
        if activities[:down]
          hlog "Last scale down #{(Time.now - activities[:down]).minutes.inspect} ago"
        end
        result = true

        # check up-down
        if @up_down_cooldown && activities[:up] &&
           Time.now - activities[:up] < @up_down_cooldown
          result = false
        end

        # check down-down
        if @down_down_cooldown && activities[:down] &&
           Time.now - activities[:down] < @down_down_cooldown
          result = false
        end

        result
      end

      # Looks at the GroupDesiredCapacity metric for the specified
      # group, and finds the most recent change in value.
      #
      # @returns
      #   * nil if there was a problem getting data. There may have been
      #     scaling events or not, we don't know.
      #   * a hash with two keys, :up and :down, with values indicating
      #     when the last corresponding activity happened. If the
      #     activity was not seen in the examined time period, the value
      #     is nil.
      def previous_scaling_activities(group)
        metric = @cloudwatch.metrics.
          with_namespace("AWS/AutoScaling").
          with_metric_name("GroupDesiredCapacity").
          filter('dimensions', [{
            :name => "AutoScalingGroupName",
            :value => group.name
          }]).first

        return nil unless metric

        start_time = Time.now - [ @up_down_cooldown, @down_down_cooldown ].max
        end_time = Time.now

        stats = metric.statistics(
          :start_time => start_time,
          :end_time => end_time,
          :statistics => [ "Average" ],
          :period => 60)

        # check if we got enough datapoints... if we didn't, we need to
        # assume bad data and inform the caller. this code is basically
        # checking if the # of received datapoints is within 50% of the
        # expected datapoints.
        got_datapoints = stats.datapoints.length
        requested_datapoints = (end_time - start_time) / 60
        if !Vector.within_threshold(0.5, got_datapoints, requested_datapoints)
          return nil
        end

        # iterate over the datapoints in reverse, looking for the first
        # change in value, which should be the most recent scaling
        # activity
        activities = { :down => nil, :up => nil }
        last_value = nil
        stats.datapoints.sort {|a,b| b[:timestamp] <=> a[:timestamp] }.each do |dp|
          next if dp[:average].nil?

          unless last_value.nil?
            if dp[:average] != last_value
              direction = (last_value < dp[:average]) ? :down : :up
              activities[direction] ||= dp[:timestamp]
            end
          end

          last_value = dp[:average]
          break unless activities.values.any? {|v| v.nil? }
        end

        activities
      end
    end
  end
end
data/lib/vector/functions/predictive_scaling.rb
ADDED
@@ -0,0 +1,150 @@
module Vector
  module Function
    class PredictiveScaling
      include Vector::HLogger

      def initialize(options)
        @cloudwatch = options[:cloudwatch]
        @lookback_windows = options[:lookback_windows]
        @lookahead_window = options[:lookahead_window]
        @valid_threshold = options[:valid_threshold]
        @valid_period = options[:valid_period]
      end

      def run_for(group)
        hlog_ctx "group: #{group.name}" do
          return if @lookback_windows.length == 0

          scaleup_policies = group.scaling_policies.select do |policy|
            policy.scaling_adjustment > 0
          end

          scaleup_policies.each do |policy|
            hlog_ctx "policy: #{policy.name}" do

              policy.alarms.keys.each do |alarm_name|
                alarm = @cloudwatch.alarms[alarm_name]
                hlog_ctx "alarm: #{alarm.name} (metric #{alarm.metric.name})" do

                  unless alarm.enabled?
                    hlog "Skipping disabled alarm"
                    next
                  end

                  # Note that everywhere we say "load" what we mean is
                  # "metric value * number of nodes"
                  now_load, now_num = load_for(group, alarm.metric,
                    Time.now, @valid_period)

                  if now_load.nil?
                    hlog "Could not get current total for metric"
                    next
                  end

                  @lookback_windows.each do |window|
                    hlog_ctx "window: #{window.inspect}" do
                      then_load, = load_for(group, alarm.metric,
                        Time.now - window, @valid_period)

                      if then_load.nil?
                        hlog "Could not get past total value for metric"
                        next
                      end

                      # check that the past total utilization is within
                      # threshold% of the current total utilization
                      if @valid_threshold &&
                          !Vector.within_threshold(@valid_threshold, now_load, then_load)
                        hlog "Past metric total value not within threshold (current #{now_load}, then #{then_load})"
                        next
                      end

                      past_load, = load_for(group, alarm.metric,
                        Time.now - window + @lookahead_window,
                        alarm.period)

                      if past_load.nil?
                        hlog "Could not get past + #{@lookahead_window.inspect} total value for metric"
                        next
                      end

                      # now take the past total load and divide it by the
                      # current number of instances to get the predicted value
                      predicted_value = past_load.to_f / now_num
                      hlog "Predicted #{alarm.metric.name}: #{predicted_value}"

                      if check_alarm_threshold(alarm, predicted_value)
                        hlog "Executing policy"
                        policy.execute(honor_cooldown: true)

                        # don't need to evaluate further windows or policies on this group
                        return
                      end
                    end
                  end
                end
              end
            end
          end
        end
      end

      protected

      def check_alarm_threshold(alarm, value)
        case alarm.comparison_operator
        when "GreaterThanOrEqualToThreshold"
          value >= alarm.threshold
        when "GreaterThanThreshold"
          value > alarm.threshold
        when "LessThanThreshold"
          value < alarm.threshold
        when "LessThanOrEqualToThreshold"
          value <= alarm.threshold
        end
      end

      def load_for(group, metric, time, window)
        num_instances_metric = @cloudwatch.metrics.
          with_namespace("AWS/AutoScaling").
          with_metric_name("GroupInServiceInstances").
          filter('dimensions', [{
            :name => 'AutoScalingGroupName',
            :value => group.name
          }]).first

        unless num_instances_metric
          raise "Could not find GroupInServiceInstances metric for #{group.name}"
        end

        start_time = time - (window / 2)
        end_time = time + (window / 2)

        avg = average_for_metric(metric, start_time, end_time)
        num = average_for_metric(num_instances_metric, start_time, end_time)

        if avg.nil? || num.nil?
          return [ nil, nil ]
        end

        [ avg * num, num ]
      end

      def average_for_metric(metric, start_time, end_time)
        stats = metric.statistics(
          :start_time => start_time,
          :end_time => end_time,
          :statistics => [ "Average" ],
          :period => 60)

        return nil if stats.datapoints.length == 0

        sum = stats.datapoints.inject(0) do |r, dp|
          r + dp[:average]
        end

        sum.to_f / stats.datapoints.length
      end
    end
  end
end
data/vector.gemspec
ADDED
@@ -0,0 +1,26 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'vector/version'

Gem::Specification.new do |spec|
  spec.name          = "vector"
  spec.version       = Vector::VERSION
  spec.authors       = ["Zach Wily"]
  spec.email         = ["zach@zwily.com"]
  spec.summary       = %q{AWS Auto-Scaling Assistant}
  spec.homepage      = "http://github.com/instructure/vector"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "aws-sdk"
  spec.add_dependency "aws-asg-fleet"
  spec.add_dependency "activesupport"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
metadata
ADDED
@@ -0,0 +1,140 @@
--- !ruby/object:Gem::Specification
name: vector
version: !ruby/object:Gem::Version
  version: 0.0.1
prerelease:
platform: ruby
authors:
- Zach Wily
autorequire:
bindir: bin
cert_chain: []
date: 2013-07-09 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: aws-sdk
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: aws-asg-fleet
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: activesupport
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: '1.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: '1.3'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
description:
email:
- zach@zwily.com
executables:
- vector
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- bin/vector
- lib/vector.rb
- lib/vector/cli.rb
- lib/vector/functions/flexible_down_scaling.rb
- lib/vector/functions/predictive_scaling.rb
- lib/vector/version.rb
- vector.gemspec
homepage: http://github.com/instructure/vector
licenses:
- MIT
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.23
signing_key:
specification_version: 3
summary: AWS Auto-Scaling Assistant
test_files: []
has_rdoc: