staugaard-cloudmaster 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,326 @@
1
+ require 'aws_context'
2
+ require 'ec2_instance_enumerator'
3
+ require 'instance'
4
+ require 'yaml'
5
+
6
module Cloudmaster

  # Stores and operates on a collection of instances.
  #
  # Internally, instances are stored as an array of Instance objects, and the
  # pool is Enumerable over that array.
  class InstancePool
    include Enumerable

    # Create an instance pool.
    # This class knows how to start and stop instances, and how to detect
    # that new instances have come about, or existing ones have gone away.
    # The constructor takes:
    # [reporter] receives info and error messages produced by the pool
    # [config]   describes configurable instance properties
    def initialize(reporter, config)
      @ec2 = AwsContext.instance.ec2
      @reporter = reporter
      @config = config
      @state_change_time = Clock.now
      @instances = [] # holds Instance objects
    end

    # private

    # Create and return the options hash passed to EC2 run_instances.
    # The security groups may be configured either as a list or as a String
    # holding a Ruby expression; if the String cannot be evaluated, fall back
    # to [:default].
    def start_opts
      groups = @config[:security_groups]
      if groups.kind_of?(String)
        begin
          # SECURITY: eval runs whatever Ruby the configuration contains.
          # Kept for backward compatibility with existing configuration
          # files; only use configuration from a trusted source.
          groups = eval(groups)
        rescue StandardError, SyntaxError
          # A malformed expression raises SyntaxError, which is not a
          # StandardError, so it has to be named explicitly here.
          groups = [:default]
        end
      end
      { :key_name => @config[:key_pair_name],
        :user_data => YAML.dump(@config[:user_data]),
        :security_groups => groups,
        :instance_type => @config[:instance_type] }
    end

    # Return the larger of a and b.
    def max2(a, b)
      a > b ? a : b
    end

    # Allows iteration through instances.
    # So enumeration on InstancePool is implicitly enumeration
    # on @instances.
    # Without a block, return an enumerator.
    def each(&block)
      return enum_for(:each) unless block
      @instances.each(&block)
    end

    # Return the first instance, or the first n instances when a count is
    # given (same contract as Array#first).
    def first(*args)
      @instances.first(*args)
    end

    # Return the number of instances in the pool.
    def size
      @instances.size
    end

    # Create an instance and add to the list.
    # Return the newly created instance.
    def add(id, public_dns)
      new_instance = Instance.new(id, public_dns, @config)
      @instances << new_instance
      new_instance
    end

    # Delete the instance from the list.
    def delete(instance)
      @instances.delete(instance)
    end

    # Find an instance given its instance id.
    # Return nil if there is no such instance.
    def find_by_id(id)
      find { |i| i.id == id }
    end

    # Return a list of all instance ids.
    def id_list
      map { |i| i.id }
    end

    # Return the maximum number of instances allowed.
    def maximum
      @config[:maximum_number_of_instances].to_i
    end

    # Return the minimum number of instances allowed.
    def minimum
      @config[:minimum_number_of_instances].to_i
    end

    # Return true if the number of instances is less than the minimum.
    def less_than_minimum?
      size < minimum
    end

    # Return true if the number of instances is more than the maximum.
    def greater_than_maximum?
      size > maximum
    end

    # Return the count of instances below the minimum (0 if none).
    def below_minimum_count
      less_than_minimum? ? minimum - size : 0
    end

    # Return the count of instances above the maximum (0 if none).
    def above_maximum_count
      greater_than_maximum? ? size - maximum : 0
    end

    # Return a list of instances missing public_dns.
    def missing_public_dns_instances
      select { |i| i.public_dns.nil? || i.public_dns.empty? }
    end

    # Return ids of all instances missing a public_dns.
    def missing_public_dns_ids
      missing_public_dns_instances.map { |i| i.id }
    end

    # Find the instance identified by id and update its public_dns.
    # If there is no dns information, or no such instance, then skip it.
    def update_public_dns(id, public_dns)
      return if public_dns.nil? || public_dns.empty?
      instance = find_by_id(id)
      instance.public_dns = public_dns if instance
    end

    # Return instances that have not seen status in watchdog_interval.
    def hung_instances
      select { |i| i.watchdog_time_elapsed? }
    end

    # Return all instances in active state.
    def active_instances
      select { |i| i.state == :active }
    end

    # Return all instances in shut_down state.
    def shut_down_instances
      select { |i| i.state == :shut_down }
    end

    # Return instances that are active and have load <= 0.
    def active_idle_instances
      target_load = 0
      active_instances.select { |i| i.load_estimate <= target_load }
    end

    # Return the shut_down instances whose load is at or below the
    # configured shut_down_threshold.
    # Shut down is not the same as stop -- the instances continue to
    # provide service, but are no longer allocated new clients.
    def shut_down_idle_instances
      target_load = @config[:shut_down_threshold].to_i
      shut_down_instances.select { |i| i.load_estimate <= target_load }
    end

    # Return instances that are shut_down and have
    # time_since_state_change > shut_down_interval.
    # The configured interval is multiplied by 60 before the comparison.
    def shut_down_timeout_instances
      shut_down_interval = @config[:shut_down_interval].to_i * 60
      shut_down_instances.select { |i| i.time_since_state_change > shut_down_interval }
    end

    # Return the latest state change time of any instance.
    # The value is remembered, so it never moves backwards even after the
    # instance that produced it has left the pool.
    def state_change_time
      @state_change_time = inject(@state_change_time) do |latest, instance|
        max2(latest, instance.state_change_time)
      end
    end

    # Return the sum of all the extra capacity of active instances
    # that have excess capacity (load less than target load).
    def excess_capacity
      target_load = @config[:target_upper_load].to_f
      active_instances.inject(0) do |sum, instance|
        sum + max2(target_load - instance.load_estimate, 0)
      end
    end

    # Return the sum of capacity in excess of the target upper load.
    def over_capacity
      target_load = @config[:target_upper_load].to_f
      active_instances.inject(0) do |sum, instance|
        sum + max2(instance.load_estimate - target_load, 0)
      end
    end

    # Return the total load of all active instances.
    def total_load
      active_instances.inject(0) { |sum, instance| sum + instance.load_estimate }
    end

    # Update the status of an instance using the contents of the status message.
    # A message from an instance we do not know is reported as an error,
    # unless its id is the literal 'unknown'.
    def update_status(msg)
      id = msg[:instance_id]
      instance = find_by_id(id)
      if instance
        instance.update_status(msg)
      else
        @reporter.error("Received status message from unknown instance: #{id}") unless id == 'unknown'
      end
    end

    # Return a YAML encoded representation of the active set.
    # The active set describes the id, public DNS, and load estimate of
    # each active instance in the pool.
    def active_set
      message = active_instances.map do |instance|
        { :id => instance.id,
          :public_dns => instance.public_dns,
          :load_estimate => instance.load_estimate }
      end
      YAML.dump(message)
    end

    # Return all the instances, sorted by lowest load estimate.
    # Instances that have lived beyond the minimum lifetime are sorted before
    # the ones that have not; within each group, lower load comes first.
    def sorted_by_lowest_load
      # sort_by evaluates the lifetime predicate once per instance, so the
      # ordering stays consistent even if the predicate flips mid-sort.
      @instances.sort_by do |instance|
        [instance.minimum_lifetime_elapsed? ? 0 : 1, instance.load_estimate]
      end
    end

    # Find all instances for which we don't have a public_dns.
    # For each one, see if EC2 now has the public DNS. If so, store it.
    def update_public_dns_all
      missing_ids = missing_public_dns_ids
      return if missing_ids.empty?
      EC2InstanceEnumerator.new(missing_ids).each do |instance|
        update_public_dns(instance[:id], instance[:public_dns])
      end
    end

    # Return instances that match our ami_id that are either pending
    # or running.
    # These instances are as returned by ec2: containing fields such as
    # :id and :public_dns.
    def our_running_instances
      EC2InstanceEnumerator.new.select do |instance|
        instance[:image_id] == @config[:ami_id] &&
          %w[pending running].include?(instance[:state])
      end
    end

    # Audit the list of instances based on what is currently known to EC2.
    # In other words, bring our list of instances into agreement with the instances
    # EC2 knows about by
    # (1) adding instances that EC2 knows but that we do not, and
    # (2) deleting instances that EC2 no longer knows about.
    # This is used initially to build the instance list, and
    # periodically thereafter to catch instances started or stopped
    # outside cloudmaster.
    def audit_existing_instances
      running_instances = our_running_instances

      # add running instances that we don't have
      running_instances.each do |running|
        next if find_by_id(running[:id])
        add(running[:id], running[:public_dns])
        @reporter.info("Instance discovered #{running[:public_dns]}", running[:id])
      end

      # delete instances that are no longer running
      running_ids = running_instances.map { |running| running[:id] }
      # select builds a separate array first; deleting from @instances while
      # iterating over it would skip the element after each deletion.
      vanished = select { |instance| !running_ids.include?(instance.id) }
      vanished.each do |instance|
        delete(instance)
        @reporter.info("Instance disappeared #{instance.public_dns}", instance.id)
      end
    end

    # Start the given number of instances.
    # Remember started instances by creating an Instance object and storing it
    # in the pool.
    # Return an array of the ones we just started.
    def start_n_instances(number_to_start)
      return [] if number_to_start <= 0
      started_instances = @ec2.run_instances(@config[:ami_id], 1,
                                             number_to_start, start_opts)[:instances]
      started_instances.map do |started_instance|
        # the public dns is not available yet
        add(started_instance[:id], nil)
      end
    end

    # Stop the given set of instances.
    # Remove stopped instances from the pool.
    # Return an array of stopped instances.
    def stop_instances(instances_to_stop)
      # Work on a snapshot so that removing from the pool cannot disturb the
      # iteration, even when the caller passes the pool's own contents.
      instances_to_stop.to_a.dup.map do |instance|
        @ec2.terminate_instances(instance.id.to_s)
        delete(instance)
        instance
      end
    end

    # Shut down the given set of instances.
    # Set the state to shut_down.
    # Return an array of shut down instances.
    def shut_down(instances_to_shut_down)
      instances_to_shut_down.map do |instance|
        instance.shutdown
        instance
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'aws_context'
2
+
3
module Cloudmaster

  # Implements a queue specified by name.
  # The queue name is used to look up the actual SQS queue.
  # The queue name is used in a substring match, so possibly more than one
  # string may match.
  # If multiple queues match the given name, the NamedQueue raises an
  # exception. If none matches, it tries to create one.
  # Implements the read and delete operations.
  # It also caches queue_depth, so it need not query SQS every time
  # it is needed.
  class NamedQueue
    attr_reader :queue_depth

    # Create a queue given the queue name.
    # The SQS interface is taken from AwsContext: it is used to interact
    # with Amazon.
    # The queue name is given, and is used to look up the actual queue.
    # Raises an exception if there is more than one queue matching the given
    # name.
    # If there is no queue with the given name, create one; raises if the
    # creation yields nothing.
    def initialize(queue_name)
      @sqs = AwsContext.instance.sqs
      queues = @sqs.list_queues(queue_name)
      @queue_depth = 0
      @queue = case queues.length
               when 0
                 queue = @sqs.create_queue(queue_name)
                 raise "Bad Configuration -- no queue: #{queue_name}" if queue.nil?
                 queue
               when 1
                 queues.first
               else
                 raise "Bad configuration -- multiple queues match #{queue_name}"
               end
    end

    # Read some messages off a queue.
    # Return an array of the messages that were read.
    # In practice a single read has not been seen to return more than one
    # message, but callers are prepared for more.
    def read_messages(count = 1)
      @sqs.receive_messages(@queue, count)
    end

    # Delete a message from the queue given its receipt handle.
    def delete_message(receipt_handle)
      @sqs.delete_message(@queue, receipt_handle)
    end

    # Read and discard all messages on the queue.
    # Return the number of messages read.
    def empty_queue
      discarded = 0
      loop do
        msgs = read_messages
        break if msgs.nil? || msgs.empty?
        # Count messages, not read calls: a read may return several.
        discarded += msgs.size
        msgs.each { |msg| delete_message(msg[:receipt_handle]) }
      end
      discarded
    end

    # Get queue depth and store it.
    # If the attribute lookup returns nothing usable, keep the old value.
    # The last value read is available through the queue_depth attribute.
    def read_queue_depth
      attr = 'ApproximateNumberOfMessages'
      attrs = @sqs.get_queue_attributes(@queue, attr)
      # NOTE(review): the value is stored exactly as SQS returned it; callers
      # compare it numerically, so confirm the client returns a number.
      @queue_depth = attrs[attr] if attrs.respond_to?(:has_key?) && attrs.has_key?(attr)
      @queue_depth
    end
  end
end
data/app/policy.rb ADDED
@@ -0,0 +1,113 @@
1
+ require 'policy_limit'
2
+
3
module Cloudmaster

  # Provides the common data and behaviors for policies.
  # This includes storing the configuration and instance collection.
  # It also includes implementing essential methods such as ensure_limits and
  # stop_hung_instances.
  # Derived classes supply the adjust method; the start/stop helpers are
  # protected so that they can use or override them.
  class Policy
    # [reporter]  receives info and error messages
    # [config]    configuration hash shared with the limit policy
    # [instances] the instance pool this policy manages
    def initialize(reporter, config, instances)
      @reporter = reporter
      @config = config
      @instances = instances
      @limit_policy = PolicyLimit.new(reporter, config, instances)
    end

    # Make sure there are at least the minimum instances running.
    # Also make sure there are no more than maximum number of instances.
    # If not, start or stop enough to conform.
    # These actions bypass the creation and termination limits normally in place.
    # Returns the number started or stopped, or nil when nothing had to change.
    def ensure_limits
      n = @limit_policy.adjust_limits
      if n > 0
        start_instances(n)
      elsif n < 0
        stop_instances(-n)
      end
    end

    # If instances have not sent status in a long time, they
    # are probably hung, and should be stopped.
    def stop_hung_instances
      # Ask the pool once, so the reported count and the stopped list agree.
      hung = @instances.hung_instances
      return if hung.size == 0
      @reporter.info("Stopping hung instances #{hung.size}")
      stop_instances(hung)
    end

    # Default policy application function.
    # Calculate the number to adjust by, and then start or
    # stop instances accordingly.
    def apply
      n = @limit_policy.adjust(adjust)
      if n > 0
        start_instances(n)
      elsif n < 0
        stop_instances(-n)
      else
        0
      end
    end

    # Returns the number of additional instances to start (positive)
    # or the number to stop (negative).
    # This must be implemented by derived class; the base version raises
    # NotImplementedError.
    def adjust
      raise NotImplementedError, "#{self.class.name}#adjust is not implemented."
    end

    protected

    # Start the given number of instances.
    # Reports an error if fewer were started than requested.
    # Return the number started.
    def start_instances(number_to_start)
      started_instances = @instances.start_n_instances(number_to_start)
      if started_instances.size < number_to_start
        @reporter.error("Failed to start requested number of instances. (#{started_instances.size} instead of #{number_to_start})")
      end
      started_instances.each { |started| @reporter.info("Started instance #{started.id}") }
      started_instances.size
    end

    # Stop the given list of instances.
    # Don't stop ones whose minimum lifetime has not elapsed, unless force
    # is true.
    # Returns the number stopped.
    def stop_instances_list(instances_to_stop, force = false)
      # NOTE(review): InstancePool#sorted_by_lowest_load calls
      # minimum_lifetime_elapsed? on the same objects; confirm that Instance
      # really responds to minimum_time_elapsed? as well.
      eligible = instances_to_stop.select do |instance|
        force || instance.minimum_time_elapsed?
      end
      stopped = @instances.stop_instances(eligible)
      stopped.each { |i| @reporter.info("Terminating instance ", i.id) }
      stopped.size
    end

    # Stop a given number of instances.
    # Return the number stopped (0 when asked to stop none).
    def stop_n_instances(number_to_stop)
      return 0 if number_to_stop <= 0
      # stop the instances with the lowest load estimate
      instances_with_lowest_load = @instances.sorted_by_lowest_load
      stop_instances_list(instances_with_lowest_load[0...number_to_stop])
    end

    # Stop instances.
    # If passed a number, it stops that number of instances.
    # If passed a list of instances, it stops those instances; any extra
    # parameters (such as force) are handed on to stop_instances_list.
    # Raises for any other kind of argument.
    def stop_instances(to_stop, *params)
      case to_stop
      when Integer # covers every integer class, on old and new Rubies alike
        stop_n_instances(to_stop)
      when Array
        stop_instances_list(to_stop, *params)
      else
        raise "Bad call -- stop_instances #{to_stop.class}"
      end
    end
  end

end
@@ -0,0 +1,18 @@
1
+ require 'policy'
2
+
3
module Cloudmaster

  # Example daytime policy.
  # Grows the pool during the day, one instance per adjustment.
  # This is provided as part of an example on how to create custom policies.
  class PolicyDaytime < Policy
    # Ask for one more instance when the hour reported by Clock lies in
    # 10..17 and the pool is still smaller than the configured
    # minimum_number_of_instances_daytime; otherwise ask for no change.
    def adjust
      hour = Clock.hour
      daytime = hour >= 10 && hour <= 17
      return 0 unless daytime

      daytime_minimum = @config[:minimum_number_of_instances_daytime].to_i
      @instances.size < daytime_minimum ? 1 : 0
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # This is a factory for Policy implementations.
2
+ require 'factory'
3
+
4
module Cloudmaster
  # Builds Policy objects from a policy name.
  class PolicyFactory
    include Factory

    # Create the policy called policy_name ('default' when nil).
    # The file policy_<name> is required, then the class Policy<Name> is
    # built through Factory with the remaining params.
    # Raises if Factory returns nothing for that class name.
    def self.create(policy_name, *params)
      name = policy_name.nil? ? 'default' : policy_name.to_s
      require "policy_#{name.downcase}"
      class_name = "Policy#{name.capitalize}"
      Factory.create_object_from_string(class_name, *params) ||
        raise("Bad configuration -- no policy #{class_name}")
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require 'policy'
2
+
3
module Cloudmaster

  # Fixed policy: it never asks for the pool to grow or shrink.
  # Everything except adjust is inherited unchanged from Policy.
  class PolicyFixed < Policy
    # Always request no change in the number of instances.
    def adjust
      0
    end
  end

end
data/app/policy_job.rb ADDED
@@ -0,0 +1,54 @@
1
+
2
+ require 'policy'
3
+
4
module Cloudmaster

  # Job policy.
  # Instances managed by the job policy take work from a queue and report
  # status through another queue.
  class PolicyJob < Policy
    # Hand reporter, config and instances to the base class, and keep a
    # reference to the configuration.
    def initialize(reporter, config, instances)
      super
      @config = config
    end

    # Adjust the pool size.
    # Returns a positive count when the work queue is backing up, a negative
    # count when the queue is empty and active instances sit idle, else 0.
    def adjust
      depth = @config[:work_queue].queue_depth

      # Grow: there is work but no instance at all, or the backlog has
      # reached the start threshold.
      if (depth > 0 && @instances.size == 0) || depth >= @config[:start_threshold].to_i
        # one instance per start_threshold queued items, but at least one
        needed = [(depth.to_f / @config[:start_threshold].to_f).floor, 1].max
        additional = [needed - @instances.size, 0].max
        @reporter.info("Job policy need additional #{additional} depth: #{depth} needed #{needed}")
        return additional
      end

      # Work is still queued -- leave the pool alone.
      return 0 if depth > 0

      # Shrink: the queue is empty, so look for idle active instances.
      idle = @instances.active_idle_instances
      active = @instances.active_instances
      return 0 unless idle.size > 0

      # stop a fraction of the idle instances, set by idle_threshold
      excess = (idle.size / @config[:idle_threshold].to_f).round
      @reporter.info("Job policy need fewer #{excess} idle: #{idle.size} active #{active.size}")
      -excess
    end

    # uses the default apply
  end
end
@@ -0,0 +1,68 @@
1
module Cloudmaster

  # This enforces the limits on starting and stopping instances.
  # It can be used by other policies to make sure their own adjust method
  # does not increase or decrease by more than the configurable start limit or stop limit.
  # This also provides an explanation for its actions through the reporter.
  class PolicyLimit
    # [reporter]  receives an info message for every limit that is applied
    # [config]    supplies :start_limit and :stop_limit
    # [instances] the instance pool whose size, minimum and maximum are consulted
    def initialize(reporter, config, instances)
      @reporter = reporter
      @config = config
      @instances = instances
    end

    # Return the larger of a and b.
    def max2(a, b)
      a > b ? a : b
    end

    # Make sure there are at least the minimum instances running.
    # Also make sure there are no more than maximum number of instances.
    # Return the number to start (positive) or stop (negative) to stay within the limits.
    # Do not enforce start_limit and stop_limit.
    def adjust_limits
      if @instances.less_than_minimum?
        number_to_start = @instances.below_minimum_count
        @reporter.info("Less than minimum -- start more #{number_to_start}")
        number_to_start
      elsif @instances.greater_than_maximum?
        number_to_stop = @instances.above_maximum_count
        @reporter.info("Greater than maximum -- stop some #{number_to_stop}")
        -number_to_stop
      else
        0
      end
    end

    # After other policies have computed a value for adjust, then this one possibly
    # modifies the value by ensuring that the start_limit and stop_limit constraints are
    # honored. It also makes sure that the adjustment makes the instance count
    # stay between the minimum and maximum.
    # [n] requested change: positive to start, negative to stop
    # Returns the possibly reduced change, with the same sign convention.
    def adjust(n)
      if n > 0
        start_limit = @config[:start_limit].to_i
        if n > start_limit
          @reporter.info("Limit start -- requested: #{n} limit: #{start_limit}")
          n = start_limit
        end
        # room left before the pool reaches its maximum
        remaining = max2(@instances.maximum - @instances.size, 0)
        if n > remaining
          @reporter.info("Limit start -- requested: #{n} remaining: #{remaining}")
          n = remaining
        end
      elsif n < 0
        stop_limit = @config[:stop_limit].to_i
        if -n > stop_limit
          @reporter.info("Limit stop -- requested: #{-n} limit: #{stop_limit}")
          n = -stop_limit
        end
        # how many can go before the pool drops below its minimum
        remaining = max2(@instances.size - @instances.minimum, 0)
        if -n > remaining
          # Report the positive count, like the stop_limit message above.
          @reporter.info("Limit stop -- requested: #{-n} remaining: #{remaining}")
          n = -remaining
        end
      end
      n
    end
  end
end