staugaard-cloudmaster 0.1.3 → 0.1.4
- data/VERSION.yml +1 -1
- data/app/active_set_factory.rb +16 -0
- data/app/active_set_none.rb +16 -0
- data/app/active_set_queue.rb +27 -0
- data/app/active_set_s3.rb +25 -0
- data/app/configuration.rb +85 -0
- data/app/default-config.ini +95 -0
- data/app/ec2_image_enumerator.rb +41 -0
- data/app/ec2_instance_enumerator.rb +25 -0
- data/app/instance.rb +146 -0
- data/app/instance_pool.rb +326 -0
- data/app/named_queue.rb +75 -0
- data/app/policy.rb +113 -0
- data/app/policy_daytime.rb +18 -0
- data/app/policy_factory.rb +16 -0
- data/app/policy_fixed.rb +19 -0
- data/app/policy_job.rb +54 -0
- data/app/policy_limit.rb +68 -0
- data/app/policy_manual.rb +36 -0
- data/app/policy_resource.rb +110 -0
- data/app/pool_configuration.rb +172 -0
- data/app/pool_manager.rb +239 -0
- data/app/pool_runner.rb +54 -0
- data/app/reporter.rb +81 -0
- data/app/status_parser_factory.rb +16 -0
- data/app/status_parser_lifeguard.rb +48 -0
- data/app/status_parser_std.rb +11 -0
- metadata +27 -1
data/app/policy_manual.rb
ADDED
@@ -0,0 +1,36 @@
require 'policy'

module Cloudmaster

  # Provide manual policy.
  # This policy only changes the instances when requested to do so.
  # This implementation uses a queue to convey manual requests
  # to the policy module.
  class PolicyManual < Policy
    def initialize(reporter, config, instances)
      super(reporter, config, instances)
      @config = config
      @sqs = AwsContext.instance.sqs
      manual_queue_name = @config.append_env(config[:manual_queue_name])
      @manual_queue = NamedQueue.new(manual_queue_name)
    end

    # Adjust never changes instances.
    def adjust
      n = 0
      # Read all the messages out of the manual queue.
      # Sum up all adjustments.
      while true
        messages = @manual_queue.read_messages(10)
        break(n) if messages.size == 0
        messages.each do |message|
          msg = YAML.load(message[:body])
          n += msg[:adjust]
          @manual_queue.delete_message(message[:receipt_handle])
        end
      end
      # the value of the while is n
    end
  end

end
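
For context on the message format PolicyManual expects: adjust YAML-loads each message body and sums the :adjust values, so a sender would place a serialized hash such as {:adjust => 2} on the manual queue. The standalone sketch below (not part of the gem; the queue transport itself lives in named_queue.rb, shown elsewhere in the diff) mirrors that arithmetic using the same YAML.load call as the code above:

require 'yaml'

# Bodies a sender might place on the manual queue (illustrative values).
bodies = [YAML.dump(:adjust => 2), YAML.dump(:adjust => -1)]

# PolicyManual#adjust YAML-loads each :body and sums the :adjust fields.
net_adjustment = bodies.inject(0) { |n, body| n + YAML.load(body)[:adjust] }
puts net_adjustment   # => 1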
data/app/policy_resource.rb
ADDED
@@ -0,0 +1,110 @@

require 'policy'

module Cloudmaster

  # Provides resource policy implementation.
  # Instances managed under a resource policy are expected to issue
  # periodic status messages, giving their estimated load (generally
  # between 0 and 1).
  class PolicyResource < Policy
    # Each policy object gets the configuration and the instance collection.
    def initialize(reporter, config, instances)
      super(reporter, config, instances)
      @config = config
    end

    # Activate the given number of shut_down instances.
    # We prefer those with highest load.
    # Return the number actually activated.
    def activate_shut_down_instances(number_to_activate)
      shutdown_instances = @instances.shut_down_instances.sort do |a,b|
        b.load_estimate - a.load_estimate
      end
      shutdown_instances = shutdown_instances[0..number_to_activate]
      shutdown_instances.each { |i| i.activate }
      shutdown_instances.each { |i| @reporter.info("Activating instance ", i.id) }
      shutdown_instances.size
    end

    # Shut down the given instances, by changing their state to shut_down.
    def shut_down_instances(instances_to_shut_down)
      instances = @instances.shut_down(instances_to_shut_down)
      instances.each {|i| @reporter.info("Shutting down instance ", i.id) }
      instances.size
    end

    # Shut down the given number of instances.
    # Shut down the ones with the lowest load.
    def shut_down_n_instances(number_to_shut_down)
      return if number_to_shut_down <= 0
      instances_with_lowest_load = @instances.sorted_by_lowest_load
      instances_to_shut_down = instances_with_lowest_load.find_all do |instance|
        # Don't stop instances before minimum_active_time
        instance.minimum_active_time_elapsed?
      end
      shut_down_instances(instances_to_shut_down[0...number_to_shut_down])
    end

    # Stop any shut down instances with load below threshold.
    # Also stop instances that have exceeded shut_down_interval.
    def clean_up_shut_down_instances
      idle_instances = @instances.shut_down_idle_instances
      timeout_instances = @instances.shut_down_timeout_instances
      stop_instances(idle_instances | timeout_instances)
    end

    # Adjust the instance pool up or down.
    # If no instances are running, and there are requests in the work queue, start
    # some.
    # Additional instances are added if the load is too high.
    # Instances are shut down, and then stopped if the load is low.
    def adjust
      depth = @config[:work_queue].empty_queue
      if @instances.active_instances.size == 0
        # capacity consumed by new arrivals
        new_load = depth.to_f / @config[:queue_load_factor].to_f
        initial = (new_load / @config[:target_upper_load].to_f).ceil
        @reporter.info("Resource policy need initial #{initial} depth: #{depth} new_load #{new_load}") if initial > 0
        return initial
      end
      if depth > 0
        @reporter.info("Resource policy residual depth: #{depth}")
        return 0
      end
      # the total capacity remaining below the upper bound
      excess_capacity = @instances.excess_capacity
      if excess_capacity == 0
        # need this many more running at upper bound
        over_capacity = @instances.over_capacity
        additional = (over_capacity / @config[:target_upper_load].to_f).ceil
        @reporter.info("Resource policy need additional #{additional} depth: #{depth} over_capacity #{over_capacity}")
        return additional
      end
      # how many are needed to carry the total load at the lower bound
      needed = (@instances.total_load / @config[:target_lower_load].to_f).ceil
      if needed < @instances.size
        excess = @instances.size - needed
        @reporter.info("Resource policy need fewer #{excess} depth: #{depth} needed #{needed}")
        return -excess
      end
      return 0
    end

    # We are not using the default apply, because we want to:
    # * activate shut down instances, if possible, otherwise start
    # * shut down instances if fewer are needed
    # * stop inactive or expired shut_down instances
    def apply
      n = @limit_policy.adjust(adjust)
      case
      when n > 0
        n -= activate_shut_down_instances(n)
        start_instances(n)
      when n < 0
        shut_down_n_instances(-n)
      end
      clean_up_shut_down_instances
    end
  end
end
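
To make the arithmetic in adjust concrete, here is a standalone worked example of the cold-start branch (no active instances yet); the numbers are illustrative, and in a real run queue_load_factor and target_upper_load come from the pool configuration:

depth             = 25      # requests waiting in the work queue
queue_load_factor = 10.0    # clients one server can carry
target_upper_load = 0.8     # keep each instance below this load

new_load = depth / queue_load_factor            # 2.5 servers' worth of load
initial  = (new_load / target_upper_load).ceil  # instances needed to stay under 0.8
puts initial                                    # => 4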
data/app/pool_configuration.rb
ADDED
@@ -0,0 +1,172 @@
require 'named_queue'
require 'ec2_image_enumerator'

module Cloudmaster

  # All configuration parameters passed in through the constructor.
  # Items with * must be defined
  #
  # ==aws_config==
  # aws_env -- used to form queue, instance, and s3 key names --
  #    typically development|test|production
  # *aws_access_key -- the AWS access key
  # *aws_secret_key -- the AWS secret key
  # *aws_user -- the user name, used to build the image name
  # *aws_bucket -- the bucket to use when storing the active set
  # *aws_keypair -- full path name of the keypair file to use for
  #    connecting to instances
  #
  # ==config==
  # ===GENERAL===
  # *name -- the name of this config
  # *policy -- :none, :job, :resource
  # ===QUEUES===
  # poll_interval -- how often to check work queue, etc (seconds)
  # receive_count -- how many status messages to receive at once
  # *work_queue -- name of work queue (aws_env)
  # *status_queue -- name of status queue (aws_env)
  # ===ACTIVE SET===
  # active_set_type -- which active set algorithm to use: :none, :s3, :queue
  # active_set_bucket -- the S3 bucket to use to store the active set
  # active_set_key -- the S3 key used to store the active set (aws_env)
  # active_set_interval -- how often to write active_set
  # ===INSTANCE CREATION PARAMETERS===
  # *ami_name -- the ami name to start and monitor (aws_env)
  # key_pair_name -- the name of the keypair to start the instance with
  # security_groups -- array of security group names to start the instance with
  # instance_type -- the ec2 instance type to create
  # user_data -- instance data made available to running instance
  #    through http://169.254.169.254/latest/user-data
  #    This is given as a hash, which is serialized by cloudmaster.
  #
  # ===INSTANCE MANAGEMENT POLICIES===
  # policy_interval -- how often to apply job or resource policy
  # audit_instance_interval -- how often (in minutes) to audit instances (-1 for never)
  # maximum_number_of_instances -- the max number to allow
  # minimum_number_of_instances -- the min number to allow
  # ===INSTANCE START POLICIES===
  # start_limit -- how many instances to start at one time
  # ===INSTANCE STOP POLICIES===
  # stop_limit -- how many to stop at one time
  # minimum_lifetime -- don't stop an instance unless it has run this long (minutes)
  # minimum_active_time -- the minimum amount of time (in minutes) that an instance
  #    may remain in the active state
  # watchdog_interval -- if a machine does not report status in this interval, it is
  #    considered to be hung, and is stopped
  # ===JOB POLICIES===
  # start_threshold -- if work queue size is greater than start_threshold * number of
  #    active instances, start more instances
  # idle_threshold -- if more than idle_threshold active instances with load 0
  #    exist, stop some of them
  # ===RESOURCE POLICIES===
  # target_upper_load -- try to keep instances below this load
  # target_lower_load -- try to keep instances above this load
  # queue_load_factor -- the portion of the load that a single queue entry represents.
  #    If a server can serve a maximum of 10 clients, then this is 10.
  # shut_down_threshold -- stop instances that have load_estimate below this value
  # shut_down_interval -- stop instances that have been in shut_down state for
  #    longer than this interval
  # ===MANUAL POLICIES===
  # manual_queue_name -- the name of the queue used to send manual instance adjustments
  # ===REPORTING===
  # summary_interval -- how often to give summary
  # instance_log -- if set, it is a pathname to a directory where individual log files
  #    are written for each instance
  # instance_report_interval -- how often to show instance reports

  # PoolConfiguration holds the configuration parameters for one pool.
  # It also stores aws parameters and defaults, providing a single lookup mechanism
  # for all.
  # If lookup fails, it raises an exception.

  class PoolConfiguration
    # Create a new PoolConfiguration. The default parameters
    # are used if the desired parameter is not given.
    def initialize(aws_config, default, config)
      # these parameters merge the defaults and the given parameters
      # merged parameters are also evaluated
      @merge_params = [:user_data]
      @aws_config = aws_config
      @default = default
      @config = config
    end

    # Get a parameter, either from aws_config, config or default.
    # Don't raise an exception if there is no value.
    def get(param)
      @aws_config[param] || @config[param] || @default[param]
    end

    # Get a parameter, either from config or from default.
    # Raise an exception if there is none.
    def [](param)
      if @default.nil?
        raise "Missing defaults"
      end
      config_param = @aws_config[param] || @config[param]
      if (res = config_param || @default[param]).nil?
        raise "Missing config: #{param}"
      end
      begin
        if @merge_params.include?(param)
          # fix up default param if needed -- it must be a hash
          @default[param] = {} if @default[param].nil?
          @default[param] = eval(@default[param]) if @default[param].is_a?(String)
          if config_param
            @default[param].merge(eval(config_param))
          else
            @default[param]
          end
        else
          res
        end
      rescue
        raise "Config bad format: #{param} #{config_param} #{$!}"
      end
    end

    # Store (create or replace) a parameter.
    def []=(param, val)
      @config[param] = val
    end

    def append_env(name)
      aws_env = @aws_config[:aws_env]
      aws_env.nil? || aws_env == '' ? name : "#{name}-#{aws_env}"
    end

    # Test to see that the derived parameters are valid.
    def valid?
      @config[:ami_id] &&
        @config[:work_queue] && @config[:work_queue].valid? &&
        @config[:status_queue] && @config[:status_queue].valid?
    end

    # Looks up a queue given its name.
    # Stores the result in config under the given key (if given).
    # Returns the queue.
    # Raises an exception if none found.
    def setup_queue(key, name)
      return nil unless name
      name = append_env(@config[name])
      queue = NamedQueue.new(name)
      raise "Bad configuration -- no queue #{name}" if !queue
      @config[key] = queue if key
      queue
    end

    # Looks up the image, given its name.
    # Stores the result in config under the given key (if given).
    # Returns the image.
    # Raises an exception if none found.
    def setup_image(key, name)
      return nil unless name
      name = append_env(@config[name])
      image = EC2ImageEnumerator.new.find_image_id_by_name(name)
      raise "Bad configuration -- no image #{name}" if !image
      @config[key] = image if key
      image
    end

  end
end
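
A standalone sketch of the lookup precedence that get and [] implement (aws_config wins, then the pool's own config, then the defaults) and of the append_env name decoration; the hashes and values below are illustrative, not taken from the gem:

aws_config = { :aws_env => 'production' }
config     = { :name => 'render-pool', :poll_interval => '15' }
default    = { :poll_interval => '60', :receive_count => '10' }

lookup = lambda { |param| aws_config[param] || config[param] || default[param] }

puts lookup.call(:poll_interval)   # => "15"  (pool config overrides the default)
puts lookup.call(:receive_count)   # => "10"  (falls back to the default)

# append_env-style decoration, as applied to queue and image names:
name = 'work-queue'
puts "#{name}-#{aws_config[:aws_env]}"   # => "work-queue-production"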
data/app/pool_manager.rb
ADDED
@@ -0,0 +1,239 @@
require 'periodic'
require 'pp'
require 'logger'
require 'reporter'
require 'instance_pool'
require 'aws_context'
require 'policy_factory'
require 'active_set_factory'
require 'status_parser_factory'
require 'logger_factory'
require 'policy'

module Cloudmaster

  # PoolManager
  #
  # Manages one InstancePool, which is a collection of EC2 instances
  # running the same image.
  # The PoolManager is responsible for starting and terminating
  # instances.
  # Its policies are meant to balance acceptable performance while
  # minimizing cost.
  # To help achieve this goal, the PoolManager receives
  # status reports from instances, through a status queue.
  #
  # Two classes of policies are defined: job and resource.
  # These roughly correspond to stateless and stateful services.
  #
  # ==Job Policy==
  # In the job policy, instances are assigned work through a work queue.
  # * Each request is stateless, and can be serviced by any instance.
  # * Each instance processes one request at a time.
  # * Each instance is either starting_up or active.
  # * Once it is active, it is either busy (load 1.0) or idle (load 0.0).
  # At startup, the instance reports when it is ready to begin processing, and
  # enters the active state.
  # Each instance reports the load through the status queue when it
  # starts/stops processing a job.
  #
  # The job policy aims to keep the work queue to a reasonable size while not
  # maintaining an excessive number of idle instances.
  #
  # ==Resource Policy==
  # Instances managed by the resource policy have stateful associations with
  # clients, and provide them services on demand.
  # * Each instance processes requests made by clients as requested.
  # * An external entity (the allocator) assigns clients to instances
  #   based on an instance report, which lists the active instances
  #   and their associated load.
  # * The instance report (called the active set) is stored in
  #   S3, at a configurable bucket and key.
  # * The allocator assigns clients to instances, and also creates a
  #   work-queue entry each time it assigns a new client.
  # * The allocator is expected to assign clients only to those instances
  #   listed in the active set.
  # * The work queue is emptied by cloudmaster.
  # * Each instance may be starting_up, active, or shutting_down.
  # * At startup, the instance reports when it is ready to begin processing,
  #   and enters the active state.
  # * The policy decides when to shut down an instance.
  #   It puts it in the shut_down state, but does not stop
  #   it immediately (to avoid disturbing existing clients).
  #   Instances in shutting_down state with zero load, or that have
  #   remained in this state for an excessive time, are stopped.
  # * Active instances are available to accept new clients;
  #   shutting_down instances are not.
  # During any given time period, each instance can be partially busy (load
  # between 0.0 and 1.0).
  # Each instance periodically reports its load estimate for that period through
  # the status queue.
  # The resource policy seeks to maintain a load between an
  # upper threshold and a lower threshold.
  # It starts instances or stops them to achieve this.

  class PoolManager
    attr_reader :instances, :logger # for testing only

    # Set up PoolManager.
    # Creates objects used to access SQS and EC2.
    # Creates instance pool, policy classes, reporter, and queues.
    # Actual processing does not start until "run" is called.
    def initialize(config)
      # set up AWS access objects
      keys = [ config[:aws_access_key], config[:aws_secret_key]]
      aws = AwsContext.instance
      @ec2 = aws.ec2(*keys)
      @sqs = aws.sqs(*keys)
      @s3 = aws.s3(*keys)
      @config = config

      # set up reporter
      @logger = LoggerFactory.create(@config[:logger], @config[:logfile])
      @reporter = Reporter.setup(@config[:name], @logger)

      # Create instance pool.
      # Used to keep track of instances in the pool.
      @instances = InstancePool.new(@reporter, @config)

      # Create a policy class
      @policy = PolicyFactory.create(@config[:policy], @reporter, @config, @instances)

      # Create ActiveSet
      @active_set = ActiveSetFactory.create(@config[:active_set_type], @config)

      # Create StatusParser
      @status_parser = StatusParserFactory.create(@config[:status_parser])

      unless @config[:instance_log].empty?
        @reporter.log_instances(@config[:instance_log])
      end

      # Look up the work queues and the image from their names.
      # Have policy do most of the work.
      @work_queue = @config.setup_queue(:work_queue, :work_queue_name)
      @status_queue = @config.setup_queue(:status_queue, :status_queue_name)
      @ami_id = @config.setup_image(:ami_id, :ami_name)

      @keep_running = true
    end

    # Main loop of cloudmaster
    #
    # * Reads and processes status messages.
    # * Starts and stops instances according to policies
    # * Detects hung instances, and stops them.
    # * Displays periodic reports.
    def run(end_time = nil)
      summary_period = Periodic.new(@config[:summary_interval].to_i)
      instance_report_period = Periodic.new(@config[:instance_report_interval].to_i)
      policy_period = Periodic.new(@config[:policy_interval].to_i)
      active_set_period = Periodic.new(@config[:active_set_interval].to_i * 60)
      audit_instances_period = Periodic.new(@config[:audit_instance_interval].to_i * 60)

      # loop reading messages from the status queue
      while keep_running(end_time) do
        # update instance list and get queue depth
        audit_instances_period.check do
          @instances.audit_existing_instances
        end

        @work_queue.read_queue_depth
        break unless @keep_running

        # start first instance, if necessary, and ensure the
        # number of running instances stays between maximum and minimum
        @policy.ensure_limits
        break unless @keep_running

        # handle status and log messages
        process_messages(@config[:receive_count].to_i)

        # update public dns (for new instances) and show summary reports
        @instances.update_public_dns_all
        summary_period.check do
          @reporter.info("Instances: #{@instances.size} Queue Depth: #{@work_queue.queue_depth}")
        end
        instance_report_period.check do
          @reporter.info("---Instance Summary---")
          @instances.each do |instance|
            @reporter.info(" #{instance.id} #{instance.report}\n")
          end
          @reporter.info("----------------------")
        end
        break unless @keep_running

        # Based on queue depth and load_estimate, make a decision on
        # whether to start or stop servers.
        policy_period.check { @policy.apply }

        active_set_period.check { update_active_set }

        # Stop instances that have not given recent status.
        @policy.stop_hung_instances
        break unless @keep_running

        Clock.sleep @config[:poll_interval].to_i
      end
    end

    # Shut down the manager.
    # This may take a little time.
    def shutdown
      @keep_running = false
    end

    private

    # Process a batch of status and log messages.
    # Status messages update the instance usage information, and
    # log messages are just logged.
    # Observed behavior is that only one message is returned per call
    # to SQS, no matter how many are requested.
    def process_message_batch(count)
      # read some messages
      messages = @status_queue.read_messages(count)
      messages.each do |message|
        # parse message
        msg = @status_parser.parse_message(message[:body])
        case msg[:type]
        when "status"
          # save the status and load_estimate
          @instances.update_status(msg)
        when "log"
          # just log the message
          @reporter.info(msg[:message], msg[:instance_id])
        end
        # delete the message once it has been processed
        @status_queue.delete_message(message[:receipt_handle])
      end
      messages.size
    end

    # Process messages (up to count)
    # Continue until there are no messages remaining.
    def process_messages(count)
      n_remaining = count
      while n_remaining > 0
        n = process_message_batch(n_remaining)
        break if n == 0
        n_remaining -= n
      end
    end

    # Write active set if it has changed since the last write.
    def update_active_set
      @active_set.update(@instances.active_set)
    end

    # Returns true if the manager should keep running.
    def keep_running(end_time)
      if end_time && Clock.now > end_time
        false
      else
        @keep_running
      end
    end

  end
end
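
The comment above process_message_batch notes that SQS often returns fewer messages than requested; process_messages compensates by looping until the requested count is consumed or a batch comes back empty. A standalone illustration of that draining logic, with a stubbed batch reader standing in for the status queue (names here are illustrative, not from the gem):

# Drain up to `count` messages, asking for another batch until one is empty.
def drain(count)
  remaining = count
  while remaining > 0
    handled = yield remaining      # stands in for process_message_batch
    break if handled == 0
    remaining -= handled
  end
  count - remaining                # total messages actually processed
end

batches = [1, 1, 0]                # e.g. SQS hands back one message per call
puts drain(10) { |_n| batches.shift }   # => 2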
data/app/pool_runner.rb
ADDED
@@ -0,0 +1,54 @@
require 'pool_configuration'
require 'pool_manager'

module Cloudmaster

  # PoolRunner
  #
  # Manages separate PoolManagers, each in a separate thread.
  #
  # Knows how to start (run) and stop (shutdown) the pools.
  #
  # Creates a thread for each pool in config, and runs a PoolManager in it.
  # This needs to be passed a configuration, normally an InifileConfig object.
  # The configuration object contains all the information needed to control
  # the pools, including the number of pools and each one's characteristics.
  class PoolRunner
    attr_reader :pool_managers # for testing only
    # Create empty runner. Until the run method is called, the
    # individual pool managers are not created.
    def initialize(config)
      @config = config
      @pool_managers = []
      Signal.trap("INT") do
        self.shutdown
      end
    end

    # Create each of the pool managers described in the configuration.
    # We can limit the amount of time it runs, for testing purposes only.
    # In testing we can call run again after it returns, so we make sure
    # that we only create pool managers the first time through.
    def run(limit = nil)
      if @pool_managers == []
        @config.pools.each do |pool_config|
          # Wrap pool config parameters up with defaults.
          config = PoolConfiguration.new(@config.aws, @config.default, pool_config)
          @pool_managers << PoolManager.new(config)
        end
      end
      threads = []
      @pool_managers.each do |pool_manager|
        threads << Thread.new(pool_manager) do |mgr|
          mgr.run(limit)
        end
      end
      threads.each { |thread| thread.join }
    end

    # Shut down each of the pool managers.
    def shutdown
      @pool_managers.each { |mgr| mgr.shutdown }
    end
  end
end
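
A hypothetical top-level wiring sketch, not part of the diff. It assumes an InifileConfig class exposing #aws, #default and #pools (the comments above reference such an object and PoolRunner#run reads those three attributes, but that class is not included in this diff), and the require and config-file path are purely illustrative:

require 'inifile_config'   # assumed require; the class is not shown in this diff
require 'pool_runner'

config = InifileConfig.new('default-config.ini')   # assumed API and path
runner = Cloudmaster::PoolRunner.new(config)
runner.run          # blocks: one thread per configured pool
# Ctrl-C sends SIGINT; the trap installed in initialize calls shutdown,
# which asks each PoolManager to stop after its current loop iteration.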
data/app/reporter.rb
ADDED
@@ -0,0 +1,81 @@
require 'instance_logger'

module Cloudmaster

  # Creates and outputs log messages
  # These are formatted with a timestamp and an instance name.
  # This remembers the log device, which is anything with puts.
  # This is treated as a global. It is initialized by calling "Reporter.setup"
  # and then anyone can get a copy by calling "Reporter.instance".
  class Reporter
    attr_accessor :level

    NONE = 0
    ERROR = 1
    WARNING = 2
    INFO = 3
    TRACE = 4
    DEBUG = 5
    ALL = 10

    # Reporter displays the given name on every line.
    # reports go to the given log (an IO).
    def initialize(name, log)
      @level = ALL
      @name = name
      @log = log || STDOUT
      @instance_logger = nil
    end

    def Reporter.setup(name, log)
      new(name, log)
    end

    def log_instances(dir)
      @instance_logger = InstanceLogger.new(dir)
    end

    # Log a message
    def log(message, *opts)
      send_to_log("INFO:", message, *opts)
    end

    def err(msg, *opts)
      send_to_log("ERROR:", msg, *opts) if @level >= ERROR
    end
    alias error err

    def warning(msg, *opts)
      send_to_log("WARNING:", msg, *opts) if @level >= WARNING
    end

    def info(msg, *opts)
      send_to_log("INFO:", msg, *opts) if @level >= INFO
    end

    def trace(msg, *opts)
      send_to_log("TRACE:", msg, *opts) if @level >= TRACE
    end

    def debug(msg, *opts)
      send_to_log("DEBUG:", msg, *opts) if @level >= DEBUG
    end

    private

    def send_to_log(type, message, instance_id = nil)
      msg = [type, format_timestamp(Clock.now), @name]
      msg << instance_id if instance_id
      msg << message
      message = msg.join(' ')
      @log.puts(message)
      if instance_id && @instance_logger
        @instance_logger.puts(instance_id, message)
      end
    end

    def format_timestamp(ts)
      "#{Clock.now.strftime("%m-%d-%y %H:%M:%S")}"
    end
  end
end
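
A minimal usage sketch for Reporter, based only on the methods shown above. It assumes the gem's Clock helper (used by send_to_log but required elsewhere in the package) is already loaded; the pool name and instance id are illustrative:

require 'reporter'

reporter = Cloudmaster::Reporter.setup('render-pool', STDOUT)
reporter.level = Cloudmaster::Reporter::WARNING

reporter.info('suppressed: INFO is above the WARNING threshold')
reporter.error('no status report received', 'i-12345678')
# prints something like:
# ERROR: 06-01-09 12:00:00 render-pool i-12345678 no status report received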