ey_stonith 0.1.4 → 0.1.5.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. data/bin/ey-monitor +2 -5
  2. data/bin/{ey-monitor-reset → stonith} +1 -1
  3. data/bin/{ey-monitor-stop → stonith-check} +1 -1
  4. data/bin/stonith-claim +5 -0
  5. data/bin/{ey-monitor-resume → stonith-cron} +1 -1
  6. data/bin/stonith-notify +5 -0
  7. data/bin/stonith-reset +5 -0
  8. data/bin/stonith-resume +5 -0
  9. data/bin/stonith-status +5 -0
  10. data/bin/{ey-monitor-status → stonith-stop} +1 -1
  11. data/bin/stonith-takeover +5 -0
  12. data/lib/ey_stonith/address_stealer.rb +1 -6
  13. data/lib/ey_stonith/awsm_notifier.rb +28 -29
  14. data/lib/ey_stonith/check_recorder.rb +19 -17
  15. data/lib/ey_stonith/commands/abstract.rb +94 -0
  16. data/lib/ey_stonith/commands/check.rb +58 -0
  17. data/lib/ey_stonith/commands/claim.rb +113 -0
  18. data/lib/ey_stonith/commands/commands.rb +26 -0
  19. data/lib/ey_stonith/commands/cron.rb +40 -0
  20. data/lib/ey_stonith/commands/help.rb +16 -0
  21. data/lib/ey_stonith/commands/not_found.rb +11 -0
  22. data/lib/ey_stonith/commands/notify.rb +85 -0
  23. data/lib/ey_stonith/commands/reset.rb +21 -0
  24. data/lib/ey_stonith/commands/resume.rb +19 -0
  25. data/lib/ey_stonith/commands/status.rb +23 -0
  26. data/lib/ey_stonith/commands/stop.rb +21 -0
  27. data/lib/ey_stonith/commands/takeover.rb +106 -0
  28. data/lib/ey_stonith/commands.rb +40 -0
  29. data/lib/ey_stonith/config.rb +107 -14
  30. data/lib/ey_stonith/data.rb +5 -1
  31. data/lib/ey_stonith/database.rb +28 -6
  32. data/lib/ey_stonith/history.rb +1 -1
  33. data/lib/ey_stonith.rb +2 -8
  34. metadata +57 -61
  35. data/lib/ey_stonith/abstract_master.rb +0 -15
  36. data/lib/ey_stonith/box.rb +0 -61
  37. data/lib/ey_stonith/cli.rb +0 -138
  38. data/lib/ey_stonith/local_master.rb +0 -28
  39. data/lib/ey_stonith/master.rb +0 -37
  40. data/lib/ey_stonith/meta_data.rb +0 -11
  41. data/lib/ey_stonith/slave.rb +0 -41
@@ -0,0 +1,85 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+ class Notify < Abstract
5
+ def self.command
6
+ 'notify'
7
+ end
8
+
9
+ def self.banner
10
+ 'Notify the provisioning server about a takeover'
11
+ end
12
+
13
+ def notify_path
14
+ @notify_path ||= config.notify_path
15
+ end
16
+
17
+ def invoke
18
+ abort_if_unintentional
19
+
20
+ notify_path.delete if notify_path.exist?
21
+
22
+ database.with_data do |data|
23
+ abort_if_master(data.hostname)
24
+ notify!
25
+ end
26
+
27
+ abort_no_data
28
+ end
29
+
30
+ def abort_if_unintentional
31
+ return if @force || notify_path.exist?
32
+
33
+ abort <<-ERROR
34
+ Cannot notify, #{notify_path} does not exist.
35
+ If you want to (destructively) notify AWSM that this instance is master, call with --force.
36
+ ERROR
37
+ end
38
+
39
+ def abort_if_master(hostname)
40
+ if config.meta_data_hostname != hostname
41
+ Stonith.logger.error "Cannot notify, I am not master. Master is #{hostname}. Cancelling notify."
42
+ abort "Cannot notify, I am not master."
43
+ end
44
+ end
45
+
46
+ def abort_no_data
47
+ msg = "Cannot notify, there is no master in the database. Giving up."
48
+ Stonith.logger.error msg
49
+ abort msg
50
+ end
51
+
52
+ def notify!
53
+ notifier = AwsmNotifier.new(config.meta_data_id, config.notify_uri, config.awsm_credentials)
54
+ notifier.notify(method(:success), method(:unreachable), method(:refused))
55
+ end
56
+
57
+ def success
58
+ Stonith.logger.info "AWSM notified!"
59
+ history << :notified
60
+ exit
61
+ end
62
+
63
+ def unreachable
64
+ notify_path.open('w') {}
65
+ msg = "Unable to reach AWSM for promotion to master."
66
+ Stonith.logger.warn msg
67
+ abort "#{msg}\nIf you're running this from the command line, you should run ey-monitor-notify again."
68
+ end
69
+
70
+ def refused(response_body)
71
+ msg = "Notify refused by endpoint. Giving up.\nResponse: #{response_body}"
72
+ Stonith.logger.error msg
73
+ abort msg
74
+ end
75
+
76
+ def parser
77
+ super.on('-f', '--force', "Force the command (only applicable to claim currently)") do |f|
78
+ @force = f
79
+ end
80
+ super
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,21 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+ class Reset < Abstract
5
+ def self.command
6
+ 'reset'
7
+ end
8
+
9
+ def self.banner
10
+ "Reset the state of all stonith commands"
11
+ end
12
+
13
+ def invoke
14
+ database.reset
15
+ config.state_path.rmtree
16
+ Stonith.logger.info "All state and database reset!"
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,19 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+ class Resume < Abstract
5
+ def self.command
6
+ 'resume'
7
+ end
8
+
9
+ def self.banner
10
+ "Resume Stonith monitoring"
11
+ end
12
+
13
+ def invoke
14
+ config.stop_path.delete if config.stop_path.exist?
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,23 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+ class Status < Abstract
5
+ def self.command
6
+ 'status'
7
+ end
8
+
9
+ def self.banner
10
+ "List the last one or two actions performed"
11
+ end
12
+
13
+ def invoke
14
+ puts status unless status.empty?
15
+ end
16
+
17
+ def status
18
+ @status ||= history.to_s
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,21 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+ class Stop < Abstract
5
+ def self.command
6
+ 'stop'
7
+ end
8
+
9
+ def self.banner
10
+ "Stop Stonith monitoring"
11
+ end
12
+
13
+ def invoke
14
+ config.stop_path.open('w') {}
15
+ sleep(0.5) until history.last == "stop"
16
+ puts "takeover" if history.include?(:takeover)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,106 @@
1
+ require 'pathname'
2
+
3
+ module EY
4
+ module Stonith
5
+ module Commands
6
+ class Takeover < Abstract
7
+ def self.command
8
+ 'takeover'
9
+ end
10
+
11
+ def self.banner
12
+ 'Cause a takeover of the specified master'
13
+ end
14
+
15
+ def invoke
16
+ abort_if_no_instance_id
17
+ abort_if_self
18
+
19
+ abort_if_takeover_lock
20
+ set_takeover_lock
21
+
22
+ database.with_locked_data do |data|
23
+ restore_data_on_fail(data)
24
+
25
+ if instance_id == data.instance_id
26
+ locked!(data.instance_id, data.ip)
27
+ else
28
+ relent!
29
+ end
30
+ end
31
+ end
32
+
33
+ def abort_if_no_instance_id
34
+ if !instance_id || instance_id == ""
35
+ abort "Please call with the instance_id of the master to takeover.\n\n#{parser}"
36
+ end
37
+ end
38
+
39
+ def abort_if_self
40
+ if instance_id == config.meta_data_id
41
+ abort "Cannot takeover self!"
42
+ end
43
+ end
44
+
45
+ def abort_if_takeover_lock
46
+ if config.takeover_path.exist?
47
+ abort "Already attempting takeover!\n#{config.takeover_path.read}"
48
+ end
49
+ end
50
+
51
+ def takeover_path
52
+ @takeover_path ||= config.takeover_path
53
+ end
54
+
55
+ def set_takeover_lock
56
+ at_exit { takeover_path.delete if takeover_path.exist? }
57
+ takeover_path.open('w') { |f| f << "Takeover started at #{Time.now}" }
58
+ end
59
+
60
+ def restore_data_on_fail(data)
61
+ at_exit do
62
+ if database.set(data) # always replace the data if it wasn't set
63
+ Stonith.logger.error("Emergency replacement of redis data #{data.inspect} occurred.")
64
+ end
65
+ end
66
+ end
67
+
68
+ def locked!(instance_id, master_ip)
69
+ Stonith.logger.info "Locked! Taking over #{instance_id}."
70
+ history << :takeover
71
+
72
+ ip = steal_address(instance_id, master_ip)
73
+ data = Data.new(config.meta_data_hostname, config.meta_data_id, ip)
74
+ database.set data
75
+
76
+ exec "#{SCRIPT_NAME} notify --force#{command_options}"
77
+ end
78
+
79
+ def relent!
80
+ history << :relent
81
+ msg = "Failed to grab lock, relenting."
82
+ Stonith.logger.info msg
83
+ abort msg
84
+ end
85
+
86
+ def steal_address(instance_id, ip)
87
+ Stonith.logger.info "Stealing IP #{ip} from #{instance_id}."
88
+ address = AddressStealer.new(instance_id, ip, config.fog_credentials)
89
+ address.associate(config.meta_data_id)
90
+ address.ip
91
+ end
92
+
93
+ def instance_id
94
+ @instance_id
95
+ end
96
+
97
+ def parser
98
+ super.on('-i', '--instance INSTANCE_ID', "Amazon Instance ID of the failing master") do |instance|
99
+ @instance_id = instance
100
+ end
101
+ super
102
+ end
103
+ end
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,40 @@
1
+ module EY
2
+ module Stonith
3
+ module Commands
4
+
5
+ dir = 'ey_stonith/commands/'
6
+ autoload :Abstract, dir + 'abstract'
7
+ autoload :NotFound, dir + 'not_found'
8
+
9
+ COMMANDS = Hash.new(:NotFound).merge({
10
+ 'check' => :Check,
11
+ 'claim' => :Claim,
12
+ 'commands' => :Commands,
13
+ 'cron' => :Cron,
14
+ 'help' => :Help,
15
+ 'notify' => :Notify,
16
+ 'reset' => :Reset,
17
+ 'resume' => :Resume,
18
+ 'status' => :Status,
19
+ 'stop' => :Stop,
20
+ 'takeover' => :Takeover,
21
+ })
22
+
23
+ COMMANDS.each do |command, klass|
24
+ autoload klass, dir + command
25
+ end
26
+
27
+ def self.invoke(argv)
28
+ klass = const_get(COMMANDS[argv.shift])
29
+ klass.new(argv).call
30
+ end
31
+
32
+ def self.formatted_command_list
33
+ COMMANDS.sort { |(a,_),(b,_)| a <=> b }.map { |cmd, klass_name|
34
+ klass = const_get(klass_name)
35
+ " #{cmd.ljust(8)} #{klass.banner}"
36
+ }.join("\n")
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,26 +1,119 @@
1
1
  require 'yaml'
2
2
  require 'json'
3
+ require 'open-uri'
3
4
 
4
5
  module EY
5
6
  module Stonith
6
- class Config < Struct.new(:ey_cloud, :redis_yml, :dna_json, :history_path, :pid_path, :log_path)
7
- def initialize(*args)
8
- super *args.map { |arg| arg && Pathname.new(arg) }
7
+ class Config
8
+ class Error < StandardError
9
+ def initialize(path, message)
10
+ super("Config file #{path}: #{message}")
11
+ end
9
12
  end
10
-
11
- def cloud_credentials() YAML::load_file(ey_cloud) end
12
- def notify_uri() "#{cloud_credentials[:api]}/api/promote_instance_to_master" end
13
13
 
14
- def monitor_heartbeat() 10 end
15
- def redis_key() 'ey:stonith' end
16
- def redis_db() 14 end
17
- def redis_host() redis[:host] end # support sometimes changes this, do not cache!
18
- def redis_port() redis[:port] end
19
- def master_hostname_from_dna() dna['master_app_server']['private_dns_name'] end
14
+ class FileNotFound < Error
15
+ def initialize(path)
16
+ super(path, "File not found.")
17
+ end
18
+ end
19
+
20
+ class RequiredSetting < Error
21
+ def initialize(path, key)
22
+ super(path, "Missing required setting #{key}.")
23
+ end
24
+ end
25
+
26
+ DEFAULT = {
27
+ 'log' => '/var/log/stonith.log',
28
+ 'state_dir' => '/var/run/stonith',
29
+ 'heartbeat' => 10,
30
+ 'notify_uri' => nil,
31
+
32
+ 'monitor_host' => nil,
33
+ 'monitor_path' => '/haproxy/monitor',
34
+
35
+ 'redis_host' => nil,
36
+ 'redis_port' => 6379,
37
+ 'redis_key' => 'stonith',
38
+ 'redis_db' => 14,
39
+ 'redis_timeout' => 60 * 5,
40
+
41
+ 'aws_secret_id' => nil,
42
+ 'aws_secret_key' => nil,
43
+ }
44
+
45
+ def initialize(config_path)
46
+ path = Pathname.new(config_path)
47
+ raise FileNotFound.new(path) unless path.readable?
48
+
49
+ realpath = path.realpath
50
+ @data = DEFAULT.merge(YAML.load_file(realpath))
51
+ @path = realpath
52
+ end
53
+
54
+ attr_reader :path
55
+
56
+ def log_path() pathname(log) end
57
+ def state_path() ensure_exists(pathname(state_dir)) end
58
+ def stop_path() state_path + 'stop' end
59
+ def claim_path() state_path + 'claim' end
60
+ def checks_path() state_path + 'checks' end
61
+ def notify_path() state_path + 'notify' end
62
+ def history_path() state_path + 'history' end
63
+ def takeover_path() state_path + 'takeover' end
64
+ def notify_uri() URI.parse(method_missing('notify_uri')) end
65
+
66
+ def meta_data_hostname() @data['meta_data_hostname'] ||= meta_data('local-hostname') end
67
+ def meta_data_id() @data['meta_data_id'] ||= meta_data('instance-id') end
68
+ def meta_data_ip() @data['meta_data_ip'] || meta_data('public-ipv4') end # don't cache
69
+
70
+ def respond_to?(meth)
71
+ @data.key?(meth) || super
72
+ end
73
+
74
+ def awsm_credentials
75
+ {
76
+ 'aws_secret_id' => aws_secret_id,
77
+ 'aws_secret_key' => aws_secret_key,
78
+ }
79
+ end
80
+
81
+ def fog_credentials
82
+ {
83
+ :aws_access_key_id => aws_secret_id,
84
+ :aws_secret_access_key => aws_secret_key,
85
+ }
86
+ end
20
87
 
21
88
  private
22
- def dna() JSON.parse(dna_json.read) end
23
- def redis() YAML::load_file(redis_yml) end
89
+
90
+ def self.secret_hack
91
+ @secret_hack = true
92
+ end
93
+
94
+ def self.pwned?
95
+ @secret_hack
96
+ end
97
+
98
+ def meta_data(key)
99
+ open("http://169.254.169.254/latest/meta-data/#{key}").read
100
+ end
101
+
102
+ def pathname(path) path && Pathname.new(path) end
103
+
104
+ def ensure_exists(path)
105
+ path.mkpath unless self.class.pwned?
106
+ path
107
+ end
108
+
109
+ def method_missing(meth, *args)
110
+ meth_s = meth.to_s
111
+ if respond_to?(meth_s)
112
+ @data[meth_s] || raise(RequiredSetting.new(path, meth_s))
113
+ else
114
+ super
115
+ end
116
+ end
24
117
  end
25
118
  end
26
119
  end
@@ -1,7 +1,11 @@
1
1
  module EY
2
2
  module Stonith
3
3
  class Data < Struct.new(:hostname, :instance_id, :ip)
4
- def key() instance_id end
4
+ alias_method :key, :instance_id
5
+
6
+ def to_s
7
+ "[#{hostname.inspect}, #{instance_id.inspect}, #{ip.inspect}]"
8
+ end
5
9
  end
6
10
  end
7
11
  end
@@ -23,9 +23,10 @@ module EY
23
23
  end
24
24
 
25
25
  def set(data)
26
- unless get
26
+ unless get # very small race condition (very)
27
27
  redis.lpush(master_key, Marshal.dump(data))
28
28
  @locked = false
29
+ true
29
30
  end
30
31
  end
31
32
 
@@ -34,22 +35,43 @@ module EY
34
35
  end
35
36
 
36
37
  private
37
-
38
+
38
39
  def get
39
40
  result = redis.lindex(master_key, 0) # index 0
40
41
  result && Marshal.load(result)
41
42
  end
42
43
 
44
+ # popping & locking
43
45
  def locked_get
44
- Marshal.load redis.blpop(master_key, 0).last # don't timeout (this number gets passed to the actual redis command)
46
+ Marshal.load redis.blpop(master_key, @config.redis_timeout).last
45
47
  end
46
48
 
47
49
  def master_key
48
- @config.redis_key
50
+ "#{@config.redis_key}:master"
49
51
  end
50
-
52
+
51
53
  def redis
52
- @redis ||= Redis.new(:host => @config.redis_host, :port => @config.redis_port, :db => @config.redis_db, :timeout => 0)
54
+ @redis ||= load_redis
55
+ end
56
+
57
+ def load_redis
58
+ redis = Redis.new(:host => @config.redis_host, :port => @config.redis_port, :db => @config.redis_db, :timeout => @config.redis_timeout)
59
+ check_redis_version redis.info[:redis_version]
60
+ redis
61
+ rescue Errno::ECONNREFUSED
62
+ abort "Unable to connect to redis"
63
+ end
64
+
65
+ def check_redis_version(version)
66
+ major, minor, patch = version.split('.').map { |num| num.to_i }
67
+ unless major > 1 ||
68
+ (major == 1 && minor > 3) ||
69
+ (major == 1 && minor == 3 && patch >= 1)
70
+ abort <<-ERROR
71
+ Redis server version [#{version}] is too old.
72
+ >= 1.3.1 required for blpop support.
73
+ ERROR
74
+ end
53
75
  end
54
76
  end
55
77
  end
@@ -30,7 +30,7 @@ module EY
30
30
  end
31
31
 
32
32
  def last() read.last end
33
- def read() @path.read.to_s.split(SEPARATOR) end
33
+ def read() @path.read.to_s.strip.split(SEPARATOR) end
34
34
  end
35
35
  end
36
36
  end
data/lib/ey_stonith.rb CHANGED
@@ -11,6 +11,7 @@ module EY
11
11
  autoload :AddressStealer, 'ey_stonith/address_stealer'
12
12
  autoload :Box, 'ey_stonith/box'
13
13
  autoload :CheckRecorder, 'ey_stonith/check_recorder'
14
+ autoload :Commands, 'ey_stonith/commands'
14
15
  autoload :CLI, 'ey_stonith/cli'
15
16
  autoload :Config, 'ey_stonith/config'
16
17
  autoload :Data, 'ey_stonith/data'
@@ -21,15 +22,8 @@ module EY
21
22
  autoload :MetaData, 'ey_stonith/meta_data'
22
23
  autoload :Slave, 'ey_stonith/slave'
23
24
 
24
- def self.logger=(logger) @@logger = logger end
25
+ def self.log_to(io) @@logger = Logger.new(io) end
25
26
  def self.logger() @@logger end
26
- self.logger = Logger.new(STDOUT)
27
- logger.level = Logger::INFO
28
-
29
- def self.meta_data() @@meta_data end
30
- def self.meta_data=(meta) @@meta_data = meta end
31
- def self.reset_meta_data() @@meta_data = MetaData end
32
- reset_meta_data
33
27
  end
34
28
  end
35
29