firenxis-god 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (123) hide show
  1. data/Announce.txt +135 -0
  2. data/History.txt +393 -0
  3. data/README.txt +59 -0
  4. data/Rakefile +142 -0
  5. data/bin/god +132 -0
  6. data/ext/god/.gitignore +5 -0
  7. data/ext/god/extconf.rb +55 -0
  8. data/ext/god/kqueue_handler.c +125 -0
  9. data/ext/god/netlink_handler.c +168 -0
  10. data/god.gemspec +164 -0
  11. data/lib/god.rb +701 -0
  12. data/lib/god/behavior.rb +52 -0
  13. data/lib/god/behaviors/clean_pid_file.rb +21 -0
  14. data/lib/god/behaviors/clean_unix_socket.rb +21 -0
  15. data/lib/god/behaviors/notify_when_flapping.rb +51 -0
  16. data/lib/god/cli/command.rb +256 -0
  17. data/lib/god/cli/run.rb +172 -0
  18. data/lib/god/cli/version.rb +23 -0
  19. data/lib/god/compat19.rb +36 -0
  20. data/lib/god/condition.rb +96 -0
  21. data/lib/god/conditions/always.rb +23 -0
  22. data/lib/god/conditions/complex.rb +86 -0
  23. data/lib/god/conditions/cpu_usage.rb +80 -0
  24. data/lib/god/conditions/degrading_lambda.rb +52 -0
  25. data/lib/god/conditions/disk_usage.rb +32 -0
  26. data/lib/god/conditions/file_mtime.rb +28 -0
  27. data/lib/god/conditions/flapping.rb +128 -0
  28. data/lib/god/conditions/http_response_code.rb +168 -0
  29. data/lib/god/conditions/lambda.rb +25 -0
  30. data/lib/god/conditions/memory_usage.rb +82 -0
  31. data/lib/god/conditions/process_exits.rb +72 -0
  32. data/lib/god/conditions/process_running.rb +74 -0
  33. data/lib/god/conditions/tries.rb +44 -0
  34. data/lib/god/configurable.rb +57 -0
  35. data/lib/god/contact.rb +114 -0
  36. data/lib/god/contacts/campfire.rb +121 -0
  37. data/lib/god/contacts/email.rb +136 -0
  38. data/lib/god/contacts/jabber.rb +75 -0
  39. data/lib/god/contacts/prowl.rb +57 -0
  40. data/lib/god/contacts/scout.rb +55 -0
  41. data/lib/god/contacts/twitter.rb +51 -0
  42. data/lib/god/contacts/webhook.rb +73 -0
  43. data/lib/god/dependency_graph.rb +41 -0
  44. data/lib/god/diagnostics.rb +37 -0
  45. data/lib/god/driver.rb +206 -0
  46. data/lib/god/errors.rb +24 -0
  47. data/lib/god/event_handler.rb +108 -0
  48. data/lib/god/event_handlers/dummy_handler.rb +13 -0
  49. data/lib/god/event_handlers/kqueue_handler.rb +17 -0
  50. data/lib/god/event_handlers/netlink_handler.rb +13 -0
  51. data/lib/god/logger.rb +109 -0
  52. data/lib/god/metric.rb +59 -0
  53. data/lib/god/process.rb +363 -0
  54. data/lib/god/registry.rb +32 -0
  55. data/lib/god/simple_logger.rb +59 -0
  56. data/lib/god/socket.rb +107 -0
  57. data/lib/god/sugar.rb +47 -0
  58. data/lib/god/sys_logger.rb +45 -0
  59. data/lib/god/system/portable_poller.rb +42 -0
  60. data/lib/god/system/process.rb +50 -0
  61. data/lib/god/system/slash_proc_poller.rb +92 -0
  62. data/lib/god/task.rb +503 -0
  63. data/lib/god/timeline.rb +25 -0
  64. data/lib/god/trigger.rb +43 -0
  65. data/lib/god/watch.rb +188 -0
  66. data/test/configs/child_events/child_events.god +44 -0
  67. data/test/configs/child_events/simple_server.rb +3 -0
  68. data/test/configs/child_polls/child_polls.god +37 -0
  69. data/test/configs/child_polls/simple_server.rb +12 -0
  70. data/test/configs/complex/complex.god +59 -0
  71. data/test/configs/complex/simple_server.rb +3 -0
  72. data/test/configs/contact/contact.god +108 -0
  73. data/test/configs/contact/simple_server.rb +3 -0
  74. data/test/configs/daemon_events/daemon_events.god +37 -0
  75. data/test/configs/daemon_events/simple_server.rb +8 -0
  76. data/test/configs/daemon_events/simple_server_stop.rb +11 -0
  77. data/test/configs/daemon_polls/daemon_polls.god +17 -0
  78. data/test/configs/daemon_polls/simple_server.rb +6 -0
  79. data/test/configs/degrading_lambda/degrading_lambda.god +31 -0
  80. data/test/configs/degrading_lambda/tcp_server.rb +15 -0
  81. data/test/configs/lifecycle/lifecycle.god +25 -0
  82. data/test/configs/matias/matias.god +50 -0
  83. data/test/configs/real.rb +59 -0
  84. data/test/configs/running_load/running_load.god +16 -0
  85. data/test/configs/stop_options/simple_server.rb +12 -0
  86. data/test/configs/stop_options/stop_options.god +39 -0
  87. data/test/configs/stress/simple_server.rb +3 -0
  88. data/test/configs/stress/stress.god +15 -0
  89. data/test/configs/task/logs/.placeholder +0 -0
  90. data/test/configs/task/task.god +26 -0
  91. data/test/configs/test.rb +61 -0
  92. data/test/helper.rb +141 -0
  93. data/test/suite.rb +6 -0
  94. data/test/test_behavior.rb +18 -0
  95. data/test/test_campfire.rb +23 -0
  96. data/test/test_condition.rb +50 -0
  97. data/test/test_conditions_disk_usage.rb +50 -0
  98. data/test/test_conditions_http_response_code.rb +109 -0
  99. data/test/test_conditions_process_running.rb +40 -0
  100. data/test/test_conditions_tries.rb +67 -0
  101. data/test/test_contact.rb +109 -0
  102. data/test/test_dependency_graph.rb +62 -0
  103. data/test/test_driver.rb +11 -0
  104. data/test/test_email.rb +34 -0
  105. data/test/test_event_handler.rb +80 -0
  106. data/test/test_god.rb +570 -0
  107. data/test/test_handlers_kqueue_handler.rb +16 -0
  108. data/test/test_jabber.rb +29 -0
  109. data/test/test_logger.rb +55 -0
  110. data/test/test_metric.rb +72 -0
  111. data/test/test_process.rb +247 -0
  112. data/test/test_prowl.rb +15 -0
  113. data/test/test_registry.rb +15 -0
  114. data/test/test_socket.rb +34 -0
  115. data/test/test_sugar.rb +42 -0
  116. data/test/test_system_portable_poller.rb +17 -0
  117. data/test/test_system_process.rb +30 -0
  118. data/test/test_task.rb +246 -0
  119. data/test/test_timeline.rb +37 -0
  120. data/test/test_trigger.rb +59 -0
  121. data/test/test_watch.rb +279 -0
  122. data/test/test_webhook.rb +15 -0
  123. metadata +362 -0
@@ -0,0 +1,128 @@
1
+ module God
2
+ module Conditions
3
+
4
+ # Condition Symbol :flapping
5
+ # Type: Trigger
6
+ #
7
+ # Trigger when a Task transitions to or from a state or states a given number
8
+ # of times within a given period.
9
+ #
10
+ # Parameters
11
+ # Required
12
+ # +times+ is the number of times that the Task must transition before
13
+ # triggering.
14
+ # +within+ is the number of seconds within which the Task must transition
15
+ # the specified number of times before triggering. You may use
16
+ # the sugar methods #seconds, #minutes, #hours, #days to clarify
17
+ # your code (see examples).
18
+ # --one or both of--
19
+ # +from_state+ is the state (as a Symbol) from which the transition must occur.
20
+ # +to_state+ is the state (as a Symbol) to which the transition must occur.
21
+ #
22
+ # Optional:
23
+ # +retry_in+ is the number of seconds after which to re-monitor the Task after
24
+ # it has been disabled by the condition.
25
+ # +retry_times+ is the number of times after which to permanently unmonitor
26
+ # the Task.
27
+ # +retry_within+ is the number of seconds within which +retry_times+ retries must occur before the condition gives up on the Task.
28
+ #
29
+ # Examples
30
+ #
31
+ # Trigger if
32
+ class Flapping < TriggerCondition
33
+ attr_accessor :times,
34
+ :within,
35
+ :from_state,
36
+ :to_state,
37
+ :retry_in,
38
+ :retry_times,
39
+ :retry_within
40
+
41
+ def initialize
42
+ self.info = "process is flapping"
43
+ end
44
+
45
+ def prepare
46
+ @timeline = Timeline.new(self.times)
47
+ @retry_timeline = Timeline.new(self.retry_times)
48
+ end
49
+
50
+ def valid?
51
+ valid = true
52
+ valid &= complain("Attribute 'times' must be specified", self) if self.times.nil?
53
+ valid &= complain("Attribute 'within' must be specified", self) if self.within.nil?
54
+ valid &= complain("Attributes 'from_state', 'to_state', or both must be specified", self) if self.from_state.nil? && self.to_state.nil?
55
+ valid
56
+ end
57
+
58
+ def process(event, payload)
59
+ begin
60
+ if event == :state_change
61
+ event_from_state, event_to_state = *payload
62
+
63
+ from_state_match = !self.from_state || self.from_state && Array(self.from_state).include?(event_from_state)
64
+ to_state_match = !self.to_state || self.to_state && Array(self.to_state).include?(event_to_state)
65
+
66
+ if from_state_match && to_state_match
67
+ @timeline << Time.now
68
+
69
+ concensus = (@timeline.size == self.times)
70
+ duration = (@timeline.last - @timeline.first) < self.within
71
+
72
+ if concensus && duration
73
+ @timeline.clear
74
+ trigger
75
+ retry_mechanism
76
+ end
77
+ end
78
+ end
79
+ rescue => e
80
+ puts e.message
81
+ puts e.backtrace.join("\n")
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def retry_mechanism
88
+ if self.retry_in
89
+ @retry_timeline << Time.now
90
+
91
+ concensus = (@retry_timeline.size == self.retry_times)
92
+ duration = (@retry_timeline.last - @retry_timeline.first) < self.retry_within
93
+
94
+ if concensus && duration
95
+ # give up
96
+ Thread.new do
97
+ sleep 1
98
+
99
+ # log
100
+ msg = "#{self.watch.name} giving up"
101
+ applog(self.watch, :info, msg)
102
+ end
103
+ else
104
+ # try again later
105
+ Thread.new do
106
+ sleep 1
107
+
108
+ # log
109
+ msg = "#{self.watch.name} auto-reenable monitoring in #{self.retry_in} seconds"
110
+ applog(self.watch, :info, msg)
111
+
112
+ sleep self.retry_in
113
+
114
+ # log
115
+ msg = "#{self.watch.name} auto-reenabling monitoring"
116
+ applog(self.watch, :info, msg)
117
+
118
+ if self.watch.state == :unmonitored
119
+ self.watch.monitor
120
+ end
121
+ end
122
+ end
123
+ end
124
+ end
125
+ end
126
+
127
+ end
128
+ end
@@ -0,0 +1,168 @@
1
+ require 'net/http'
2
+
3
+ module God
4
+ module Conditions
5
+
6
+ # Condition Symbol :http_response_code
7
+ # Type: Poll
8
+ #
9
+ # Trigger based on the response from an HTTP request.
10
+ #
11
+ # Parameters
12
+ # Required
13
+ # +host+ is the hostname to connect [required]
14
+ # --one of code_is or code_is_not--
15
+ # +code_is+ trigger if the response code IS one of these
16
+ # e.g. 500 or '500' or [404, 500] or %w{404 500}
17
+ # +code_is_not+ trigger if the response code IS NOT one of these
18
+ # e.g. 200 or '200' or [200, 302] or %w{200 302}
19
+ # Optional
20
+ # +port+ is the port to connect (default 80)
21
+ # +path+ is the path to connect (default '/')
22
+ # +headers+ is the hash of HTTP headers to send (default none)
23
+ # +times+ is the number of times after which to trigger (default 1)
24
+ # e.g. 3 (times in a row) or [3, 5] (three out of five times)
25
+ # +timeout+ is the time to wait for a connection (default 60.seconds)
26
+ #
27
+ # Examples
28
+ #
29
+ # Trigger if the response code from www.example.com/foo/bar
30
+ # is not a 200 (or if the connection is refused or times out):
31
+ #
32
+ # on.condition(:http_response_code) do |c|
33
+ # c.host = 'www.example.com'
34
+ # c.path = '/foo/bar'
35
+ # c.code_is_not = 200
36
+ # end
37
+ #
38
+ # Trigger if the response code is a 404 or a 500 (will not
39
+ # be triggered by a connection refusal or timeout):
40
+ #
41
+ # on.condition(:http_response_code) do |c|
42
+ # c.host = 'www.example.com'
43
+ # c.path = '/foo/bar'
44
+ # c.code_is = [404, 500]
45
+ # end
46
+ #
47
+ # Trigger if the response code is not a 200 five times in a row:
48
+ #
49
+ # on.condition(:http_response_code) do |c|
50
+ # c.host = 'www.example.com'
51
+ # c.path = '/foo/bar'
52
+ # c.code_is_not = 200
53
+ # c.times = 5
54
+ # end
55
+ #
56
+ # Trigger if the response code is not a 200 or does not respond
57
+ # within 10 seconds:
58
+ #
59
+ # on.condition(:http_response_code) do |c|
60
+ # c.host = 'www.example.com'
61
+ # c.path = '/foo/bar'
62
+ # c.code_is_not = 200
63
+ # c.timeout = 10
64
+ # end
65
+ class HttpResponseCode < PollCondition
66
+ attr_accessor :code_is, # e.g. 500 or '500' or [404, 500] or %w{404 500}
67
+ :code_is_not, # e.g. 200 or '200' or [200, 302] or %w{200 302}
68
+ :times, # e.g. 3 or [3, 5]
69
+ :host, # e.g. www.example.com
70
+ :port, # e.g. 8080
71
+ :timeout, # e.g. 60.seconds
72
+ :path, # e.g. '/'
73
+ :headers # e.g. {'Host' => 'myvirtual.mydomain.com'}
74
+
75
+ def initialize
76
+ super
77
+ self.port = 80
78
+ self.path = '/'
79
+ self.headers = {}
80
+ self.times = [1, 1]
81
+ self.timeout = 60.seconds
82
+ end
83
+
84
+ def prepare
85
+ self.code_is = Array(self.code_is).map { |x| x.to_i } if self.code_is
86
+ self.code_is_not = Array(self.code_is_not).map { |x| x.to_i } if self.code_is_not
87
+
88
+ if self.times.kind_of?(Integer)
89
+ self.times = [self.times, self.times]
90
+ end
91
+
92
+ @timeline = Timeline.new(self.times[1])
93
+ @history = Timeline.new(self.times[1])
94
+ end
95
+
96
+ def reset
97
+ @timeline.clear
98
+ @history.clear
99
+ end
100
+
101
+ def valid?
102
+ valid = true
103
+ valid &= complain("Attribute 'host' must be specified", self) if self.host.nil?
104
+ valid &= complain("One (and only one) of attributes 'code_is' and 'code_is_not' must be specified", self) if
105
+ (self.code_is.nil? && self.code_is_not.nil?) || (self.code_is && self.code_is_not)
106
+ valid
107
+ end
108
+
109
+ def test
110
+ response = nil
111
+
112
+ Net::HTTP.start(self.host, self.port) do |http|
113
+ http.read_timeout = self.timeout
114
+ response = http.get(self.path, self.headers)
115
+ end
116
+
117
+ actual_response_code = response.code.to_i
118
+ if self.code_is && self.code_is.include?(actual_response_code)
119
+ pass(actual_response_code)
120
+ elsif self.code_is_not && !self.code_is_not.include?(actual_response_code)
121
+ pass(actual_response_code)
122
+ else
123
+ fail(actual_response_code)
124
+ end
125
+ rescue Errno::ECONNREFUSED
126
+ self.code_is ? fail('Refused') : pass('Refused')
127
+ rescue Errno::ECONNRESET
128
+ self.code_is ? fail('Reset') : pass('Reset')
129
+ rescue EOFError
130
+ self.code_is ? fail('EOF') : pass('EOF')
131
+ rescue Timeout::Error
132
+ self.code_is ? fail('Timeout') : pass('Timeout')
133
+ rescue Errno::ETIMEDOUT
134
+ self.code_is ? fail('Timedout') : pass('Timedout')
135
+ rescue Exception => failure
136
+ self.code_is ? fail(failure.class.name) : pass(failure.class.name)
137
+ end
138
+
139
+ private
140
+
141
+ def pass(code)
142
+ @timeline << true
143
+ if @timeline.select { |x| x }.size >= self.times.first
144
+ self.info = "http response abnormal #{history(code, true)}"
145
+ true
146
+ else
147
+ self.info = "http response nominal #{history(code, true)}"
148
+ false
149
+ end
150
+ end
151
+
152
+ def fail(code)
153
+ @timeline << false
154
+ self.info = "http response nominal #{history(code, false)}"
155
+ false
156
+ end
157
+
158
+ def history(code, passed)
159
+ entry = code.to_s.dup
160
+ entry = '*' + entry if passed
161
+ @history << entry
162
+ '[' + @history.join(", ") + ']'
163
+ end
164
+
165
+ end
166
+
167
+ end
168
+ end
@@ -0,0 +1,25 @@
1
+ module God
2
+ module Conditions
3
+
4
+ class Lambda < PollCondition
5
+ attr_accessor :lambda
6
+
7
+ def valid?
8
+ valid = true
9
+ valid &= complain("Attribute 'lambda' must be specified", self) if self.lambda.nil?
10
+ valid
11
+ end
12
+
13
+ def test
14
+ if self.lambda.call()
15
+ self.info = "lambda condition was satisfied"
16
+ true
17
+ else
18
+ self.info = "lambda condition was not satisfied"
19
+ false
20
+ end
21
+ end
22
+ end
23
+
24
+ end
25
+ end
@@ -0,0 +1,82 @@
1
+ module God
2
+ module Conditions
3
+
4
+ # Condition Symbol :memory_usage
5
+ # Type: Poll
6
+ #
7
+ # Trigger when the resident memory of a process is above a specified limit.
8
+ #
9
+ # Parameters
10
+ # Required
11
+ # +pid_file+ is the pid file of the process in question. Automatically
12
+ # populated for Watches.
13
+ # +above+ is the amount of resident memory (in kilobytes) above which
14
+ # the condition should trigger. You can also use the sugar
15
+ # methods #kilobytes, #megabytes, and #gigabytes to clarify
16
+ # this amount (see examples).
17
+ #
18
+ # Examples
19
+ #
20
+ # Trigger if the process is using more than 100 megabytes of resident
21
+ # memory (from a Watch):
22
+ #
23
+ # on.condition(:memory_usage) do |c|
24
+ # c.above = 100.megabytes
25
+ # end
26
+ #
27
+ # Non-Watch Tasks must specify a PID file:
28
+ #
29
+ # on.condition(:memory_usage) do |c|
30
+ # c.above = 100.megabytes
31
+ # c.pid_file = "/var/run/mongrel.3000.pid"
32
+ # end
33
+ class MemoryUsage < PollCondition
34
+ attr_accessor :above, :times, :pid_file
35
+
36
+ def initialize
37
+ super
38
+ self.above = nil
39
+ self.times = [1, 1]
40
+ end
41
+
42
+ def prepare
43
+ if self.times.kind_of?(Integer)
44
+ self.times = [self.times, self.times]
45
+ end
46
+
47
+ @timeline = Timeline.new(self.times[1])
48
+ end
49
+
50
+ def reset
51
+ @timeline.clear
52
+ end
53
+
54
+ def pid
55
+ self.pid_file ? File.read(self.pid_file).strip.to_i : self.watch.pid
56
+ end
57
+
58
+ def valid?
59
+ valid = true
60
+ valid &= complain("Attribute 'pid_file' must be specified", self) if self.pid_file.nil? && self.watch.pid_file.nil?
61
+ valid &= complain("Attribute 'above' must be specified", self) if self.above.nil?
62
+ valid
63
+ end
64
+
65
+ def test
66
+ process = System::Process.new(self.pid)
67
+ @timeline.push(process.memory)
68
+
69
+ history = "[" + @timeline.map { |x| "#{x > self.above ? '*' : ''}#{x}kb" }.join(", ") + "]"
70
+
71
+ if @timeline.select { |x| x > self.above }.size >= self.times.first
72
+ self.info = "memory out of bounds #{history}"
73
+ return true
74
+ else
75
+ self.info = "memory within bounds #{history}"
76
+ return false
77
+ end
78
+ end
79
+ end
80
+
81
+ end
82
+ end
@@ -0,0 +1,72 @@
1
+ module God
2
+ module Conditions
3
+
4
+ # Condition Symbol :process_exits
5
+ # Type: Event
6
+ #
7
+ # Trigger when a process exits.
8
+ #
9
+ # Parameters
10
+ # Required
11
+ # +pid_file+ is the pid file of the process in question. Automatically
12
+ # populated for Watches.
13
+ #
14
+ # Examples
15
+ #
16
+ # Trigger if process exits (from a Watch):
17
+ #
18
+ # on.condition(:process_exits)
19
+ #
20
+ # Trigger if process exits:
21
+ #
22
+ # on.condition(:process_exits) do |c|
23
+ # c.pid_file = "/var/run/mongrel.3000.pid"
24
+ # end
25
+ class ProcessExits < EventCondition
26
+ attr_accessor :pid_file
27
+
28
+ def initialize
29
+ self.info = "process exited"
30
+ end
31
+
32
+ def valid?
33
+ true
34
+ end
35
+
36
+ def pid
37
+ self.pid_file ? File.read(self.pid_file).strip.to_i : self.watch.pid
38
+ end
39
+
40
+ def register
41
+ pid = self.pid
42
+
43
+ begin
44
+ EventHandler.register(pid, :proc_exit) do |extra|
45
+ formatted_extra = extra.size > 0 ? " #{extra.inspect}" : ""
46
+ self.info = "process #{pid} exited#{formatted_extra}"
47
+ self.watch.trigger(self)
48
+ end
49
+
50
+ msg = "#{self.watch.name} registered 'proc_exit' event for pid #{pid}"
51
+ applog(self.watch, :info, msg)
52
+ rescue StandardError
53
+ raise EventRegistrationFailedError.new
54
+ end
55
+ end
56
+
57
+ def deregister
58
+ pid = self.pid
59
+ if pid
60
+ EventHandler.deregister(pid, :proc_exit)
61
+
62
+ msg = "#{self.watch.name} deregistered 'proc_exit' event for pid #{pid}"
63
+ applog(self.watch, :info, msg)
64
+ else
65
+ pid_file_location = self.pid_file || self.watch.pid_file
66
+ applog(self.watch, :error, "#{self.watch.name} could not deregister: no cached PID or PID file #{pid_file_location} (#{self.base_name})")
67
+ end
68
+ end
69
+ end
70
+
71
+ end
72
+ end