mcproc 2016.2.20

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. checksums.yaml +7 -0
  2. data/Announce.txt +135 -0
  3. data/Gemfile +9 -0
  4. data/History.txt +469 -0
  5. data/LICENSE +22 -0
  6. data/README.md +37 -0
  7. data/Rakefile +185 -0
  8. data/TODO.md +37 -0
  9. data/bin/mcproc +134 -0
  10. data/doc/intro.asciidoc +20 -0
  11. data/doc/mcproc.asciidoc +1592 -0
  12. data/ext/god/.gitignore +5 -0
  13. data/ext/god/extconf.rb +56 -0
  14. data/ext/god/kqueue_handler.c +133 -0
  15. data/ext/god/netlink_handler.c +182 -0
  16. data/lib/god.rb +780 -0
  17. data/lib/god/behavior.rb +52 -0
  18. data/lib/god/behaviors/clean_pid_file.rb +21 -0
  19. data/lib/god/behaviors/clean_unix_socket.rb +21 -0
  20. data/lib/god/behaviors/notify_when_flapping.rb +51 -0
  21. data/lib/god/cli/command.rb +268 -0
  22. data/lib/god/cli/run.rb +170 -0
  23. data/lib/god/cli/version.rb +23 -0
  24. data/lib/god/compat19.rb +33 -0
  25. data/lib/god/condition.rb +96 -0
  26. data/lib/god/conditions/always.rb +36 -0
  27. data/lib/god/conditions/complex.rb +86 -0
  28. data/lib/god/conditions/cpu_usage.rb +80 -0
  29. data/lib/god/conditions/degrading_lambda.rb +52 -0
  30. data/lib/god/conditions/disk_usage.rb +32 -0
  31. data/lib/god/conditions/file_mtime.rb +28 -0
  32. data/lib/god/conditions/file_touched.rb +44 -0
  33. data/lib/god/conditions/flapping.rb +128 -0
  34. data/lib/god/conditions/http_response_code.rb +184 -0
  35. data/lib/god/conditions/lambda.rb +25 -0
  36. data/lib/god/conditions/memory_usage.rb +82 -0
  37. data/lib/god/conditions/process_exits.rb +66 -0
  38. data/lib/god/conditions/process_running.rb +63 -0
  39. data/lib/god/conditions/socket_responding.rb +142 -0
  40. data/lib/god/conditions/tries.rb +44 -0
  41. data/lib/god/configurable.rb +57 -0
  42. data/lib/god/contact.rb +114 -0
  43. data/lib/god/contacts/airbrake.rb +44 -0
  44. data/lib/god/contacts/campfire.rb +121 -0
  45. data/lib/god/contacts/email.rb +130 -0
  46. data/lib/god/contacts/hipchat.rb +117 -0
  47. data/lib/god/contacts/jabber.rb +75 -0
  48. data/lib/god/contacts/prowl.rb +57 -0
  49. data/lib/god/contacts/scout.rb +55 -0
  50. data/lib/god/contacts/sensu.rb +59 -0
  51. data/lib/god/contacts/slack.rb +98 -0
  52. data/lib/god/contacts/statsd.rb +46 -0
  53. data/lib/god/contacts/twitter.rb +51 -0
  54. data/lib/god/contacts/webhook.rb +74 -0
  55. data/lib/god/driver.rb +238 -0
  56. data/lib/god/errors.rb +24 -0
  57. data/lib/god/event_handler.rb +112 -0
  58. data/lib/god/event_handlers/dummy_handler.rb +13 -0
  59. data/lib/god/event_handlers/kqueue_handler.rb +17 -0
  60. data/lib/god/event_handlers/netlink_handler.rb +13 -0
  61. data/lib/god/logger.rb +109 -0
  62. data/lib/god/metric.rb +87 -0
  63. data/lib/god/process.rb +381 -0
  64. data/lib/god/registry.rb +32 -0
  65. data/lib/god/simple_logger.rb +59 -0
  66. data/lib/god/socket.rb +113 -0
  67. data/lib/god/sugar.rb +62 -0
  68. data/lib/god/sys_logger.rb +45 -0
  69. data/lib/god/system/portable_poller.rb +42 -0
  70. data/lib/god/system/process.rb +50 -0
  71. data/lib/god/system/slash_proc_poller.rb +92 -0
  72. data/lib/god/task.rb +552 -0
  73. data/lib/god/timeline.rb +25 -0
  74. data/lib/god/trigger.rb +43 -0
  75. data/lib/god/watch.rb +340 -0
  76. data/mcproc.gemspec +192 -0
  77. data/test/configs/child_events/child_events.god +44 -0
  78. data/test/configs/child_events/simple_server.rb +3 -0
  79. data/test/configs/child_polls/child_polls.god +37 -0
  80. data/test/configs/child_polls/simple_server.rb +12 -0
  81. data/test/configs/complex/complex.god +59 -0
  82. data/test/configs/complex/simple_server.rb +3 -0
  83. data/test/configs/contact/contact.god +118 -0
  84. data/test/configs/contact/simple_server.rb +3 -0
  85. data/test/configs/daemon_events/daemon_events.god +37 -0
  86. data/test/configs/daemon_events/simple_server.rb +8 -0
  87. data/test/configs/daemon_events/simple_server_stop.rb +11 -0
  88. data/test/configs/daemon_polls/daemon_polls.god +17 -0
  89. data/test/configs/daemon_polls/simple_server.rb +6 -0
  90. data/test/configs/degrading_lambda/degrading_lambda.god +31 -0
  91. data/test/configs/degrading_lambda/tcp_server.rb +15 -0
  92. data/test/configs/keepalive/keepalive.god +9 -0
  93. data/test/configs/keepalive/keepalive.rb +12 -0
  94. data/test/configs/lifecycle/lifecycle.god +25 -0
  95. data/test/configs/matias/matias.god +50 -0
  96. data/test/configs/real.rb +59 -0
  97. data/test/configs/running_load/running_load.god +16 -0
  98. data/test/configs/stop_options/simple_server.rb +12 -0
  99. data/test/configs/stop_options/stop_options.god +39 -0
  100. data/test/configs/stress/simple_server.rb +3 -0
  101. data/test/configs/stress/stress.god +15 -0
  102. data/test/configs/task/logs/.placeholder +0 -0
  103. data/test/configs/task/task.god +26 -0
  104. data/test/configs/test.rb +61 -0
  105. data/test/configs/usr1_trapper.rb +10 -0
  106. data/test/helper.rb +172 -0
  107. data/test/suite.rb +6 -0
  108. data/test/test_airbrake.rb +14 -0
  109. data/test/test_behavior.rb +18 -0
  110. data/test/test_campfire.rb +22 -0
  111. data/test/test_condition.rb +52 -0
  112. data/test/test_conditions_disk_usage.rb +50 -0
  113. data/test/test_conditions_http_response_code.rb +109 -0
  114. data/test/test_conditions_process_running.rb +40 -0
  115. data/test/test_conditions_socket_responding.rb +176 -0
  116. data/test/test_conditions_tries.rb +67 -0
  117. data/test/test_contact.rb +109 -0
  118. data/test/test_driver.rb +26 -0
  119. data/test/test_email.rb +34 -0
  120. data/test/test_event_handler.rb +82 -0
  121. data/test/test_god.rb +710 -0
  122. data/test/test_god_system.rb +201 -0
  123. data/test/test_handlers_kqueue_handler.rb +16 -0
  124. data/test/test_hipchat.rb +23 -0
  125. data/test/test_jabber.rb +29 -0
  126. data/test/test_logger.rb +55 -0
  127. data/test/test_metric.rb +74 -0
  128. data/test/test_process.rb +263 -0
  129. data/test/test_prowl.rb +15 -0
  130. data/test/test_registry.rb +15 -0
  131. data/test/test_sensu.rb +11 -0
  132. data/test/test_slack.rb +57 -0
  133. data/test/test_socket.rb +34 -0
  134. data/test/test_statsd.rb +22 -0
  135. data/test/test_sugar.rb +42 -0
  136. data/test/test_system_portable_poller.rb +17 -0
  137. data/test/test_system_process.rb +30 -0
  138. data/test/test_task.rb +246 -0
  139. data/test/test_timeline.rb +37 -0
  140. data/test/test_trigger.rb +63 -0
  141. data/test/test_watch.rb +286 -0
  142. data/test/test_webhook.rb +22 -0
  143. metadata +475 -0
@@ -0,0 +1,28 @@
1
+ module God
2
+ module Conditions
3
+
4
# Poll condition that compares how long ago a file was last modified
# against a configured limit.
#
# Attributes
#   +path+    is the file whose modification time is inspected.
#   +max_age+ is the limit compared against the time elapsed since the
#             file was last modified (Time subtraction, i.e. seconds).
class FileMtime < PollCondition
  attr_accessor :path, :max_age

  # Both attributes start out unset; valid? reports them if they stay so.
  def initialize
    super
    self.path = nil
    self.max_age = nil
  end

  # Checks that the required attributes were configured.
  #
  # Every missing attribute is reported through +complain+ (no
  # short-circuit), and the results are combined.
  #
  # Returns true when nothing had to be reported.
  def valid?
    reports = []
    reports << complain("Attribute 'path' must be specified", self) if self.path.nil?
    reports << complain("Attribute 'max_age' must be specified", self) if self.max_age.nil?
    reports.all?
  end

  # Returns true when the time elapsed since +path+ was last modified
  # is greater than +max_age+.
  def test
    elapsed = Time.now - File.mtime(self.path)
    elapsed > self.max_age
  end
end
24
+
25
+ end
26
+ end
27
+
28
+
@@ -0,0 +1,44 @@
1
+ module God
2
+ module Conditions
3
+
4
# Condition Symbol :file_touched
# Type: Poll
#
# Trigger when a specified file is touched.
#
# Parameters
#   Required
#     +path+ is the path to the file to watch.
#
# Examples
#
# Trigger if 'tmp/restart.txt' file is touched (from a Watch):
#
#   on.condition(:file_touched) do |c|
#     c.path = 'tmp/restart.txt'
#   end
#
class FileTouched < PollCondition
  attr_accessor :path

  # +path+ starts out unset; valid? reports it if it stays so.
  def initialize
    super
    self.path = nil
  end

  # Checks that +path+ was configured, reporting it through +complain+
  # when it was not.
  #
  # Returns true when nothing had to be reported.
  def valid?
    valid = true
    valid &= complain("Attribute 'path' must be specified", self) if self.path.nil?
    valid
  end

  # Returns true when +path+ exists and was modified no more than
  # +interval+ seconds ago, false otherwise (including when the file
  # does not exist).
  #
  # NOTE(review): +interval+ is not defined in this class; presumably it
  # is the poll interval inherited from PollCondition - confirm.
  def test
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    return false unless File.exist?(self.path)

    (Time.now - File.mtime(self.path)) <= self.interval
  rescue Errno::ENOENT
    # The file disappeared between the existence check and File.mtime.
    false
  end
end
43
+ end
44
+ end
@@ -0,0 +1,128 @@
1
+ module God
2
+ module Conditions
3
+
4
# Condition Symbol :flapping
# Type: Trigger
#
# Trigger when a Task transitions to or from a state or states a given number
# of times within a given period.
#
# Parameters
#   Required
#     +times+ is the number of times that the Task must transition before
#             triggering.
#     +within+ is the number of seconds within which the Task must transition
#              the specified number of times before triggering. You may use
#              the sugar methods #seconds, #minutes, #hours, #days to clarify
#              your code (see examples).
#     --one or both of--
#     +from_state+ is the state (as a Symbol) from which the transition must occur.
#     +to_state+ is the state (as a Symbol) to which the transition must occur.
#     Either may also be an Array of states, any of which matches.
#
#   Optional
#     +retry_in+ is the number of seconds after which to re-monitor the Task after
#                it has been disabled by the condition.
#     +retry_times+ is the number of times after which to permanently unmonitor
#                   the Task.
#     +retry_within+ is the number of seconds within which +retry_times+ retries
#                    must occur for the Task to be given up on.
#     The give-up limit is applied only when both +retry_times+ and
#     +retry_within+ are set; with +retry_in+ alone, monitoring is re-enabled
#     after every trigger.
#
# Examples
#
# Trigger if the Task transitions to either of two states five times within
# five minutes, re-monitor it ten minutes later, and give up once that has
# happened five times within two hours:
#
#   on.condition(:flapping) do |c|
#     c.to_state = [:start, :restart]
#     c.times = 5
#     c.within = 5.minutes
#     c.retry_in = 10.minutes
#     c.retry_times = 5
#     c.retry_within = 2.hours
#   end
#
class Flapping < TriggerCondition
  attr_accessor :times,
                :within,
                :from_state,
                :to_state,
                :retry_in,
                :retry_times,
                :retry_within

  def initialize
    self.info = "process is flapping"
  end

  # Builds the two histories used by process and the retry mechanism:
  # one sized by +times+, one sized by +retry_times+.
  def prepare
    @timeline = Timeline.new(self.times)
    @retry_timeline = Timeline.new(self.retry_times)
  end

  # Checks that +times+, +within+ and at least one of +from_state+ /
  # +to_state+ were configured, reporting each problem through +complain+.
  #
  # Returns true when nothing had to be reported.
  def valid?
    valid = true
    valid &= complain("Attribute 'times' must be specified", self) if self.times.nil?
    valid &= complain("Attribute 'within' must be specified", self) if self.within.nil?
    valid &= complain("Attributes 'from_state', 'to_state', or both must be specified", self) if self.from_state.nil? && self.to_state.nil?
    valid
  end

  # Handles an event delivered to this condition.
  #
  # event   - the event name; only :state_change is acted upon.
  # payload - for :state_change, the pair [from_state, to_state].
  #
  # A matching transition is timestamped. Once +times+ timestamps are held
  # and the oldest and newest are less than +within+ seconds apart, the
  # history is cleared, +trigger+ is called and the retry mechanism runs.
  #
  # Any StandardError raised along the way is printed to standard output
  # and not re-raised.
  def process(event, payload)
    return unless event == :state_change

    event_from_state, event_to_state = *payload

    return unless state_matches?(self.from_state, event_from_state) &&
                  state_matches?(self.to_state, event_to_state)

    @timeline << Time.now

    enough_transitions = (@timeline.size == self.times)
    inside_window = (@timeline.last - @timeline.first) < self.within
    return unless enough_transitions && inside_window

    @timeline.clear
    trigger
    retry_mechanism
  rescue => e
    puts e.message
    puts e.backtrace.join("\n")
  end

  private

  # True when no state was configured (nil/false), or when the observed
  # state is the configured state or one of the configured states.
  def state_matches?(configured, observed)
    !configured || Array(configured).include?(observed)
  end

  # After a trigger, either gives up (log only) or schedules monitoring to
  # be re-enabled +retry_in+ seconds later. Does nothing without +retry_in+.
  def retry_mechanism
    return unless self.retry_in

    if retries_exhausted?
      # give up
      Thread.new do
        # NOTE(review): the purpose of this one-second pause is not visible
        # here; presumably it lets the triggered transition settle - confirm.
        sleep 1

        applog(self.watch, :info, "#{self.watch.name} giving up")
      end
    else
      # try again later
      Thread.new do
        sleep 1

        applog(self.watch, :info, "#{self.watch.name} auto-reenable monitoring in #{self.retry_in} seconds")

        sleep self.retry_in

        applog(self.watch, :info, "#{self.watch.name} auto-reenabling monitoring")

        # Only re-enable when nothing else has changed the state meanwhile.
        self.watch.monitor if self.watch.state == :unmonitored
      end
    end
  end

  # Records this retry and reports whether +retry_times+ retries have now
  # happened less than +retry_within+ seconds apart.
  #
  # When either limit is unset there is nothing to compare against, so the
  # answer is always false; previously the comparison against nil raised
  # and the error swallowed in process prevented the retry from ever
  # being scheduled.
  def retries_exhausted?
    return false unless self.retry_times && self.retry_within

    @retry_timeline << Time.now

    enough_retries = (@retry_timeline.size == self.retry_times)
    inside_window = (@retry_timeline.last - @retry_timeline.first) < self.retry_within

    enough_retries && inside_window
  end
end
126
+
127
+ end
128
+ end
@@ -0,0 +1,184 @@
1
+ require 'net/http'
2
+ require 'net/https'
3
+
4
+ module God
5
+ module Conditions
6
+
7
# Condition Symbol :http_response_code
# Type: Poll
#
# Trigger based on the response from an HTTP request.
#
# Parameters
#   Required
#     +host+ is the hostname to connect [required]
#     --one of code_is or code_is_not--
#     +code_is+ trigger if the response code IS one of these
#               e.g. 500 or '500' or [404, 500] or %w{404 500}
#     +code_is_not+ trigger if the response code IS NOT one of these
#                   e.g. 200 or '200' or [200, 302] or %w{200 302}
#   Optional
#     +port+ is the port to connect (default 80)
#     +path+ is the path to connect (default '/')
#     +headers+ is the hash of HTTP headers to send (default none)
#     +times+ is the number of times after which to trigger (default 1)
#             e.g. 3 (times in a row) or [3, 5] (three out of five times)
#     +timeout+ is the time to wait for the connection to open and for the
#               response to be read (default 60.seconds)
#     +ssl+ should the connection use ssl (default false; port 443 always
#           uses ssl)
#     +ca_file+ is the path to a PEM file used to verify the server
#               certificate (default none, i.e. no verification)
#
# Examples
#
# Trigger if the response code from www.example.com/foo/bar
# is not a 200 (or if the connection is refused or times out):
#
#   on.condition(:http_response_code) do |c|
#     c.host = 'www.example.com'
#     c.path = '/foo/bar'
#     c.code_is_not = 200
#   end
#
# Trigger if the response code is a 404 or a 500 (will not
# be triggered by a connection refusal or timeout):
#
#   on.condition(:http_response_code) do |c|
#     c.host = 'www.example.com'
#     c.path = '/foo/bar'
#     c.code_is = [404, 500]
#   end
#
# Trigger if the response code is not a 200 five times in a row:
#
#   on.condition(:http_response_code) do |c|
#     c.host = 'www.example.com'
#     c.path = '/foo/bar'
#     c.code_is_not = 200
#     c.times = 5
#   end
#
# Trigger if the response code is not a 200 or does not respond
# within 10 seconds:
#
#   on.condition(:http_response_code) do |c|
#     c.host = 'www.example.com'
#     c.path = '/foo/bar'
#     c.code_is_not = 200
#     c.timeout = 10
#   end
class HttpResponseCode < PollCondition
  attr_accessor :code_is,      # e.g. 500 or '500' or [404, 500] or %w{404 500}
                :code_is_not,  # e.g. 200 or '200' or [200, 302] or %w{200 302}
                :times,        # e.g. 3 or [3, 5]
                :host,         # e.g. www.example.com
                :port,         # e.g. 8080
                :ssl,          # e.g. true or false
                :ca_file,      # e.g /path/to/pem_file for ssl verification (checkout http://curl.haxx.se/ca/cacert.pem)
                :timeout,      # e.g. 60.seconds
                :path,         # e.g. '/'
                :headers       # e.g. {'Host' => 'myvirtual.mydomain.com'}

  def initialize
    super
    self.port = 80
    self.path = '/'
    self.headers = {}
    self.times = [1, 1]
    self.timeout = 60.seconds
    self.ssl = false
    self.ca_file = nil
  end

  # Normalizes the configuration before polling starts: the code lists
  # become Arrays of Integers, an Integer +times+ becomes the pair
  # [times, times], and the two histories are sized by times[1].
  def prepare
    self.code_is = Array(self.code_is).map { |x| x.to_i } if self.code_is
    self.code_is_not = Array(self.code_is_not).map { |x| x.to_i } if self.code_is_not

    if self.times.kind_of?(Integer)
      self.times = [self.times, self.times]
    end

    @timeline = Timeline.new(self.times[1])
    @history = Timeline.new(self.times[1])
  end

  # Forgets all recorded outcomes.
  def reset
    @timeline.clear
    @history.clear
  end

  # Checks that +host+ and exactly one of +code_is+ / +code_is_not+ were
  # configured, reporting each problem through +complain+.
  #
  # Returns true when nothing had to be reported.
  def valid?
    valid = true
    valid &= complain("Attribute 'host' must be specified", self) if self.host.nil?
    valid &= complain("One (and only one) of attributes 'code_is' and 'code_is_not' must be specified", self) if
      (self.code_is.nil? && self.code_is_not.nil?) || (self.code_is && self.code_is_not)
    valid
  end

  # Performs one GET request and records whether the outcome matches the
  # configured codes.
  #
  # A connection-level failure (refused, reset, EOF, timeout or any other
  # StandardError) counts as a match when +code_is_not+ is configured and
  # as a non-match when +code_is+ is configured.
  #
  # Returns true once at least times.first of the recorded outcomes are
  # matches, false otherwise.
  def test
    connection = Net::HTTP.new(self.host, self.port)
    connection.use_ssl = self.port == 443 ? true : self.ssl

    # Bound connection setup as well as reads, as the +timeout+ option
    # is documented to cover waiting for the connection.
    connection.open_timeout = self.timeout
    connection.read_timeout = self.timeout

    if connection.use_ssl?
      if self.ca_file
        connection.ca_file = self.ca_file
        connection.verify_mode = OpenSSL::SSL::VERIFY_PEER
      else
        # NOTE(security): without a ca_file the server certificate is not
        # verified at all.
        connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
    end

    response = connection.start { |http| http.get(self.path, self.headers) }

    actual_response_code = response.code.to_i
    if self.code_is && self.code_is.include?(actual_response_code)
      pass(actual_response_code)
    elsif self.code_is_not && !self.code_is_not.include?(actual_response_code)
      pass(actual_response_code)
    else
      fail(actual_response_code)
    end
  rescue Errno::ECONNREFUSED
    connection_failure('Refused')
  rescue Errno::ECONNRESET
    connection_failure('Reset')
  rescue EOFError
    connection_failure('EOF')
  rescue Timeout::Error
    connection_failure('Timeout')
  rescue Errno::ETIMEDOUT
    connection_failure('Timedout')
  rescue StandardError => failure
    # StandardError rather than Exception, so that signals and exit
    # requests are not swallowed by the poll.
    connection_failure(failure.class.name)
  end

  private

  # Records a request that produced no response code, labelled with
  # +label+ in the history.
  def connection_failure(label)
    self.code_is ? fail(label) : pass(label)
  end

  # Records a matching outcome and reports whether enough matches have
  # accumulated to trigger.
  def pass(code)
    @timeline << true
    triggered = @timeline.count { |matched| matched } >= self.times.first
    state = triggered ? 'abnormal' : 'nominal'
    self.info = "http response #{state} #{history(code, true)}"
    triggered
  end

  # Records a non-matching outcome; never triggers.
  #
  # Within this class the name shadows Kernel#fail (the alias of raise).
  def fail(code)
    @timeline << false
    self.info = "http response nominal #{history(code, false)}"
    false
  end

  # Appends +code+ to the displayed history (matches are prefixed with
  # '*') and returns the history rendered as "[a, b, c]".
  def history(code, passed)
    entry = passed ? "*#{code}" : code.to_s
    @history << entry
    '[' + @history.join(", ") + ']'
  end

end
182
+
183
+ end
184
+ end
@@ -0,0 +1,25 @@
1
+ module God
2
+ module Conditions
3
+
4
# Poll condition whose outcome is decided by a caller-supplied callable.
#
# Attributes
#   +lambda+ is an object responding to #call; its truthiness decides
#            the result of each test.
class Lambda < PollCondition
  attr_accessor :lambda

  # Checks that +lambda+ was configured, reporting it through +complain+
  # when it was not.
  #
  # Returns true when nothing had to be reported.
  def valid?
    return true unless self.lambda.nil?

    complain("Attribute 'lambda' must be specified", self) ? true : false
  end

  # Invokes the callable, records the outcome in +info+ and returns
  # true or false according to the truthiness of the result.
  def test
    satisfied = self.lambda.call ? true : false
    self.info = satisfied ? "lambda condition was satisfied" : "lambda condition was not satisfied"
    satisfied
  end
end
23
+
24
+ end
25
+ end
@@ -0,0 +1,82 @@
1
+ module God
2
+ module Conditions
3
+
4
# Condition Symbol :memory_usage
# Type: Poll
#
# Trigger when the resident memory of a process is above a specified limit.
#
# Parameters
#   Required
#     +pid_file+ is the pid file of the process in question. Automatically
#                populated for Watches.
#     +above+ is the amount of resident memory (in kilobytes) above which
#             the condition should trigger. You can also use the sugar
#             methods #kilobytes, #megabytes, and #gigabytes to clarify
#             this amount (see examples).
#   Optional
#     +times+ is the number of over-limit samples after which to trigger
#             (default 1), e.g. 3 or [3, 5]; an Integer n is treated as
#             [n, n], and the second number sizes the sample history.
#
# Examples
#
# Trigger if the process is using more than 100 megabytes of resident
# memory (from a Watch):
#
#   on.condition(:memory_usage) do |c|
#     c.above = 100.megabytes
#   end
#
# Non-Watch Tasks must specify a PID file:
#
#   on.condition(:memory_usage) do |c|
#     c.above = 100.megabytes
#     c.pid_file = "/var/run/mongrel.3000.pid"
#   end
class MemoryUsage < PollCondition
  attr_accessor :above, :times, :pid_file

  def initialize
    super
    self.above = nil
    self.times = [1, 1]
  end

  # Normalizes an Integer +times+ into the pair [times, times] and builds
  # the sample history sized by times[1].
  def prepare
    if self.times.kind_of?(Integer)
      self.times = [self.times, self.times]
    end

    @timeline = Timeline.new(self.times[1])
  end

  # Forgets all recorded samples.
  def reset
    @timeline.clear
  end

  # Returns the pid to inspect: read from +pid_file+ when one is
  # configured, otherwise taken from the watch.
  def pid
    self.pid_file ? File.read(self.pid_file).strip.to_i : self.watch.pid
  end

  # Checks that a pid file is available (here or on the watch) and that
  # +above+ was configured, reporting each problem through +complain+.
  #
  # Returns true when nothing had to be reported.
  def valid?
    valid = true
    valid &= complain("Attribute 'pid_file' must be specified", self) if self.pid_file.nil? && self.watch.pid_file.nil?
    valid &= complain("Attribute 'above' must be specified", self) if self.above.nil?
    valid
  end

  # Samples the process memory, records it, and describes the recorded
  # samples in +info+ (samples over the limit are prefixed with '*').
  #
  # Returns true once at least times.first recorded samples exceed
  # +above+, false otherwise.
  def test
    process = System::Process.new(self.pid)
    @timeline.push(process.memory)

    history = "[" + @timeline.map { |x| "#{x > self.above ? '*' : ''}#{x}kb" }.join(", ") + "]"

    if @timeline.count { |x| x > self.above } >= self.times.first
      self.info = "memory out of bounds #{history}"
      true
    else
      # Report the within-bounds state too, instead of leaving info as
      # an empty Array.
      self.info = "memory within bounds #{history}"
      false
    end
  end
end
80
+
81
+ end
82
+ end