flapjack 0.6.38 → 0.6.39

Sign up to get free protection for your applications and to get access to all the features.
@@ -12,6 +12,7 @@ require 'redis'
12
12
  require 'yajl/json_gem'
13
13
 
14
14
  require 'flapjack/data/entity_check'
15
+ require 'flapjack/data/global'
15
16
  require 'flapjack/pikelet'
16
17
 
17
18
  module Flapjack
@@ -20,144 +21,13 @@ module Flapjack
20
21
 
21
22
  include Flapjack::Pikelet
22
23
 
24
+ PAGERDUTY_EVENTS_API_URL = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'
25
+ SEM_PAGERDUTY_ACKS_RUNNING = 'sem_pagerduty_acks_running'
26
+
23
27
  def setup
24
28
  @redis = build_redis_connection_pool
25
29
  logger.debug("New Pagerduty pikelet with the following options: #{@config.inspect}")
26
30
 
27
- @pagerduty_events_api_url = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'
28
- @pagerduty_acks_started = nil
29
- @sem_pagerduty_acks_running = 'sem_pagerduty_acks_running'
30
- end
31
-
32
- def send_pagerduty_event(event)
33
- options = { :body => Yajl::Encoder.encode(event) }
34
- http = EM::HttpRequest.new(@pagerduty_events_api_url).post(options)
35
- response = Yajl::Parser.parse(http.response)
36
- status = http.response_header.status
37
- logger.debug "send_pagerduty_event got a return code of #{status.to_s} - #{response.inspect}"
38
- [status, response]
39
- end
40
-
41
- def test_pagerduty_connection
42
- noop = { "service_key" => "11111111111111111111111111111111",
43
- "incident_key" => "Flapjack is running a NOOP",
44
- "event_type" => "nop",
45
- "description" => "I love APIs with noops." }
46
- code, results = send_pagerduty_event(noop)
47
- return true if code == 200 && results['status'] =~ /success/i
48
- logger.error "Error: test_pagerduty_connection: API returned #{code.to_s} #{results.inspect}"
49
- false
50
- end
51
-
52
- # this should be moved to a checks data model perhaps
53
- def unacknowledged_failing_checks
54
- failing_checks = @redis_timer.zrange('failed_checks', '0', '-1')
55
- unless failing_checks.is_a?(Array)
56
- @logger.error("redis.zrange returned something other than an array! Here it is: " + failing_checks.inspect)
57
- end
58
- ufc = failing_checks.reject {|check|
59
- @redis_timer.exists(check + ':unscheduled_maintenance')
60
- }
61
- @logger.debug "found unacknowledged failing checks as follows: " + ufc.join(', ')
62
- ufc
63
- end
64
-
65
- def pagerduty_acknowledged?(opts)
66
- subdomain = opts['subdomain']
67
- username = opts['username']
68
- password = opts['password']
69
- check = opts['check']
70
-
71
- url = 'https://' + subdomain + '.pagerduty.com/api/v1/incidents'
72
- query = { 'fields' => 'incident_number,status,last_status_change_by',
73
- 'since' => (Time.new.utc - (60*60*24*7)).iso8601,
74
- 'until' => (Time.new.utc + (60*60*24)).iso8601,
75
- 'incident_key' => check,
76
- 'status' => 'acknowledged' }
77
-
78
- options = { :head => { 'authorization' => [username, password] },
79
- :query => query }
80
-
81
- http = EM::HttpRequest.new(url).get(options)
82
- # DEBUG flapjack-pagerduty: pagerduty_acknowledged?: decoded response as:
83
- # {"incidents"=>[{"incident_number"=>40, "status"=>"acknowledged",
84
- # "last_status_change_by"=>{"id"=>"PO1NWPS", "name"=>"Jesse Reynolds",
85
- # "email"=>"jesse@bulletproof.net",
86
- # "html_url"=>"http://bltprf.pagerduty.com/users/PO1NWPS"}}], "limit"=>100, "offset"=>0,
87
- # "total"=>1}
88
- begin
89
- response = Yajl::Parser.parse(http.response)
90
- rescue Yajl::ParseError
91
- @logger.error("failed to parse json from a post to #{url} ... response headers and body follows...")
92
- @logger.error(http.response_header.inspect)
93
- @logger.error(http.response)
94
- return nil, nil
95
- end
96
- status = http.response_header.status
97
-
98
- @logger.debug("pagerduty_acknowledged?: decoded response as: #{response.inspect}")
99
- if response.nil?
100
- @logger.error('no valid response received from pagerduty!')
101
- return nil, nil
102
- end
103
-
104
- if response['incidents'].nil?
105
- @logger.error('no incidents found in response')
106
- return nil, nil
107
- end
108
-
109
- if response['incidents'].length > 0
110
- pg_acknowledged_by = response['incidents'].first['last_status_change_by']
111
- return true, :pg_acknowledged_by => pg_acknowledged_by
112
- else
113
- return false, nil
114
- end
115
- end
116
-
117
- def catch_pagerduty_acks
118
-
119
- # ensure we're the only instance of the pagerduty acknowledgement check running (with a naive
120
- # timeout of five minutes to guard against stale locks caused by crashing code) either in this
121
- # process or in other processes
122
- if (@pagerduty_acks_started and @pagerduty_acks_started > (Time.now.to_i - 300)) or
123
- @redis_timer.get(@sem_pagerduty_acks_running) == 'true'
124
- logger.debug("skipping looking for acks in pagerduty as this is already happening")
125
- return
126
- end
127
-
128
- @pagerduty_acks_started = Time.now.to_i
129
- @redis_timer.set(@sem_pagerduty_acks_running, 'true')
130
- @redis_timer.expire(@sem_pagerduty_acks_running, 300)
131
-
132
- logger.debug("looking for acks in pagerduty for unack'd problems")
133
-
134
- # ok lets do it
135
- unacknowledged_failing_checks.each {|check|
136
- entity_check = Flapjack::Data::EntityCheck.for_event_id(check, { :redis => @redis_timer, :logger => @logger } )
137
- pagerduty_credentials = entity_check.pagerduty_credentials(:redis => @redis_timer)
138
-
139
- if pagerduty_credentials.length == 0
140
- @logger.debug("Found no pagerduty creditials for #{entity_check.entity_name}:#{entity_check.check}, moving on")
141
- next
142
- end
143
-
144
- # FIXME: try each set of credentials until one works (may have stale contacts turning up)
145
- options = pagerduty_credentials.first.merge('check' => check)
146
-
147
- pagerduty_acknowledged, result_hash = pagerduty_acknowledged?(options)
148
- if pagerduty_acknowledged
149
- pg_acknowledged_by = result_hash[:pg_acknowledged_by] unless result_hash.nil?
150
- @logger.debug "#{check} is acknowledged in pagerduty, creating flapjack acknowledgement ... "
151
- who_text = ""
152
- if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
153
- who_text = " by #{pg_acknowledged_by['name']}"
154
- end
155
- entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
156
- else
157
- @logger.debug "#{check} is not acknowledged in pagerduty, moving on"
158
- end
159
- }
160
- @redis_timer.del(@sem_pagerduty_acks_running)
161
31
  @pagerduty_acks_started = nil
162
32
  end
163
33
 
@@ -173,12 +43,12 @@ module Flapjack
173
43
  raise "Can't connect to the pagerduty API" unless test_pagerduty_connection
174
44
 
175
45
  # TODO: only clear this if there isn't another pagerduty gateway instance running
176
- # or better, include on instance ID in the semaphore key name
177
- @redis.del(@sem_pagerduty_acks_running)
46
+ # or better, include an instance ID in the semaphore key name
47
+ @redis.del(SEM_PAGERDUTY_ACKS_RUNNING)
178
48
 
179
49
  acknowledgement_timer = EM::Synchrony.add_periodic_timer(10) do
180
50
  @redis_timer ||= build_redis_connection_pool
181
- catch_pagerduty_acks
51
+ find_pagerduty_acknowledgements_if_safe
182
52
  end
183
53
 
184
54
  queue = @config['queue']
@@ -190,9 +60,7 @@ module Flapjack
190
60
  event = Yajl::Parser.parse(events[queue][1])
191
61
  type = event['notification_type']
192
62
  logger.debug("pagerduty notification event popped off the queue: " + event.inspect)
193
- if 'shutdown'.eql?(type)
194
- # do anything in particular?
195
- else
63
+ unless 'shutdown'.eql?(type)
196
64
  event_id = event['event_id']
197
65
  entity, check = event_id.split(':')
198
66
  state = event['state']
@@ -228,6 +96,138 @@ module Flapjack
228
96
  @redis_timer.empty! if @redis_timer
229
97
  end
230
98
 
99
+ # considering this as part of the public API -- exposes it for testing.
100
+ def find_pagerduty_acknowledgements_if_safe
101
+
102
+ # ensure we're the only instance of the pagerduty acknowledgement check running (with a naive
103
+ # timeout of five minutes to guard against stale locks caused by crashing code) either in this
104
+ # process or in other processes
105
+ if (@pagerduty_acks_started and @pagerduty_acks_started > (Time.now.to_i - 300)) or
106
+ @redis_timer.get(SEM_PAGERDUTY_ACKS_RUNNING) == 'true'
107
+ logger.debug("skipping looking for acks in pagerduty as this is already happening")
108
+ return
109
+ end
110
+
111
+ @pagerduty_acks_started = Time.now.to_i
112
+ @redis_timer.set(SEM_PAGERDUTY_ACKS_RUNNING, 'true')
113
+ @redis_timer.expire(SEM_PAGERDUTY_ACKS_RUNNING, 300)
114
+
115
+ find_pagerduty_acknowledgements
116
+
117
+ @redis_timer.del(SEM_PAGERDUTY_ACKS_RUNNING)
118
+ @pagerduty_acks_started = nil
119
+ end
120
+
121
+ private
122
+
123
+ def test_pagerduty_connection
124
+ noop = { "service_key" => "11111111111111111111111111111111",
125
+ "incident_key" => "Flapjack is running a NOOP",
126
+ "event_type" => "nop",
127
+ "description" => "I love APIs with noops." }
128
+ code, results = send_pagerduty_event(noop)
129
+ return true if code == 200 && results['status'] =~ /success/i
130
+ logger.error "Error: test_pagerduty_connection: API returned #{code.to_s} #{results.inspect}"
131
+ false
132
+ end
133
+
134
+ def send_pagerduty_event(event)
135
+ options = { :body => Yajl::Encoder.encode(event) }
136
+ http = EM::HttpRequest.new(PAGERDUTY_EVENTS_API_URL).post(options)
137
+ response = Yajl::Parser.parse(http.response)
138
+ status = http.response_header.status
139
+ logger.debug "send_pagerduty_event got a return code of #{status.to_s} - #{response.inspect}"
140
+ [status, response]
141
+ end
142
+
143
+ def find_pagerduty_acknowledgements
144
+
145
+ logger.debug("looking for acks in pagerduty for unack'd problems")
146
+
147
+ unacknowledged_failing_checks = Flapjack::Data::Global.unacknowledged_failing_checks(:redis => @redis_timer)
148
+
149
+ @logger.debug "found unacknowledged failing checks as follows: " + unacknowledged_failing_checks.join(', ')
150
+
151
+ unacknowledged_failing_checks.each do |entity_check|
152
+ pagerduty_credentials = entity_check.pagerduty_credentials(:redis => @redis_timer)
153
+ check = entity_check.check
154
+
155
+ if pagerduty_credentials.empty?
156
+ @logger.debug("No pagerduty credentials found for #{entity_check.entity_name}:#{check}, skipping")
157
+ next
158
+ end
159
+
160
+ # FIXME: try each set of credentials until one works (may have stale contacts turning up)
161
+ options = pagerduty_credentials.first.merge('check' => check)
162
+
163
+ acknowledged = pagerduty_acknowledged?(options)
164
+ if acknowledged.nil?
165
+ @logger.debug "#{check} is not acknowledged in pagerduty, skipping"
166
+ next
167
+ end
168
+
169
+ pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
170
+ @logger.debug "#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
171
+ who_text = ""
172
+ if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
173
+ who_text = " by #{pg_acknowledged_by['name']}"
174
+ end
175
+ entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
176
+ end
177
+
178
+ end
179
+
180
+ def pagerduty_acknowledged?(opts)
181
+ subdomain = opts['subdomain']
182
+ username = opts['username']
183
+ password = opts['password']
184
+ check = opts['check']
185
+
186
+ t = Time.now.utc
187
+
188
+ url = 'https://' + subdomain + '.pagerduty.com/api/v1/incidents'
189
+ query = { 'fields' => 'incident_number,status,last_status_change_by',
190
+ 'since' => (t - (60*60*24*7)).iso8601, # the last week
191
+ 'until' => (t + (60*60*24)).iso8601, # 1 day in the future
192
+ 'incident_key' => check,
193
+ 'status' => 'acknowledged' }
194
+
195
+ options = { :head => { 'authorization' => [username, password] },
196
+ :query => query }
197
+
198
+ http = EM::HttpRequest.new(url).get(options)
199
+ # DEBUG flapjack-pagerduty: pagerduty_acknowledged?: decoded response as:
200
+ # {"incidents"=>[{"incident_number"=>40, "status"=>"acknowledged",
201
+ # "last_status_change_by"=>{"id"=>"PO1NWPS", "name"=>"Jesse Reynolds",
202
+ # "email"=>"jesse@bulletproof.net",
203
+ # "html_url"=>"http://bltprf.pagerduty.com/users/PO1NWPS"}}], "limit"=>100, "offset"=>0,
204
+ # "total"=>1}
205
+ begin
206
+ response = Yajl::Parser.parse(http.response)
207
+ rescue Yajl::ParseError
208
+ @logger.error("failed to parse json from a post to #{url} ... response headers and body follows...")
209
+ @logger.error(http.response_header.inspect)
210
+ @logger.error(http.response)
211
+ return nil
212
+ end
213
+ status = http.response_header.status
214
+
215
+ @logger.debug("pagerduty_acknowledged?: decoded response as: #{response.inspect}")
216
+ if response.nil?
217
+ @logger.error('no valid response received from pagerduty!')
218
+ return nil
219
+ end
220
+
221
+ if response['incidents'].nil?
222
+ @logger.error('no incidents found in response')
223
+ return nil
224
+ end
225
+
226
+ return nil if response['incidents'].empty?
227
+
228
+ {:pg_acknowledged_by => response['incidents'].first['last_status_change_by']}
229
+ end
230
+
231
231
  end
232
232
  end
233
233
 
@@ -8,7 +8,7 @@ require 'em-synchrony'
8
8
  require 'redis/connection/synchrony'
9
9
  require 'redis'
10
10
 
11
- # require 'eventmachine/synchrony/connection_pool'
11
+ require 'em-synchrony/connection_pool'
12
12
 
13
13
  module Flapjack
14
14
  class RedisPool < EventMachine::Synchrony::ConnectionPool
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  module Flapjack
4
- VERSION = "0.6.38"
4
+ VERSION = "0.6.39"
5
5
  end
@@ -105,7 +105,7 @@
105
105
  %input{:type => 'submit', :value => 'End Now', :class => 'button'}
106
106
  - else
107
107
  %p No maintenance is scheduled
108
- %h4 Add Scheduled Maintenace
108
+ %h4 Add Scheduled Maintenance
109
109
  %form{:action => "/scheduled_maintenances/#{@entity}/#{@check}", :method => "post"}
110
110
  %fieldset
111
111
  %table
@@ -3,14 +3,171 @@ require 'flapjack/coordinator'
3
3
 
4
4
  describe Flapjack::Coordinator do
5
5
 
6
- it "is initialized"
6
+ let(:fiber) { mock('Fiber') }
7
7
 
8
- it "starts a suite of services based on config settings"
8
+ let(:config) {
9
+ {'redis' => {},
10
+ 'executive' => {'enabled' => 'yes'},
11
+ 'email_notifier' => {'enabled' => 'yes'},
12
+ 'web' => {'enabled' => 'yes'}
13
+ }
14
+ }
9
15
 
10
- it "runs daemonized"
16
+ before(:each) {
17
+ # temporary workaround for failing test due to preserved state;
18
+ # won't be needed soon as this code has been fixed in a branch
19
+ Flapjack::API.class_variable_set('@@redis', nil)
20
+ Flapjack::Web.class_variable_set('@@redis', nil)
21
+ }
11
22
 
12
- it "runs undaemonized"
23
+ # leaving actual testing of daemonisation to that class's tests
24
+ it "daemonizes properly" do
25
+ fc = Flapjack::Coordinator.new(config)
26
+ fc.should_receive(:daemonize)
27
+ fc.should_not_receive(:build_pikelet)
28
+ fc.should_not_receive(:build_resque_pikelet)
29
+ fc.should_not_receive(:build_thin_pikelet)
30
+ fc.start(:daemonize => true, :signals => false)
31
+ end
13
32
 
14
- it "stops its services when closing"
33
+ it "runs undaemonized" do
34
+ EM.should_receive(:synchrony).and_yield
35
+
36
+ fc = Flapjack::Coordinator.new(config)
37
+ fc.should_receive(:build_pikelet)
38
+ fc.should_receive(:build_resque_pikelet)
39
+ fc.should_receive(:build_thin_pikelet)
40
+ fc.start(:daemonize => false, :signals => false)
41
+ end
42
+
43
+ it "starts after daemonizing" do
44
+ EM.should_receive(:synchrony).and_yield
45
+
46
+ fc = Flapjack::Coordinator.new(config)
47
+ fc.should_receive(:build_pikelet)
48
+ fc.should_receive(:build_resque_pikelet)
49
+ fc.should_receive(:build_thin_pikelet)
50
+ fc.after_daemonize
51
+ end
52
+
53
+ it "traps system signals and shuts down"
54
+
55
+ # TODO when merged with other changes, this will check pik[:class] instead,
56
+ # having to create instances of the pikelet classes is messy
57
+ it "stops its services when closing" do
58
+ fiber_exec = mock('fiber_exec')
59
+ fiber_rsq = mock('fiber_rsq')
60
+
61
+ exec = Flapjack::Executive.new
62
+ exec.should_receive(:add_shutdown_event)
63
+ email = EM::Resque::Worker.new('example')
64
+ email.should_receive(:shutdown)
65
+ web = Thin::Server.new('0.0.0.0', 3000, Flapjack::Web, :signals => false)
66
+ web.should_receive(:stop!)
67
+
68
+ redis = mock('redis')
69
+ redis.should_receive(:quit)
70
+ Redis.should_receive(:new).and_return(redis)
71
+
72
+ fiber.should_receive(:resume)
73
+ fiber_stop = mock('fiber_stop')
74
+ fiber_stop.should_receive(:resume)
75
+ Fiber.should_receive(:new).twice.and_yield.and_return(fiber, fiber_stop)
76
+
77
+ fiber_exec.should_receive(:alive?).and_return(true, false)
78
+ fiber_rsq.should_receive(:alive?).and_return(true, false)
79
+
80
+ EM.should_receive(:stop)
81
+
82
+ pikelets = [{:fiber => fiber_exec, :instance => exec},
83
+ {:fiber => fiber_rsq, :instance => email},
84
+ {:instance => web}]
85
+
86
+ fc = Flapjack::Coordinator.new
87
+ fc.instance_variable_set('@redis_options', {})
88
+ fc.instance_variable_set('@pikelets', pikelets)
89
+ fc.stop
90
+ end
91
+
92
+ it "creates an executive pikelet" do
93
+ exec = mock('executive')
94
+ exec.should_receive(:bootstrap)
95
+ Flapjack::Executive.should_receive(:new).and_return(exec)
96
+ exec.should_receive(:main)
97
+
98
+ fiber.should_receive(:resume)
99
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
100
+
101
+ fc = Flapjack::Coordinator.new
102
+ fc.send(:build_pikelet, 'executive', {})
103
+ pikelets = fc.instance_variable_get('@pikelets')
104
+ pikelets.should_not be_nil
105
+ pikelets.should be_an(Array)
106
+ pikelets.should have(1).pikelet
107
+ pikelets.first.should == {:fiber => fiber, :type => 'executive', :instance => exec}
108
+ end
109
+
110
+ it "handles an exception raised by a jabber pikelet" do
111
+ jabber = mock('jabber')
112
+ jabber.should_receive(:bootstrap)
113
+ Flapjack::Jabber.should_receive(:new).and_return(jabber)
114
+ jabber.should_receive(:main).and_raise(RuntimeError)
115
+
116
+ fiber.should_receive(:resume)
117
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
118
+
119
+ fc = Flapjack::Coordinator.new
120
+ fc.should_receive(:stop)
121
+ fc.send(:build_pikelet, 'jabber_gateway', {})
122
+ pikelets = fc.instance_variable_get('@pikelets')
123
+ pikelets.should_not be_nil
124
+ pikelets.should be_an(Array)
125
+ pikelets.should have(1).pikelet
126
+ pikelets.first.should == {:fiber => fiber, :type => 'jabber_gateway', :instance => jabber}
127
+ end
128
+
129
+ it "creates a resque worker pikelet" do
130
+ redis = mock('redis')
131
+ Flapjack::RedisPool.should_receive(:new).and_return(redis)
132
+ Resque.should_receive(:redis=).with(redis)
133
+
134
+ worker = mock('worker')
135
+ EM::Resque::Worker.should_receive(:new).and_return(worker)
136
+ worker.should_receive(:work)
137
+
138
+ fiber.should_receive(:resume)
139
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
140
+
141
+ fc = Flapjack::Coordinator.new
142
+ fc.send(:build_resque_pikelet, 'email_notifier', {})
143
+ pikelets = fc.instance_variable_get('@pikelets')
144
+ pikelets.should_not be_nil
145
+ pikelets.should be_an(Array)
146
+ pikelets.should have(1).pikelet
147
+ pikelets.first.should == {:fiber => fiber, :type => 'email_notifier', :instance => worker}
148
+ end
149
+
150
+ it "handles an exception raised by a resque worker pikelet"
151
+
152
+ it "creates a thin server pikelet" do
153
+ redis = mock('redis')
154
+ Flapjack::RedisPool.should_receive(:new).and_return(redis)
155
+
156
+ server = mock('server')
157
+ server.should_receive(:start)
158
+ Thin::Server.should_receive(:new).
159
+ with(/^(?:\d{1,3}\.){3}\d{1,3}$/, an_instance_of(Fixnum), Flapjack::Web, :signals => false).
160
+ and_return(server)
161
+
162
+ fc = Flapjack::Coordinator.new
163
+ fc.send(:build_thin_pikelet, 'web', {})
164
+ pikelets = fc.instance_variable_get('@pikelets')
165
+ pikelets.should_not be_nil
166
+ pikelets.should be_an(Array)
167
+ pikelets.should have(1).pikelet
168
+ pikelets.first.should == {:type => 'web', :instance => server}
169
+ end
170
+
171
+ # NB: exceptions are handled directly by the Thin pikelets
15
172
 
16
173
  end