flapjack 0.6.38 → 0.6.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ require 'redis'
12
12
  require 'yajl/json_gem'
13
13
 
14
14
  require 'flapjack/data/entity_check'
15
+ require 'flapjack/data/global'
15
16
  require 'flapjack/pikelet'
16
17
 
17
18
  module Flapjack
@@ -20,144 +21,13 @@ module Flapjack
20
21
 
21
22
  include Flapjack::Pikelet
22
23
 
24
+ PAGERDUTY_EVENTS_API_URL = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'
25
+ SEM_PAGERDUTY_ACKS_RUNNING = 'sem_pagerduty_acks_running'
26
+
23
27
  def setup
24
28
  @redis = build_redis_connection_pool
25
29
  logger.debug("New Pagerduty pikelet with the following options: #{@config.inspect}")
26
30
 
27
- @pagerduty_events_api_url = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'
28
- @pagerduty_acks_started = nil
29
- @sem_pagerduty_acks_running = 'sem_pagerduty_acks_running'
30
- end
31
-
32
- def send_pagerduty_event(event)
33
- options = { :body => Yajl::Encoder.encode(event) }
34
- http = EM::HttpRequest.new(@pagerduty_events_api_url).post(options)
35
- response = Yajl::Parser.parse(http.response)
36
- status = http.response_header.status
37
- logger.debug "send_pagerduty_event got a return code of #{status.to_s} - #{response.inspect}"
38
- [status, response]
39
- end
40
-
41
- def test_pagerduty_connection
42
- noop = { "service_key" => "11111111111111111111111111111111",
43
- "incident_key" => "Flapjack is running a NOOP",
44
- "event_type" => "nop",
45
- "description" => "I love APIs with noops." }
46
- code, results = send_pagerduty_event(noop)
47
- return true if code == 200 && results['status'] =~ /success/i
48
- logger.error "Error: test_pagerduty_connection: API returned #{code.to_s} #{results.inspect}"
49
- false
50
- end
51
-
52
- # this should be moved to a checks data model perhaps
53
- def unacknowledged_failing_checks
54
- failing_checks = @redis_timer.zrange('failed_checks', '0', '-1')
55
- unless failing_checks.is_a?(Array)
56
- @logger.error("redis.zrange returned something other than an array! Here it is: " + failing_checks.inspect)
57
- end
58
- ufc = failing_checks.reject {|check|
59
- @redis_timer.exists(check + ':unscheduled_maintenance')
60
- }
61
- @logger.debug "found unacknowledged failing checks as follows: " + ufc.join(', ')
62
- ufc
63
- end
64
-
65
- def pagerduty_acknowledged?(opts)
66
- subdomain = opts['subdomain']
67
- username = opts['username']
68
- password = opts['password']
69
- check = opts['check']
70
-
71
- url = 'https://' + subdomain + '.pagerduty.com/api/v1/incidents'
72
- query = { 'fields' => 'incident_number,status,last_status_change_by',
73
- 'since' => (Time.new.utc - (60*60*24*7)).iso8601,
74
- 'until' => (Time.new.utc + (60*60*24)).iso8601,
75
- 'incident_key' => check,
76
- 'status' => 'acknowledged' }
77
-
78
- options = { :head => { 'authorization' => [username, password] },
79
- :query => query }
80
-
81
- http = EM::HttpRequest.new(url).get(options)
82
- # DEBUG flapjack-pagerduty: pagerduty_acknowledged?: decoded response as:
83
- # {"incidents"=>[{"incident_number"=>40, "status"=>"acknowledged",
84
- # "last_status_change_by"=>{"id"=>"PO1NWPS", "name"=>"Jesse Reynolds",
85
- # "email"=>"jesse@bulletproof.net",
86
- # "html_url"=>"http://bltprf.pagerduty.com/users/PO1NWPS"}}], "limit"=>100, "offset"=>0,
87
- # "total"=>1}
88
- begin
89
- response = Yajl::Parser.parse(http.response)
90
- rescue Yajl::ParseError
91
- @logger.error("failed to parse json from a post to #{url} ... response headers and body follows...")
92
- @logger.error(http.response_header.inspect)
93
- @logger.error(http.response)
94
- return nil, nil
95
- end
96
- status = http.response_header.status
97
-
98
- @logger.debug("pagerduty_acknowledged?: decoded response as: #{response.inspect}")
99
- if response.nil?
100
- @logger.error('no valid response received from pagerduty!')
101
- return nil, nil
102
- end
103
-
104
- if response['incidents'].nil?
105
- @logger.error('no incidents found in response')
106
- return nil, nil
107
- end
108
-
109
- if response['incidents'].length > 0
110
- pg_acknowledged_by = response['incidents'].first['last_status_change_by']
111
- return true, :pg_acknowledged_by => pg_acknowledged_by
112
- else
113
- return false, nil
114
- end
115
- end
116
-
117
- def catch_pagerduty_acks
118
-
119
- # ensure we're the only instance of the pagerduty acknowledgement check running (with a naive
120
- # timeout of five minutes to guard against stale locks caused by crashing code) either in this
121
- # process or in other processes
122
- if (@pagerduty_acks_started and @pagerduty_acks_started > (Time.now.to_i - 300)) or
123
- @redis_timer.get(@sem_pagerduty_acks_running) == 'true'
124
- logger.debug("skipping looking for acks in pagerduty as this is already happening")
125
- return
126
- end
127
-
128
- @pagerduty_acks_started = Time.now.to_i
129
- @redis_timer.set(@sem_pagerduty_acks_running, 'true')
130
- @redis_timer.expire(@sem_pagerduty_acks_running, 300)
131
-
132
- logger.debug("looking for acks in pagerduty for unack'd problems")
133
-
134
- # ok lets do it
135
- unacknowledged_failing_checks.each {|check|
136
- entity_check = Flapjack::Data::EntityCheck.for_event_id(check, { :redis => @redis_timer, :logger => @logger } )
137
- pagerduty_credentials = entity_check.pagerduty_credentials(:redis => @redis_timer)
138
-
139
- if pagerduty_credentials.length == 0
140
- @logger.debug("Found no pagerduty creditials for #{entity_check.entity_name}:#{entity_check.check}, moving on")
141
- next
142
- end
143
-
144
- # FIXME: try each set of credentials until one works (may have stale contacts turning up)
145
- options = pagerduty_credentials.first.merge('check' => check)
146
-
147
- pagerduty_acknowledged, result_hash = pagerduty_acknowledged?(options)
148
- if pagerduty_acknowledged
149
- pg_acknowledged_by = result_hash[:pg_acknowledged_by] unless result_hash.nil?
150
- @logger.debug "#{check} is acknowledged in pagerduty, creating flapjack acknowledgement ... "
151
- who_text = ""
152
- if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
153
- who_text = " by #{pg_acknowledged_by['name']}"
154
- end
155
- entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
156
- else
157
- @logger.debug "#{check} is not acknowledged in pagerduty, moving on"
158
- end
159
- }
160
- @redis_timer.del(@sem_pagerduty_acks_running)
161
31
  @pagerduty_acks_started = nil
162
32
  end
163
33
 
@@ -173,12 +43,12 @@ module Flapjack
173
43
  raise "Can't connect to the pagerduty API" unless test_pagerduty_connection
174
44
 
175
45
  # TODO: only clear this if there isn't another pagerduty gateway instance running
176
- # or better, include on instance ID in the semaphore key name
177
- @redis.del(@sem_pagerduty_acks_running)
46
+ # or better, include an instance ID in the semaphore key name
47
+ @redis.del(SEM_PAGERDUTY_ACKS_RUNNING)
178
48
 
179
49
  acknowledgement_timer = EM::Synchrony.add_periodic_timer(10) do
180
50
  @redis_timer ||= build_redis_connection_pool
181
- catch_pagerduty_acks
51
+ find_pagerduty_acknowledgements_if_safe
182
52
  end
183
53
 
184
54
  queue = @config['queue']
@@ -190,9 +60,7 @@ module Flapjack
190
60
  event = Yajl::Parser.parse(events[queue][1])
191
61
  type = event['notification_type']
192
62
  logger.debug("pagerduty notification event popped off the queue: " + event.inspect)
193
- if 'shutdown'.eql?(type)
194
- # do anything in particular?
195
- else
63
+ unless 'shutdown'.eql?(type)
196
64
  event_id = event['event_id']
197
65
  entity, check = event_id.split(':')
198
66
  state = event['state']
@@ -228,6 +96,138 @@ module Flapjack
228
96
  @redis_timer.empty! if @redis_timer
229
97
  end
230
98
 
99
+ # considering this as part of the public API -- exposes it for testing.
100
+ def find_pagerduty_acknowledgements_if_safe
101
+
102
+ # ensure we're the only instance of the pagerduty acknowledgement check running (with a naive
103
+ # timeout of five minutes to guard against stale locks caused by crashing code) either in this
104
+ # process or in other processes
105
+ if (@pagerduty_acks_started and @pagerduty_acks_started > (Time.now.to_i - 300)) or
106
+ @redis_timer.get(SEM_PAGERDUTY_ACKS_RUNNING) == 'true'
107
+ logger.debug("skipping looking for acks in pagerduty as this is already happening")
108
+ return
109
+ end
110
+
111
+ @pagerduty_acks_started = Time.now.to_i
112
+ @redis_timer.set(SEM_PAGERDUTY_ACKS_RUNNING, 'true')
113
+ @redis_timer.expire(SEM_PAGERDUTY_ACKS_RUNNING, 300)
114
+
115
+ find_pagerduty_acknowledgements
116
+
117
+ @redis_timer.del(SEM_PAGERDUTY_ACKS_RUNNING)
118
+ @pagerduty_acks_started = nil
119
+ end
120
+
121
+ private
122
+
123
+ def test_pagerduty_connection
124
+ noop = { "service_key" => "11111111111111111111111111111111",
125
+ "incident_key" => "Flapjack is running a NOOP",
126
+ "event_type" => "nop",
127
+ "description" => "I love APIs with noops." }
128
+ code, results = send_pagerduty_event(noop)
129
+ return true if code == 200 && results['status'] =~ /success/i
130
+ logger.error "Error: test_pagerduty_connection: API returned #{code.to_s} #{results.inspect}"
131
+ false
132
+ end
133
+
134
+ def send_pagerduty_event(event)
135
+ options = { :body => Yajl::Encoder.encode(event) }
136
+ http = EM::HttpRequest.new(PAGERDUTY_EVENTS_API_URL).post(options)
137
+ response = Yajl::Parser.parse(http.response)
138
+ status = http.response_header.status
139
+ logger.debug "send_pagerduty_event got a return code of #{status.to_s} - #{response.inspect}"
140
+ [status, response]
141
+ end
142
+
143
+ def find_pagerduty_acknowledgements
144
+
145
+ logger.debug("looking for acks in pagerduty for unack'd problems")
146
+
147
+ unacknowledged_failing_checks = Flapjack::Data::Global.unacknowledged_failing_checks(:redis => @redis_timer)
148
+
149
+ @logger.debug "found unacknowledged failing checks as follows: " + unacknowledged_failing_checks.join(', ')
150
+
151
+ unacknowledged_failing_checks.each do |entity_check|
152
+ pagerduty_credentials = entity_check.pagerduty_credentials(:redis => @redis_timer)
153
+ check = entity_check.check
154
+
155
+ if pagerduty_credentials.empty?
156
+ @logger.debug("No pagerduty credentials found for #{entity_check.entity_name}:#{check}, skipping")
157
+ next
158
+ end
159
+
160
+ # FIXME: try each set of credentials until one works (may have stale contacts turning up)
161
+ options = pagerduty_credentials.first.merge('check' => check)
162
+
163
+ acknowledged = pagerduty_acknowledged?(options)
164
+ if acknowledged.nil?
165
+ @logger.debug "#{check} is not acknowledged in pagerduty, skipping"
166
+ next
167
+ end
168
+
169
+ pg_acknowledged_by = acknowledged[:pg_acknowledged_by]
170
+ @logger.debug "#{check} is acknowledged in pagerduty, creating flapjack acknowledgement... "
171
+ who_text = ""
172
+ if !pg_acknowledged_by.nil? && !pg_acknowledged_by['name'].nil?
173
+ who_text = " by #{pg_acknowledged_by['name']}"
174
+ end
175
+ entity_check.create_acknowledgement('summary' => "Acknowledged on PagerDuty" + who_text)
176
+ end
177
+
178
+ end
179
+
180
+ def pagerduty_acknowledged?(opts)
181
+ subdomain = opts['subdomain']
182
+ username = opts['username']
183
+ password = opts['password']
184
+ check = opts['check']
185
+
186
+ t = Time.now.utc
187
+
188
+ url = 'https://' + subdomain + '.pagerduty.com/api/v1/incidents'
189
+ query = { 'fields' => 'incident_number,status,last_status_change_by',
190
+ 'since' => (t - (60*60*24*7)).iso8601, # the last week
191
+ 'until' => (t + (60*60*24)).iso8601, # 1 day in the future
192
+ 'incident_key' => check,
193
+ 'status' => 'acknowledged' }
194
+
195
+ options = { :head => { 'authorization' => [username, password] },
196
+ :query => query }
197
+
198
+ http = EM::HttpRequest.new(url).get(options)
199
+ # DEBUG flapjack-pagerduty: pagerduty_acknowledged?: decoded response as:
200
+ # {"incidents"=>[{"incident_number"=>40, "status"=>"acknowledged",
201
+ # "last_status_change_by"=>{"id"=>"PO1NWPS", "name"=>"Jesse Reynolds",
202
+ # "email"=>"jesse@bulletproof.net",
203
+ # "html_url"=>"http://bltprf.pagerduty.com/users/PO1NWPS"}}], "limit"=>100, "offset"=>0,
204
+ # "total"=>1}
205
+ begin
206
+ response = Yajl::Parser.parse(http.response)
207
+ rescue Yajl::ParseError
208
+ @logger.error("failed to parse json from a post to #{url} ... response headers and body follows...")
209
+ @logger.error(http.response_header.inspect)
210
+ @logger.error(http.response)
211
+ return nil
212
+ end
213
+ status = http.response_header.status
214
+
215
+ @logger.debug("pagerduty_acknowledged?: decoded response as: #{response.inspect}")
216
+ if response.nil?
217
+ @logger.error('no valid response received from pagerduty!')
218
+ return nil
219
+ end
220
+
221
+ if response['incidents'].nil?
222
+ @logger.error('no incidents found in response')
223
+ return nil
224
+ end
225
+
226
+ return nil if response['incidents'].empty?
227
+
228
+ {:pg_acknowledged_by => response['incidents'].first['last_status_change_by']}
229
+ end
230
+
231
231
  end
232
232
  end
233
233
 
@@ -8,7 +8,7 @@ require 'em-synchrony'
8
8
  require 'redis/connection/synchrony'
9
9
  require 'redis'
10
10
 
11
- # require 'eventmachine/synchrony/connection_pool'
11
+ require 'em-synchrony/connection_pool'
12
12
 
13
13
  module Flapjack
14
14
  class RedisPool < EventMachine::Synchrony::ConnectionPool
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  module Flapjack
4
- VERSION = "0.6.38"
4
+ VERSION = "0.6.39"
5
5
  end
@@ -105,7 +105,7 @@
105
105
  %input{:type => 'submit', :value => 'End Now', :class => 'button'}
106
106
  - else
107
107
  %p No maintenance is scheduled
108
- %h4 Add Scheduled Maintenace
108
+ %h4 Add Scheduled Maintenance
109
109
  %form{:action => "/scheduled_maintenances/#{@entity}/#{@check}", :method => "post"}
110
110
  %fieldset
111
111
  %table
@@ -3,14 +3,171 @@ require 'flapjack/coordinator'
3
3
 
4
4
  describe Flapjack::Coordinator do
5
5
 
6
- it "is initialized"
6
+ let(:fiber) { mock('Fiber') }
7
7
 
8
- it "starts a suite of services based on config settings"
8
+ let(:config) {
9
+ {'redis' => {},
10
+ 'executive' => {'enabled' => 'yes'},
11
+ 'email_notifier' => {'enabled' => 'yes'},
12
+ 'web' => {'enabled' => 'yes'}
13
+ }
14
+ }
9
15
 
10
- it "runs daemonized"
16
+ before(:each) {
17
+ # temporary workaround for failing test due to preserved state;
18
+ # won't be needed soon as this code has been fixed in a branch
19
+ Flapjack::API.class_variable_set('@@redis', nil)
20
+ Flapjack::Web.class_variable_set('@@redis', nil)
21
+ }
11
22
 
12
- it "runs undaemonized"
23
+ # leaving actual testing of daemonisation to that class's tests
24
+ it "daemonizes properly" do
25
+ fc = Flapjack::Coordinator.new(config)
26
+ fc.should_receive(:daemonize)
27
+ fc.should_not_receive(:build_pikelet)
28
+ fc.should_not_receive(:build_resque_pikelet)
29
+ fc.should_not_receive(:build_thin_pikelet)
30
+ fc.start(:daemonize => true, :signals => false)
31
+ end
13
32
 
14
- it "stops its services when closing"
33
+ it "runs undaemonized" do
34
+ EM.should_receive(:synchrony).and_yield
35
+
36
+ fc = Flapjack::Coordinator.new(config)
37
+ fc.should_receive(:build_pikelet)
38
+ fc.should_receive(:build_resque_pikelet)
39
+ fc.should_receive(:build_thin_pikelet)
40
+ fc.start(:daemonize => false, :signals => false)
41
+ end
42
+
43
+ it "starts after daemonizing" do
44
+ EM.should_receive(:synchrony).and_yield
45
+
46
+ fc = Flapjack::Coordinator.new(config)
47
+ fc.should_receive(:build_pikelet)
48
+ fc.should_receive(:build_resque_pikelet)
49
+ fc.should_receive(:build_thin_pikelet)
50
+ fc.after_daemonize
51
+ end
52
+
53
+ it "traps system signals and shuts down"
54
+
55
+ # TODO when merged with other changes, this will check pik[:class] instead,
56
+ # having to create instances of the pikelet classes is messy
57
+ it "stops its services when closing" do
58
+ fiber_exec = mock('fiber_exec')
59
+ fiber_rsq = mock('fiber_rsq')
60
+
61
+ exec = Flapjack::Executive.new
62
+ exec.should_receive(:add_shutdown_event)
63
+ email = EM::Resque::Worker.new('example')
64
+ email.should_receive(:shutdown)
65
+ web = Thin::Server.new('0.0.0.0', 3000, Flapjack::Web, :signals => false)
66
+ web.should_receive(:stop!)
67
+
68
+ redis = mock('redis')
69
+ redis.should_receive(:quit)
70
+ Redis.should_receive(:new).and_return(redis)
71
+
72
+ fiber.should_receive(:resume)
73
+ fiber_stop = mock('fiber_stop')
74
+ fiber_stop.should_receive(:resume)
75
+ Fiber.should_receive(:new).twice.and_yield.and_return(fiber, fiber_stop)
76
+
77
+ fiber_exec.should_receive(:alive?).and_return(true, false)
78
+ fiber_rsq.should_receive(:alive?).and_return(true, false)
79
+
80
+ EM.should_receive(:stop)
81
+
82
+ pikelets = [{:fiber => fiber_exec, :instance => exec},
83
+ {:fiber => fiber_rsq, :instance => email},
84
+ {:instance => web}]
85
+
86
+ fc = Flapjack::Coordinator.new
87
+ fc.instance_variable_set('@redis_options', {})
88
+ fc.instance_variable_set('@pikelets', pikelets)
89
+ fc.stop
90
+ end
91
+
92
+ it "creates an executive pikelet" do
93
+ exec = mock('executive')
94
+ exec.should_receive(:bootstrap)
95
+ Flapjack::Executive.should_receive(:new).and_return(exec)
96
+ exec.should_receive(:main)
97
+
98
+ fiber.should_receive(:resume)
99
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
100
+
101
+ fc = Flapjack::Coordinator.new
102
+ fc.send(:build_pikelet, 'executive', {})
103
+ pikelets = fc.instance_variable_get('@pikelets')
104
+ pikelets.should_not be_nil
105
+ pikelets.should be_an(Array)
106
+ pikelets.should have(1).pikelet
107
+ pikelets.first.should == {:fiber => fiber, :type => 'executive', :instance => exec}
108
+ end
109
+
110
+ it "handles an exception raised by a jabber pikelet" do
111
+ jabber = mock('jabber')
112
+ jabber.should_receive(:bootstrap)
113
+ Flapjack::Jabber.should_receive(:new).and_return(jabber)
114
+ jabber.should_receive(:main).and_raise(RuntimeError)
115
+
116
+ fiber.should_receive(:resume)
117
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
118
+
119
+ fc = Flapjack::Coordinator.new
120
+ fc.should_receive(:stop)
121
+ fc.send(:build_pikelet, 'jabber_gateway', {})
122
+ pikelets = fc.instance_variable_get('@pikelets')
123
+ pikelets.should_not be_nil
124
+ pikelets.should be_an(Array)
125
+ pikelets.should have(1).pikelet
126
+ pikelets.first.should == {:fiber => fiber, :type => 'jabber_gateway', :instance => jabber}
127
+ end
128
+
129
+ it "creates a resque worker pikelet" do
130
+ redis = mock('redis')
131
+ Flapjack::RedisPool.should_receive(:new).and_return(redis)
132
+ Resque.should_receive(:redis=).with(redis)
133
+
134
+ worker = mock('worker')
135
+ EM::Resque::Worker.should_receive(:new).and_return(worker)
136
+ worker.should_receive(:work)
137
+
138
+ fiber.should_receive(:resume)
139
+ Fiber.should_receive(:new).and_yield.and_return(fiber)
140
+
141
+ fc = Flapjack::Coordinator.new
142
+ fc.send(:build_resque_pikelet, 'email_notifier', {})
143
+ pikelets = fc.instance_variable_get('@pikelets')
144
+ pikelets.should_not be_nil
145
+ pikelets.should be_an(Array)
146
+ pikelets.should have(1).pikelet
147
+ pikelets.first.should == {:fiber => fiber, :type => 'email_notifier', :instance => worker}
148
+ end
149
+
150
+ it "handles an exception raised by a resque worker pikelet"
151
+
152
+ it "creates a thin server pikelet" do
153
+ redis = mock('redis')
154
+ Flapjack::RedisPool.should_receive(:new).and_return(redis)
155
+
156
+ server = mock('server')
157
+ server.should_receive(:start)
158
+ Thin::Server.should_receive(:new).
159
+ with(/^(?:\d{1,3}\.){3}\d{1,3}$/, an_instance_of(Fixnum), Flapjack::Web, :signals => false).
160
+ and_return(server)
161
+
162
+ fc = Flapjack::Coordinator.new
163
+ fc.send(:build_thin_pikelet, 'web', {})
164
+ pikelets = fc.instance_variable_get('@pikelets')
165
+ pikelets.should_not be_nil
166
+ pikelets.should be_an(Array)
167
+ pikelets.should have(1).pikelet
168
+ pikelets.first.should == {:type => 'web', :instance => server}
169
+ end
170
+
171
+ # NB: exceptions are handled directly by the Thin pikelets
15
172
 
16
173
  end