flapjack 0.7.34 → 0.7.35

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## Flapjack Changelog
2
2
 
3
+ # 0.7.35 - 2013-12-10
4
+ - Feature: allow flapper to flap with an arbitrary interval gh-383 (@jessereynolds)
5
+ - Feature: Expose statistics for currency of all checks gh-386 (@jessereynolds)
6
+ - Chore: WebUI: contacts page - move notification rules section higher gh-376 (@jessereynolds)
7
+ - Bug: 500 errors in api are not logged by default gh-379 (@ali-graham)
8
+ - Bug: Exception generating notifications when state_duration is nil gh-372 (@jessereynolds)
9
+
3
10
  # 0.7.34 - 2013-11-20
4
11
  - Feature: update logoage (@jessereynolds)
5
12
  - Bug: flapjack-nagios-receiver to start daemonized from init script (@jessereynolds)
data/Gemfile CHANGED
@@ -11,6 +11,7 @@ group :test do
11
11
  gem 'cucumber'
12
12
  gem 'delorean'
13
13
  gem 'rack-test'
14
+ gem 'async_rack_test'
14
15
  gem 'resque_spec'
15
16
  gem 'webmock'
16
17
  gem 'guard'
data/bin/flapper CHANGED
@@ -25,13 +25,14 @@ ensure
25
25
  Socket.do_not_reverse_lookup = orig
26
26
  end
27
27
 
28
- def main(bind_ip)
29
- start_every = 120
30
- stop_after = 60
31
- bind_port = 12345
28
+ def main(bind_ip, bind_port, frequency)
29
+ raise "bind_port must be an integer" unless bind_port.is_a?(Integer)
30
+ start_every = frequency
31
+ stop_after = frequency.to_f / 2
32
+
32
33
  EM.run {
33
34
 
34
- puts "#{Time.now}: starting server"
35
+ puts "#{Time.now}: starting server on #{bind_ip}:#{bind_port}"
35
36
  server_init = EM.start_server bind_ip, bind_port, Flapper
36
37
  EM.add_timer(stop_after) do
37
38
  puts "#{Time.now}: stopping server"
@@ -39,7 +40,7 @@ def main(bind_ip)
39
40
  end
40
41
 
41
42
  EM.add_periodic_timer(start_every) do
42
- puts "#{Time.now}: starting server"
43
+ puts "#{Time.now}: starting server on #{bind_ip}:#{bind_port}"
43
44
  server = EM.start_server bind_ip, bind_port, Flapper
44
45
  EM.add_timer(stop_after) do
45
46
  puts "#{Time.now}: stopping server"
@@ -82,12 +83,22 @@ OptionParser.new do |opts|
82
83
  options.bind_ip = b
83
84
  end
84
85
 
86
+ opts.on("-P", "--bind-port [PORT]", String, "PORT for flapper to bind to") do |p|
87
+ options.bind_port = p.to_i
88
+ end
89
+
90
+ opts.on("-f", "--frequency [SECONDS]", String, "oscillate with a frequency of SECONDS [120]") do |f|
91
+ options.frequency = f.to_f
92
+ end
93
+
85
94
  end.parse!(ARGV)
86
95
 
87
- pidfile = options.pidfile || "/var/run/flapjack/#{exe}.pid"
88
- logfile = options.log_path || "/var/log/flapjack/#{exe}.log"
89
96
  daemonize = options.daemonize.nil? ? true : options.daemonize
90
- bind_ip = options.bind_ip || local_ip
97
+ pidfile = options.pidfile || "/var/run/flapjack/#{exe}.pid"
98
+ logfile = options.log_path || "/var/log/flapjack/#{exe}.log"
99
+ bind_ip = options.bind_ip || local_ip
100
+ bind_port = options.bind_port || 12345
101
+ frequency = options.frequency || 120.0
91
102
 
92
103
  runner = Dante::Runner.new(exe, :pid_path => pidfile, :log_path => logfile)
93
104
 
@@ -99,7 +110,7 @@ when "start"
99
110
  else
100
111
  print "#{exe} starting..."
101
112
  runner.execute(:daemonize => daemonize) {
102
- main(bind_ip)
113
+ main(bind_ip, bind_port, frequency)
103
114
  }
104
115
  puts " done."
105
116
  end
@@ -117,7 +128,7 @@ when "stop"
117
128
  when "restart"
118
129
  print "#{exe} restarting..."
119
130
  runner.execute(:daemonize => true, :restart => true) {
120
- main(bind_ip)
131
+ main(bind_ip, bind_port, frequency)
121
132
  }
122
133
  puts " done."
123
134
 
@@ -84,8 +84,10 @@ module Flapjack
84
84
  raise "state #{@state.inspect} is invalid" unless
85
85
  allowed_states.include?(@state)
86
86
 
87
- raise "state_duration #{@state_duration.inspect} is invalid" unless
88
- @state_duration && @state_duration.is_a?(Integer) && @state_duration >= 0
87
+ if @state_duration
88
+ raise "state_duration (#{@state_duration.inspect}) is invalid" unless
89
+ @state_duration.is_a?(Integer) && @state_duration >= 0
90
+ end
89
91
 
90
92
  if @rollup_alerts
91
93
  raise "rollup_alerts should be nil or a hash" unless @rollup_alerts.is_a?(Hash)
@@ -116,6 +116,16 @@ module Flapjack
116
116
  Flapjack::Data::EntityCheck.find_all_failing_by_entity(:redis => redis).keys
117
117
  end
118
118
 
119
+ def self.find_all_current(options)
120
+ raise "Redis connection not set" unless redis = options[:redis]
121
+ redis.zrange('current_entities', 0, -1)
122
+ end
123
+
124
+ def self.find_all_current_with_last_update(options)
125
+ raise "Redis connection not set" unless redis = options[:redis]
126
+ redis.zrange('current_entities', 0, -1, {:withscores => true})
127
+ end
128
+
119
129
  def contacts
120
130
  contact_ids = @redis.smembers("contacts_for:#{id}")
121
131
 
@@ -130,6 +130,80 @@ module Flapjack
130
130
  redis.hget("check:#{event_id}", 'state')
131
131
  end
132
132
 
133
+ # takes an array of ages (in seconds) to split all checks up by
134
+ # - age means how long since the last update
135
+ # - 0 age is implied if not explicitly passed
136
+ # returns arrays of all current checks hashed by age range lower bound, eg:
137
+ #
138
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts) =>
139
+ # { 0 => [ 'foo-app-01:SSH' ],
140
+ # 60 => [ 'foo-app-01:Ping', 'foo-app-01:Disk / Utilisation' ],
141
+ # 300 => [] }
142
+ #
143
+ # you can also set :counts to true in options and you'll just get the counts, eg:
144
+ #
145
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts.merge(:counts => true)) =>
146
+ # { 0 => 1,
147
+ # 60 => 2,
148
+ # 300 => 0 }
149
+ #
150
+ # and you can get the last update time with each check too by passing :with_times => true eg:
151
+ #
152
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts.merge(:with_times => true)) =>
153
+ # { 0 => [ ['foo-app-01:SSH', 1382329923.0] ],
154
+ # 60 => [ ['foo-app-01:Ping', 1382329922.0], ['foo-app-01:Disk / Utilisation', 1382329921.0] ],
155
+ # 300 => [] }
156
+ #
157
+ def self.find_all_split_by_freshness(ages, options)
158
+ raise "Redis connection not set" unless redis = options[:redis]
159
+
160
+ raise "ages does not respond_to? :each and :each_with_index" unless ages.respond_to?(:each) && ages.respond_to?(:each_with_index)
161
+ raise "age values must respond_to? :to_i" unless ages.all? {|age| age.respond_to?(:to_i) }
162
+
163
+ ages << 0
164
+ ages = ages.sort.uniq
165
+
166
+ start_time = Time.now
167
+
168
+ checks = []
169
+ # get all the current checks, with last update time
170
+ Flapjack::Data::Entity.find_all_current(:redis => redis).each do |entity|
171
+ redis.zrange("current_checks:#{entity}", 0, -1, {:withscores => true}).each do |check|
172
+ check[0] = "#{entity}:#{check[0]}"
173
+ checks << check
174
+ end
175
+ end
176
+
177
+ skeleton = ages.inject({}) {|memo, age| memo[age] = [] ; memo }
178
+ age_ranges = ages.reverse.each_cons(2)
179
+ results_with_times = checks.inject(skeleton) do |memo, check|
180
+ check_age = start_time.to_i - check[1]
181
+ check_age = 0 unless check_age > 0
182
+ if check_age >= ages.last
183
+ memo[ages.last] << check
184
+ else
185
+ age_range = age_ranges.detect {|a, b| check_age < a && check_age >= b }
186
+ memo[age_range.last] << check unless age_range.nil?
187
+ end
188
+ memo
189
+ end
190
+
191
+ case
192
+ when options[:with_times]
193
+ results_with_times
194
+ when options[:counts]
195
+ results_with_times.inject({}) do |memo, (age, checks)|
196
+ memo[age] = checks.length
197
+ memo
198
+ end
199
+ else
200
+ results_with_times.inject({}) do |memo, (age, checks)|
201
+ memo[age] = checks.map { |check| check[0] }
202
+ memo
203
+ end
204
+ end
205
+ end
206
+
133
207
  def entity_name
134
208
  entity.name
135
209
  end
@@ -29,16 +29,42 @@ module Flapjack
29
29
 
30
30
  set :show_exceptions, false
31
31
 
32
- #rescue_exception = Proc.new { |env, exception|
33
- # @logger.error exception.message
34
- # @logger.error exception.backtrace.join("\n")
35
- # [503, {}, {:errors => [exception.message]}.to_json]
36
- #}
37
- #use Rack::FiberPool, :size => 25, :rescue_exception => rescue_exception
38
- #
39
- # FIXME: not sure why the above isn't working, had to add a general
40
- # error handler later in this file
41
- use Rack::FiberPool, :size => 25
32
+ rescue_exception = Proc.new { |env, exception|
33
+
34
+ error = proc {|status, exception, *msg|
35
+ if !msg || msg.empty?
36
+ trace = exception.backtrace.join("\n")
37
+ msg = "#{exception.class} - #{exception.message}"
38
+ msg_str = "#{msg}\n#{trace}"
39
+ else
40
+ msg_str = msg.join(", ")
41
+ end
42
+ logger = Flapjack::Gateways::API.instance_variable_get('@logger')
43
+ case
44
+ when status < 500
45
+ logger.warn "Error: #{msg_str}"
46
+ else
47
+ logger.error "Error: #{msg_str}"
48
+ end
49
+ [status, {}, {:errors => msg}.to_json]
50
+ }
51
+
52
+ e = env['sinatra.error']
53
+
54
+ case exception
55
+ when Flapjack::Gateways::API::ContactNotFound
56
+ error.call(403, e, "could not find contact '#{e.contact_id}'")
57
+ when Flapjack::Gateways::API::NotificationRuleNotFound
58
+ error.call(403, e, "could not find notification rule '#{e.rule_id}'")
59
+ when Flapjack::Gateways::API::EntityNotFound
60
+ error.call(403, e, "could not find entity '#{e.entity}'")
61
+ when Flapjack::Gateways::API::EntityCheckNotFound
62
+ error.call(403, e, "could not find entity check '#{e.check}'")
63
+ else
64
+ error.call(500, exception)
65
+ end
66
+ }
67
+ use Rack::FiberPool, :size => 25, :rescue_exception => rescue_exception
42
68
 
43
69
  use Rack::MethodOverride
44
70
  use Rack::JsonParamsParser
@@ -65,11 +91,16 @@ module Flapjack
65
91
  end
66
92
 
67
93
  before do
68
- input = env['rack.input'].read
69
- input_short = input.gsub(/\n/, '').gsub(/\s+/, ' ')
70
- logger.info("#{request.request_method} #{request.path_info}#{request.query_string} #{input_short[0..80]}")
71
- logger.debug("#{request.request_method} #{request.path_info}#{request.query_string} #{input}")
72
- env['rack.input'].rewind
94
+ input = nil
95
+ if logger.debug?
96
+ input = env['rack.input'].read
97
+ logger.debug("#{request.request_method} #{request.path_info}#{request.query_string} #{input}")
98
+ elsif logger.info?
99
+ input = env['rack.input'].read
100
+ input_short = input.gsub(/\n/, '').gsub(/\s+/, ' ')
101
+ logger.info("#{request.request_method} #{request.path_info}#{request.query_string} #{input_short[0..80]}")
102
+ end
103
+ env['rack.input'].rewind unless input.nil?
73
104
  end
74
105
 
75
106
  after do
@@ -81,35 +112,9 @@ module Flapjack
81
112
  register Flapjack::Gateways::API::ContactMethods
82
113
 
83
114
  not_found do
84
- logger.debug("in not_found :-(")
85
115
  err(404, "not routable")
86
116
  end
87
117
 
88
- error Flapjack::Gateways::API::ContactNotFound do
89
- e = env['sinatra.error']
90
- err(403, "could not find contact '#{e.contact_id}'")
91
- end
92
-
93
- error Flapjack::Gateways::API::NotificationRuleNotFound do
94
- e = env['sinatra.error']
95
- err(403, "could not find notification rule '#{e.rule_id}'")
96
- end
97
-
98
- error Flapjack::Gateways::API::EntityNotFound do
99
- e = env['sinatra.error']
100
- err(403, "could not find entity '#{e.entity}'")
101
- end
102
-
103
- error Flapjack::Gateways::API::EntityCheckNotFound do
104
- e = env['sinatra.error']
105
- err(403, "could not find entity check '#{e.check}'")
106
- end
107
-
108
- error do
109
- e = env['sinatra.error']
110
- err(response.status, "#{e.class} - #{e.message}")
111
- end
112
-
113
118
  private
114
119
 
115
120
  def err(status, *msg)
@@ -117,7 +122,6 @@ module Flapjack
117
122
  logger.info "Error: #{msg_str}"
118
123
  [status, {}, {:errors => msg}.to_json]
119
124
  end
120
-
121
125
  end
122
126
 
123
127
  end
@@ -132,11 +132,11 @@ module Flapjack
132
132
  entity_stats
133
133
  check_stats
134
134
  {
135
- 'events_queued' => @events_queued,
136
- 'all_entities' => @count_all_entities,
137
- 'failing_entities' => @count_failing_entities,
138
- 'all_checks' => @count_all_checks,
139
- 'failing_checks' => @count_failing_checks,
135
+ 'events_queued' => @events_queued,
136
+ 'all_entities' => @count_all_entities,
137
+ 'failing_entities' => @count_failing_entities,
138
+ 'all_checks' => @count_all_checks,
139
+ 'failing_checks' => @count_failing_checks,
140
140
  'processed_events' => {
141
141
  'all_time' => {
142
142
  'total' => @event_counters['all'].to_i,
@@ -145,10 +145,11 @@ module Flapjack
145
145
  'action' => @event_counters['action'].to_i,
146
146
  }
147
147
  },
148
- 'total_keys' => @dbsize,
149
- 'uptime' => @uptime_string,
150
- 'boottime' => @boot_time,
151
- 'current_time' => Time.now,
148
+ 'check_freshness' => @current_checks_ages,
149
+ 'total_keys' => @dbsize,
150
+ 'uptime' => @uptime_string,
151
+ 'boottime' => @boot_time,
152
+ 'current_time' => Time.now,
152
153
  'executive_instances' => @executive_instances,
153
154
  }.to_json
154
155
  end
@@ -283,7 +284,6 @@ module Flapjack
283
284
  end
284
285
 
285
286
  get '/contacts' do
286
- #self_stats
287
287
  @contacts = Flapjack::Data::Contact.all(:redis => redis)
288
288
 
289
289
  erb 'contacts.html'.to_sym
@@ -394,6 +394,7 @@ module Flapjack
394
394
  end
395
395
  @event_counters = redis.hgetall('event_counters')
396
396
  @events_queued = redis.llen('events')
397
+ @current_checks_ages = Flapjack::Data::EntityCheck.find_all_split_by_freshness([0, 60, 300, 900, 3600], {:redis => redis, :counts => true } )
397
398
  end
398
399
 
399
400
  def entity_stats
@@ -106,32 +106,6 @@
106
106
  </table>
107
107
  <% end %>
108
108
 
109
- <h3>All Entities and Checks</h3>
110
- <% if !@entities_and_checks || @entities_and_checks.empty? %>
111
- <p>No entities</p>
112
- <% else %>
113
- <table class="table table-bordered table-hover table-condensed">
114
- <tr>
115
- <th>Entity</th>
116
- <th>Checks</th>
117
- </tr>
118
- <% @entities_and_checks.each do |ec| %>
119
- <%
120
- entity = ec[:entity]
121
- checks = ec[:checks]
122
- %>
123
- <tr>
124
- <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
125
- <td>
126
- <% checks.each do |check| %>
127
- <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
128
- <% end %>
129
- </td>
130
- </tr>
131
- <% end %>
132
- </table>
133
- <% end %>
134
-
135
109
  <h3>Notification Rules</h3>
136
110
  <% rules = @contact.notification_rules %>
137
111
  <% if !rules || rules.empty? %>
@@ -161,7 +135,33 @@
161
135
  <td><%= h(blackholes.join(', ')) %></td>
162
136
  </tr>
163
137
  <% end %>
164
- </table>
138
+ </table>
139
+ <% end %>
140
+
141
+ <h3>All Entities and Checks</h3>
142
+ <% if !@entities_and_checks || @entities_and_checks.empty? %>
143
+ <p>No entities</p>
144
+ <% else %>
145
+ <table class="table table-bordered table-hover table-condensed">
146
+ <tr>
147
+ <th>Entity</th>
148
+ <th>Checks</th>
149
+ </tr>
150
+ <% @entities_and_checks.each do |ec| %>
151
+ <%
152
+ entity = ec[:entity]
153
+ checks = ec[:checks]
154
+ %>
155
+ <tr>
156
+ <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
157
+ <td>
158
+ <% checks.each do |check| %>
159
+ <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
160
+ <% end %>
161
+ </td>
162
+ </tr>
163
+ <% end %>
164
+ </table>
165
165
  <% end %>
166
166
 
167
167
  </div>
@@ -50,6 +50,16 @@
50
50
  </ul>
51
51
  </td>
52
52
  </tr>
53
+ <tr>
54
+ <td>Check Freshness</td>
55
+ <td>
56
+ <ul>
57
+ <% @current_checks_ages.each_pair do |age, check_count| %>
58
+ <li>&gt;= <%= age %>: <%= check_count %></li>
59
+ <% end %>
60
+ </ul>
61
+ </td>
62
+ </tr>
53
63
  <tr>
54
64
  <td>Total keys in redis</td>
55
65
  <td><%= h @dbsize %></td>