flapjack 0.7.34 → 0.7.35

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## Flapjack Changelog
2
2
 
3
+ # 0.7.35 - 2013-12-10
4
+ - Feature: allow flapper to flap with an arbitrary interval gh-383 (@jessereynolds)
5
+ - Feature: Expose statistics for currency of all checks gh-386 (@jessereynolds)
6
+ - Chore: WebUI: contacts page - move notification rules section higher gh-376 (@jessereynolds)
7
+ - Bug: 500 errors in api are not logged by default gh-379 (@ali-graham)
8
+ - Bug: Exception generating notifications when state_duration is nil gh-372 (@jessereynolds)
9
+
3
10
  # 0.7.34 - 2013-11-20
4
11
  - Feature: update logoage (@jessereynolds)
5
12
  - Bug: flapjack-nagios-receiver to start daemonized from init script (@jessereynolds)
data/Gemfile CHANGED
@@ -11,6 +11,7 @@ group :test do
11
11
  gem 'cucumber'
12
12
  gem 'delorean'
13
13
  gem 'rack-test'
14
+ gem 'async_rack_test'
14
15
  gem 'resque_spec'
15
16
  gem 'webmock'
16
17
  gem 'guard'
data/bin/flapper CHANGED
@@ -25,13 +25,14 @@ ensure
25
25
  Socket.do_not_reverse_lookup = orig
26
26
  end
27
27
 
28
- def main(bind_ip)
29
- start_every = 120
30
- stop_after = 60
31
- bind_port = 12345
28
+ def main(bind_ip, bind_port, frequency)
29
+ raise "bind_port must be an integer" unless bind_port.is_a?(Integer)
30
+ start_every = frequency
31
+ stop_after = frequency.to_f / 2
32
+
32
33
  EM.run {
33
34
 
34
- puts "#{Time.now}: starting server"
35
+ puts "#{Time.now}: starting server on #{bind_ip}:#{bind_port}"
35
36
  server_init = EM.start_server bind_ip, bind_port, Flapper
36
37
  EM.add_timer(stop_after) do
37
38
  puts "#{Time.now}: stopping server"
@@ -39,7 +40,7 @@ def main(bind_ip)
39
40
  end
40
41
 
41
42
  EM.add_periodic_timer(start_every) do
42
- puts "#{Time.now}: starting server"
43
+ puts "#{Time.now}: starting server on #{bind_ip}:#{bind_port}"
43
44
  server = EM.start_server bind_ip, bind_port, Flapper
44
45
  EM.add_timer(stop_after) do
45
46
  puts "#{Time.now}: stopping server"
@@ -82,12 +83,22 @@ OptionParser.new do |opts|
82
83
  options.bind_ip = b
83
84
  end
84
85
 
86
+ opts.on("-P", "--bind-port [PORT]", String, "PORT for flapper to bind to") do |p|
87
+ options.bind_port = p.to_i
88
+ end
89
+
90
+ opts.on("-f", "--frequency [SECONDS]", String, "oscillate with a frequency of SECONDS [120]") do |f|
91
+ options.frequency = f.to_f
92
+ end
93
+
85
94
  end.parse!(ARGV)
86
95
 
87
- pidfile = options.pidfile || "/var/run/flapjack/#{exe}.pid"
88
- logfile = options.log_path || "/var/log/flapjack/#{exe}.log"
89
96
  daemonize = options.daemonize.nil? ? true : options.daemonize
90
- bind_ip = options.bind_ip || local_ip
97
+ pidfile = options.pidfile || "/var/run/flapjack/#{exe}.pid"
98
+ logfile = options.log_path || "/var/log/flapjack/#{exe}.log"
99
+ bind_ip = options.bind_ip || local_ip
100
+ bind_port = options.bind_port || 12345
101
+ frequency = options.frequency || 120.0
91
102
 
92
103
  runner = Dante::Runner.new(exe, :pid_path => pidfile, :log_path => logfile)
93
104
 
@@ -99,7 +110,7 @@ when "start"
99
110
  else
100
111
  print "#{exe} starting..."
101
112
  runner.execute(:daemonize => daemonize) {
102
- main(bind_ip)
113
+ main(bind_ip, bind_port, frequency)
103
114
  }
104
115
  puts " done."
105
116
  end
@@ -117,7 +128,7 @@ when "stop"
117
128
  when "restart"
118
129
  print "#{exe} restarting..."
119
130
  runner.execute(:daemonize => true, :restart => true) {
120
- main(bind_ip)
131
+ main(bind_ip, bind_port, frequency)
121
132
  }
122
133
  puts " done."
123
134
 
@@ -84,8 +84,10 @@ module Flapjack
84
84
  raise "state #{@state.inspect} is invalid" unless
85
85
  allowed_states.include?(@state)
86
86
 
87
- raise "state_duration #{@state_duration.inspect} is invalid" unless
88
- @state_duration && @state_duration.is_a?(Integer) && @state_duration >= 0
87
+ if @state_duration
88
+ raise "state_duration (#{@state_duration.inspect}) is invalid" unless
89
+ @state_duration.is_a?(Integer) && @state_duration >= 0
90
+ end
89
91
 
90
92
  if @rollup_alerts
91
93
  raise "rollup_alerts should be nil or a hash" unless @rollup_alerts.is_a?(Hash)
@@ -116,6 +116,16 @@ module Flapjack
116
116
  Flapjack::Data::EntityCheck.find_all_failing_by_entity(:redis => redis).keys
117
117
  end
118
118
 
119
+ def self.find_all_current(options)
120
+ raise "Redis connection not set" unless redis = options[:redis]
121
+ redis.zrange('current_entities', 0, -1)
122
+ end
123
+
124
+ def self.find_all_current_with_last_update(options)
125
+ raise "Redis connection not set" unless redis = options[:redis]
126
+ redis.zrange('current_entities', 0, -1, {:withscores => true})
127
+ end
128
+
119
129
  def contacts
120
130
  contact_ids = @redis.smembers("contacts_for:#{id}")
121
131
 
@@ -130,6 +130,80 @@ module Flapjack
130
130
  redis.hget("check:#{event_id}", 'state')
131
131
  end
132
132
 
133
+ # takes an array of ages (in seconds) to split all checks up by
134
+ # - age means how long since the last update
135
+ # - 0 age is implied if not explicitly passed
136
+ # returns arrays of all current checks hashed by age range upper bound, eg:
137
+ #
138
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts) =>
139
+ # { 0 => [ 'foo-app-01:SSH' ],
140
+ # 60 => [ 'foo-app-01:Ping', 'foo-app-01:Disk / Utilisation' ],
141
+ # 300 => [] }
142
+ #
143
+ # you can also set :counts to true in options and you'll just get the counts, eg:
144
+ #
145
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts.merge(:counts => true)) =>
146
+ # { 0 => 1,
147
+ # 60 => 3,
148
+ # 300 => 0 }
149
+ #
150
+ # and you can get the last update time with each check too by passing :with_times => true eg:
151
+ #
152
+ # EntityCheck.find_all_split_by_freshness([60, 300], opts.merge(:with_times => true)) =>
153
+ # { 0 => [ ['foo-app-01:SSH', 1382329923.0] ],
154
+ # 60 => [ ['foo-app-01:Ping', 1382329922.0], ['foo-app-01:Disk / Utilisation', 1382329921.0] ],
155
+ # 300 => [] }
156
+ #
157
+ def self.find_all_split_by_freshness(ages, options)
158
+ raise "Redis connection not set" unless redis = options[:redis]
159
+
160
+ raise "ages does not respond_to? :each and :each_with_index" unless ages.respond_to?(:each) && ages.respond_to?(:each_with_index)
161
+ raise "age values must respond_to? :to_i" unless ages.all? {|age| age.respond_to?(:to_i) }
162
+
163
+ ages << 0
164
+ ages = ages.sort.uniq
165
+
166
+ start_time = Time.now
167
+
168
+ checks = []
169
+ # get all the current checks, with last update time
170
+ Flapjack::Data::Entity.find_all_current(:redis => redis).each do |entity|
171
+ redis.zrange("current_checks:#{entity}", 0, -1, {:withscores => true}).each do |check|
172
+ check[0] = "#{entity}:#{check[0]}"
173
+ checks << check
174
+ end
175
+ end
176
+
177
+ skeleton = ages.inject({}) {|memo, age| memo[age] = [] ; memo }
178
+ age_ranges = ages.reverse.each_cons(2)
179
+ results_with_times = checks.inject(skeleton) do |memo, check|
180
+ check_age = start_time.to_i - check[1]
181
+ check_age = 0 unless check_age > 0
182
+ if check_age >= ages.last
183
+ memo[ages.last] << check
184
+ else
185
+ age_range = age_ranges.detect {|a, b| check_age < a && check_age >= b }
186
+ memo[age_range.last] << check unless age_range.nil?
187
+ end
188
+ memo
189
+ end
190
+
191
+ case
192
+ when options[:with_times]
193
+ results_with_times
194
+ when options[:counts]
195
+ results_with_times.inject({}) do |memo, (age, checks)|
196
+ memo[age] = checks.length
197
+ memo
198
+ end
199
+ else
200
+ results_with_times.inject({}) do |memo, (age, checks)|
201
+ memo[age] = checks.map { |check| check[0] }
202
+ memo
203
+ end
204
+ end
205
+ end
206
+
133
207
  def entity_name
134
208
  entity.name
135
209
  end
@@ -29,16 +29,42 @@ module Flapjack
29
29
 
30
30
  set :show_exceptions, false
31
31
 
32
- #rescue_exception = Proc.new { |env, exception|
33
- # @logger.error exception.message
34
- # @logger.error exception.backtrace.join("\n")
35
- # [503, {}, {:errors => [exception.message]}.to_json]
36
- #}
37
- #use Rack::FiberPool, :size => 25, :rescue_exception => rescue_exception
38
- #
39
- # FIXME: not sure why the above isn't working, had to add a general
40
- # error handler later in this file
41
- use Rack::FiberPool, :size => 25
32
+ rescue_exception = Proc.new { |env, exception|
33
+
34
+ error = proc {|status, exception, *msg|
35
+ if !msg || msg.empty?
36
+ trace = exception.backtrace.join("\n")
37
+ msg = "#{exception.class} - #{exception.message}"
38
+ msg_str = "#{msg}\n#{trace}"
39
+ else
40
+ msg_str = msg.join(", ")
41
+ end
42
+ logger = Flapjack::Gateways::API.instance_variable_get('@logger')
43
+ case
44
+ when status < 500
45
+ logger.warn "Error: #{msg_str}"
46
+ else
47
+ logger.error "Error: #{msg_str}"
48
+ end
49
+ [status, {}, {:errors => msg}.to_json]
50
+ }
51
+
52
+ e = env['sinatra.error']
53
+
54
+ case exception
55
+ when Flapjack::Gateways::API::ContactNotFound
56
+ error.call(403, e, "could not find contact '#{e.contact_id}'")
57
+ when Flapjack::Gateways::API::NotificationRuleNotFound
58
+ error.call(403, e, "could not find notification rule '#{e.rule_id}'")
59
+ when Flapjack::Gateways::API::EntityNotFound
60
+ error.call(403, e, "could not find entity '#{e.entity}'")
61
+ when Flapjack::Gateways::API::EntityCheckNotFound
62
+ error.call(403, e, "could not find entity check '#{e.check}'")
63
+ else
64
+ error.call(500, exception)
65
+ end
66
+ }
67
+ use Rack::FiberPool, :size => 25, :rescue_exception => rescue_exception
42
68
 
43
69
  use Rack::MethodOverride
44
70
  use Rack::JsonParamsParser
@@ -65,11 +91,16 @@ module Flapjack
65
91
  end
66
92
 
67
93
  before do
68
- input = env['rack.input'].read
69
- input_short = input.gsub(/\n/, '').gsub(/\s+/, ' ')
70
- logger.info("#{request.request_method} #{request.path_info}#{request.query_string} #{input_short[0..80]}")
71
- logger.debug("#{request.request_method} #{request.path_info}#{request.query_string} #{input}")
72
- env['rack.input'].rewind
94
+ input = nil
95
+ if logger.debug?
96
+ input = env['rack.input'].read
97
+ logger.debug("#{request.request_method} #{request.path_info}#{request.query_string} #{input}")
98
+ elsif logger.info?
99
+ input = env['rack.input'].read
100
+ input_short = input.gsub(/\n/, '').gsub(/\s+/, ' ')
101
+ logger.info("#{request.request_method} #{request.path_info}#{request.query_string} #{input_short[0..80]}")
102
+ end
103
+ env['rack.input'].rewind unless input.nil?
73
104
  end
74
105
 
75
106
  after do
@@ -81,35 +112,9 @@ module Flapjack
81
112
  register Flapjack::Gateways::API::ContactMethods
82
113
 
83
114
  not_found do
84
- logger.debug("in not_found :-(")
85
115
  err(404, "not routable")
86
116
  end
87
117
 
88
- error Flapjack::Gateways::API::ContactNotFound do
89
- e = env['sinatra.error']
90
- err(403, "could not find contact '#{e.contact_id}'")
91
- end
92
-
93
- error Flapjack::Gateways::API::NotificationRuleNotFound do
94
- e = env['sinatra.error']
95
- err(403, "could not find notification rule '#{e.rule_id}'")
96
- end
97
-
98
- error Flapjack::Gateways::API::EntityNotFound do
99
- e = env['sinatra.error']
100
- err(403, "could not find entity '#{e.entity}'")
101
- end
102
-
103
- error Flapjack::Gateways::API::EntityCheckNotFound do
104
- e = env['sinatra.error']
105
- err(403, "could not find entity check '#{e.check}'")
106
- end
107
-
108
- error do
109
- e = env['sinatra.error']
110
- err(response.status, "#{e.class} - #{e.message}")
111
- end
112
-
113
118
  private
114
119
 
115
120
  def err(status, *msg)
@@ -117,7 +122,6 @@ module Flapjack
117
122
  logger.info "Error: #{msg_str}"
118
123
  [status, {}, {:errors => msg}.to_json]
119
124
  end
120
-
121
125
  end
122
126
 
123
127
  end
@@ -132,11 +132,11 @@ module Flapjack
132
132
  entity_stats
133
133
  check_stats
134
134
  {
135
- 'events_queued' => @events_queued,
136
- 'all_entities' => @count_all_entities,
137
- 'failing_entities' => @count_failing_entities,
138
- 'all_checks' => @count_all_checks,
139
- 'failing_checks' => @count_failing_checks,
135
+ 'events_queued' => @events_queued,
136
+ 'all_entities' => @count_all_entities,
137
+ 'failing_entities' => @count_failing_entities,
138
+ 'all_checks' => @count_all_checks,
139
+ 'failing_checks' => @count_failing_checks,
140
140
  'processed_events' => {
141
141
  'all_time' => {
142
142
  'total' => @event_counters['all'].to_i,
@@ -145,10 +145,11 @@ module Flapjack
145
145
  'action' => @event_counters['action'].to_i,
146
146
  }
147
147
  },
148
- 'total_keys' => @dbsize,
149
- 'uptime' => @uptime_string,
150
- 'boottime' => @boot_time,
151
- 'current_time' => Time.now,
148
+ 'check_freshness' => @current_checks_ages,
149
+ 'total_keys' => @dbsize,
150
+ 'uptime' => @uptime_string,
151
+ 'boottime' => @boot_time,
152
+ 'current_time' => Time.now,
152
153
  'executive_instances' => @executive_instances,
153
154
  }.to_json
154
155
  end
@@ -283,7 +284,6 @@ module Flapjack
283
284
  end
284
285
 
285
286
  get '/contacts' do
286
- #self_stats
287
287
  @contacts = Flapjack::Data::Contact.all(:redis => redis)
288
288
 
289
289
  erb 'contacts.html'.to_sym
@@ -394,6 +394,7 @@ module Flapjack
394
394
  end
395
395
  @event_counters = redis.hgetall('event_counters')
396
396
  @events_queued = redis.llen('events')
397
+ @current_checks_ages = Flapjack::Data::EntityCheck.find_all_split_by_freshness([0, 60, 300, 900, 3600], {:redis => redis, :counts => true } )
397
398
  end
398
399
 
399
400
  def entity_stats
@@ -106,32 +106,6 @@
106
106
  </table>
107
107
  <% end %>
108
108
 
109
- <h3>All Entities and Checks</h3>
110
- <% if !@entities_and_checks || @entities_and_checks.empty? %>
111
- <p>No entities</p>
112
- <% else %>
113
- <table class="table table-bordered table-hover table-condensed">
114
- <tr>
115
- <th>Entity</th>
116
- <th>Checks</th>
117
- </tr>
118
- <% @entities_and_checks.each do |ec| %>
119
- <%
120
- entity = ec[:entity]
121
- checks = ec[:checks]
122
- %>
123
- <tr>
124
- <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
125
- <td>
126
- <% checks.each do |check| %>
127
- <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
128
- <% end %>
129
- </td>
130
- </tr>
131
- <% end %>
132
- </table>
133
- <% end %>
134
-
135
109
  <h3>Notification Rules</h3>
136
110
  <% rules = @contact.notification_rules %>
137
111
  <% if !rules || rules.empty? %>
@@ -161,7 +135,33 @@
161
135
  <td><%= h(blackholes.join(', ')) %></td>
162
136
  </tr>
163
137
  <% end %>
164
- </table>
138
+ </table>
139
+ <% end %>
140
+
141
+ <h3>All Entities and Checks</h3>
142
+ <% if !@entities_and_checks || @entities_and_checks.empty? %>
143
+ <p>No entities</p>
144
+ <% else %>
145
+ <table class="table table-bordered table-hover table-condensed">
146
+ <tr>
147
+ <th>Entity</th>
148
+ <th>Checks</th>
149
+ </tr>
150
+ <% @entities_and_checks.each do |ec| %>
151
+ <%
152
+ entity = ec[:entity]
153
+ checks = ec[:checks]
154
+ %>
155
+ <tr>
156
+ <td><a href="/entity/<%= u(entity.name) %>" title="entity status"><%= h entity.name %></a></td>
157
+ <td>
158
+ <% checks.each do |check| %>
159
+ <%= "<a href=\"/check?entity=#{u(entity.name)}&amp;check=#{u(check)}\" title=\"check status\">#{ h check }</a>" %>
160
+ <% end %>
161
+ </td>
162
+ </tr>
163
+ <% end %>
164
+ </table>
165
165
  <% end %>
166
166
 
167
167
  </div>
@@ -50,6 +50,16 @@
50
50
  </ul>
51
51
  </td>
52
52
  </tr>
53
+ <tr>
54
+ <td>Check Freshness</td>
55
+ <td>
56
+ <ul>
57
+ <% @current_checks_ages.each_pair do |age, check_count| %>
58
+ <li>&gt;= <%= age %>: <%= check_count %></li>
59
+ <% end %>
60
+ </ul>
61
+ </td>
62
+ </tr>
53
63
  <tr>
54
64
  <td>Total keys in redis</td>
55
65
  <td><%= h @dbsize %></td>