sensu-plugins-graphite 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,145 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-stats
4
+ #
5
+ # DESCRIPTION:
6
+ # Checks metrics in graphite, averaged over a period of time.
7
+ #
8
+ # The fired sensu event will only be critical if a stat is
9
+ # above the critical threshold. Otherwise, the event will be warning,
10
+ # if a stat is above the warning threshold.
11
+ #
12
+ # Multiple stats will be checked if a * wildcard is used
13
+ # in the "target" query.
14
+ #
15
+ # OUTPUT:
16
+ # plain text
17
+ #
18
+ # PLATFORMS:
19
+ # Linux
20
+ #
21
+ # DEPENDENCIES:
22
+ # gem: sensu-plugin
23
+ # gem: <?>
24
+ #
25
+ # USAGE:
26
+ # example commands
27
+ #
28
+ # NOTES:
29
+ #
30
+ # LICENSE:
31
+ # Alan Smith (alan@asmith.me)
32
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
33
+ # for details.
34
+ #
35
+
36
+ require 'json'
37
+ require 'net/http'
38
+ require 'sensu-plugin/check/cli'
39
+
40
+ class CheckGraphiteStat < Sensu::Plugin::Check::CLI
41
# Command-line options, declared through the `option` DSL of the parent
# check class. Parsed values are read back through `config[...]`.

# Graphite host; interpolated verbatim into the render URL built by #run.
option :host,
       short: '-h HOST',
       long: '--host HOST',
       description: 'graphite hostname',
       proc: proc(&:to_s),
       default: 'graphite'

# Look-back window, passed to Graphite as the `from` parameter.
option :period,
       short: '-p PERIOD',
       long: '--period PERIOD',
       description: 'The period back in time to extract from Graphite. Use -24hours, -2days, -15mins, etc, same format as in Graphite',
       proc: proc(&:to_s),
       required: true

# Metric query, passed to Graphite as the `target` parameter.
option :target,
       short: '-t TARGET',
       long: '--target TARGET',
       description: 'The graphite metric name. Can include * to query multiple metrics',
       proc: proc(&:to_s),
       required: true

# Average above which a series is reported as WARNING (optional).
option :warn,
       short: '-w WARN',
       long: '--warn WARN',
       description: 'Warning level',
       proc: proc(&:to_f),
       required: false

# Average above which a series is reported as CRITICAL (optional).
# The placeholder is upper-cased to match every other option.
option :crit,
       short: '-c CRIT',
       long: '--crit CRIT',
       description: 'Critical level',
       proc: proc(&:to_f),
       required: false

# When set, series without datapoints are treated as OK instead of UNKNOWN.
option :unknown_ignore,
       short: '-u',
       long: '--unknown-ignore',
       description: "Do nothing for UNKNOWN status (when you wildcard-match a ton of metrics at once and you don't care for a few missing data)",
       boolean: true,
       default: false
82
+
83
# Arithmetic mean of a collection of numeric-like values.
#
# a - anything responding to #to_a; each element is coerced with #to_f.
#
# Returns a Float, or nil when the collection is empty (the previous
# implementation raised ZeroDivisionError on an empty Array and called
# #length on the raw argument even though it iterated over `a.to_a`).
def average(a)
  values = a.to_a
  return nil if values.empty?

  values.inject(0.0) { |total, i| total + i.to_f } / values.length
end
89
+
90
# Classifies one Graphite series against the configured thresholds.
#
# metric - Hash with a 'target' name and 'datapoints', a list of pairs whose
#          first element is the sampled value (nil values are dropped).
#
# Returns a two-element Array [status, message]:
#   [2, msg] when the average is above config[:crit]
#   [1, msg] when the average is above config[:warn]
#   [3, msg] when there are no usable datapoints and config[:unknown_ignore]
#            is not set
#   [0, nil] otherwise
def danger(metric)
  # Array() keeps a missing 'datapoints' key from raising NoMethodError.
  datapoints = Array(metric['datapoints']).map(&:first).compact

  if datapoints.empty?
    return [0, nil] if config[:unknown_ignore]
    return [3, "#{metric['target']} has no datapoints"]
  end

  avg = average(datapoints)

  if !config[:crit].nil? && avg > config[:crit]
    [2, "#{metric['target']} is #{avg}"]
  elsif !config[:warn].nil? && avg > config[:warn]
    [1, "#{metric['target']} is #{avg}"]
  else
    [0, nil]
  end
end
107
+
108
# Entry point of the check: queries Graphite's render endpoint, classifies
# every returned series with #danger and exits through the matching status
# helper (critical / warning / unknown / ok).
#
# Behaviour notes:
# * The query string is built with URI.encode_www_form so that targets
#   containing functions, spaces or other reserved characters produce a valid
#   URI instead of raising URI::InvalidURIError.
# * A request failure is reported as WARNING.
# * An unparsable, empty or non-array response is reported as UNKNOWN.
# * When series disagree, CRITICAL wins over UNKNOWN, which wins over WARNING.
#   (Previously UNKNOWN, being numerically largest, masked CRITICAL.)
def run
  query = URI.encode_www_form(format: 'json', target: config[:target], from: config[:period])

  body =
    begin
      Net::HTTP.get_response(URI("http://#{config[:host]}/render?#{query}")).body
    rescue StandardError => e
      warning "Failed to query graphite: #{e.inspect}"
    end

  data =
    begin
      JSON.parse(body)
    rescue StandardError
      # Falls through to the 'No data' UNKNOWN below.
      []
    end

  unknown 'No data from graphite' unless data.is_a?(Array) && !data.empty?

  # Status codes ordered from least to most severe:
  # OK (0), WARNING (1), UNKNOWN (3), CRITICAL (2).
  severity = [0, 1, 3, 2]
  status = 0
  message = ''

  data.each do |metric|
    s, msg = danger(metric)

    message += "#{msg} " unless s == 0
    status = s if severity.index(s) > severity.index(status)
  end

  case status
  when 2 then critical message
  when 1 then warning message
  when 3 then unknown message
  end
  ok
end
145
+ end
@@ -0,0 +1,530 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # <script name>
4
+ #
5
+ # DESCRIPTION:
6
+ # Get time series values from Graphite and create events based on values
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: json
17
+ # gem: socket
18
+ # gem: array_stats
19
+ # gem: net/http
20
+ #
21
+ # USAGE:
22
+ # #YELLOW
23
+ #
24
+ # NOTES:
25
+ #
26
+ # LICENSE:
27
+ # Copyright 2012 Ulf Mansson @ Recorded Future
28
+ # Modifications by Chris Jansen to support wildcard targets
29
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
30
+ # for details.
31
+ #
32
+
33
+ require 'sensu-plugin/check/cli'
34
+ require 'json'
35
+ require 'net/http'
36
+ require 'socket'
37
+ require 'array_stats'
38
+
39
+ class Graphite < Sensu::Plugin::Check::CLI
40
# Command-line options, declared through the `option` DSL of the parent
# check class. Parsed values are read back through `config[...]`.

# Graphite host; #graphite_url prepends "http://" unless the value already
# starts with "http".
option :host,
       short: '-h HOST',
       long: '--host HOST',
       description: 'Graphite host to connect to, include port',
       required: true

# Comma separated list of metric queries; #run checks each one in turn.
option :target,
       description: 'The graphite metric name. Could be a comma separated list of metric names.',
       short: '-t TARGET',
       long: '--target TARGET',
       required: true

option :period,
       description: 'The period back in time to extract from Graphite and compare with. Use 24hours,2days etc, same format as in Graphite',
       short: '-p PERIOD',
       long: '--period PERIOD',
       default: '2hours'

option :updated_since,
       description: 'The graphite value should have been updated within UPDATED_SINCE seconds, default to 600 seconds',
       short: '-u UPDATED_SINCE',
       long: '--updated_since UPDATED_SINCE',
       default: 600

# Short flag is -D: -d was declared both here and on :data_points, so the
# two options competed for the same switch.
option :acceptable_diff_percentage,
       description: 'The acceptable diff from max values in percentage, used in check_function_increasing',
       short: '-D ACCEPTABLE_DIFF_PERCENTAGE',
       long: '--acceptable_diff_percentage ACCEPTABLE_DIFF_PERCENTAGE',
       default: 0

# NOTE(review): the long flag says "decreasing" although the option enables
# the *increasing* check. It is left unchanged so existing command lines
# keep working.
option :check_function_increasing,
       description: 'Check that value is increasing or equal over time (use acceptable_diff_percentage if it should allow to be lower)',
       short: '-i',
       long: '--check_function_decreasing',
       default: false,
       boolean: true

option :greater_than,
       description: 'Change whether value is greater than or less than check',
       short: '-g',
       long: '--greater_than',
       default: false

# Threshold options below take "warning[,error[,fatal]]" values (see #get_levels).
option :check_last,
       description: 'Check that the last value in GRAPHITE is greater/less than VALUE',
       short: '-l VALUE',
       long: '--last VALUE',
       default: nil

option :ignore_nulls,
       description: 'Do not error on null values, used in check_function_increasing',
       short: '-n',
       long: '--ignore_nulls',
       default: false,
       boolean: true

option :concat_output,
       description: 'Include warning messages in output even if overall status is critical',
       short: '-c',
       long: '--concat_output',
       default: false,
       boolean: true

option :short_output,
       description: 'Report only the highest status per series in output',
       short: '-s',
       long: '--short_output',
       default: false,
       boolean: true

option :check_average,
       description: 'MAX_VALUE should be greater than the average of Graphite values from PERIOD',
       short: '-a MAX_VALUE',
       long: '--average_value MAX_VALUE'

option :data_points,
       description: 'Number of data points to include in average check (smooths out spikes)',
       short: '-d VALUE',
       long: '--data_points VALUE',
       default: 1

option :check_average_percent,
       description: 'MAX_VALUE% should be greater than the average of Graphite values from PERIOD',
       short: '-b MAX_VALUE',
       long: '--average_percent_value MAX_VALUE'

option :percentile,
       description: 'Percentile value, should be used in conjunction with percentile_value, defaults to 90',
       long: '--percentile PERCENTILE',
       default: 90

option :check_percentile,
       description: 'Values should not be greater than the VALUE of Graphite values from PERIOD',
       long: '--percentile_value VALUE'

# Basic auth is only sent when both user and password are given
# (see #get_graphite_values).
option :http_user,
       description: 'Basic HTTP authentication user',
       short: '-U USER',
       long: '--http-user USER',
       default: nil

# Placeholder corrected from USER to PASSWORD (help text only).
option :http_password,
       description: 'Basic HTTP authentication password',
       short: '-P PASSWORD',
       long: '--http-password PASSWORD',
       default: nil
146
+
147
# Runs the parent initializer, then creates the response cache used by
# #graphite_cache and #get_graphite_values. The cache maps a target query
# to the list of series hashes fetched for it.
def initialize
  super
  @graphite_cache = {}
end
151
+
152
# Looks up previously fetched series for a target query.
#
# target - the query string used as cache key.
#
# Returns the cached entries whose :period equals the current @period, or
# nil when the target is not cached or no entry matches the period.
def graphite_cache(target = nil)
  return nil unless @graphite_cache.key?(target)

  cached = @graphite_cache[target].select { |entry| entry[:period] == @period }
  cached unless cached.empty?
end
159
+
160
# Builds the URI of Graphite's render endpoint from config[:host].
#
# target - optional query; when given it is appended as a `target`
#          query parameter.
#
# "http://" is prepended unless the configured host already starts with
# "http". The target is form-encoded so that queries containing spaces,
# parentheses or other reserved characters still yield a parsable URI
# (URI.parse used to raise URI::InvalidURIError on them).
#
# Returns a URI object.
def graphite_url(target = nil)
  url = "#{config[:host]}/render/"
  url = "http://#{url}" unless url.start_with?('http')
  url += "?target=#{URI.encode_www_form_component(target)}" if target
  URI.parse(url)
end
170
+
171
# Splits a comma separated threshold string into named levels.
#
# config_param - String such as "10,20,30"; positions map to
#                'warning', 'error' and 'fatal' in that order.
#
# Returns a Hash containing only the levels that were supplied, e.g.
# "10,20" => { 'warning' => '10', 'error' => '20' }. Values stay Strings;
# extra entries beyond the third are ignored.
def get_levels(config_param)
  thresholds = config_param.split(',')

  %w(warning error fatal).zip(thresholds).each_with_object({}) do |(type, value), levels|
    levels[type] = value if value
  end
end
181
+
182
# Fetches the series matching a target query for the current @period,
# using the in-memory cache when possible.
#
# target - Graphite query string.
#
# The request is a form-encoded POST to the render endpoint; HTTP basic
# auth is added only when both config[:http_user] and
# config[:http_password] are set. TLS is enabled when the configured host
# uses the https scheme (previously such hosts were contacted in plain text).
#
# Returns an Array of { target:, period:, datapoints: } hashes, or nil when
# Graphite returned no series.
def get_graphite_values(target)
  cached = graphite_cache target
  return cached if cached

  uri = graphite_url
  req = Net::HTTP::Post.new(uri.path)

  # If the basic http authentication credentials have been provided, then use them
  if !config[:http_user].nil? && !config[:http_password].nil?
    req.basic_auth(config[:http_user], config[:http_password])
  end
  req.set_form_data(target: target, from: "-#{@period}", format: 'json')

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true if uri.scheme == 'https'
  resp = http.start { |conn| conn.request(req) }

  data = JSON.parse(resp.body)
  @graphite_cache[target] = data.map do |series|
    { target: series['target'], period: @period, datapoints: series['datapoints'] }
  end

  # Returns nil for an empty result because the cache lookup finds no entry.
  graphite_cache target
end
209
+
210
# Computes, per returned series, the maximum value over all datapoints
# except the most recent one (see #get_max_value, which works on [0..-2]).
#
# target - Graphite query string.
#
# Returns a Hash of series name => maximum; empty when nothing was fetched.
def max_graphite_value(target)
  series = get_graphite_values(target) || []

  series.each_with_object({}) do |entry, maxima|
    maxima[entry[:target]] = get_max_value(entry[:datapoints])
  end
end
222
+
223
# Maximum of a series' values, ignoring the final datapoint.
#
# values - Array of datapoint pairs (value first), or nil.
#
# Null values count as 0. Returns nil when `values` is nil or holds fewer
# than two datapoints.
def get_max_value(values)
  return nil unless values

  values.map { |point| point[0] || 0 }[0..-2].max
end
230
+
231
# Collects the most recent datapoints of every series matching a query.
#
# target - Graphite query string.
# count  - how many recent datapoints to request per series (default 1).
#
# Returns a Hash of series name => result of #get_last_metric for that
# series; empty when nothing was fetched.
def last_graphite_metric(target, count = 1)
  series = get_graphite_values(target) || []

  series.each_with_object({}) do |entry, latest|
    latest[entry[:target]] = get_last_metric(entry[:datapoints], count)
  end
end
242
+
243
# Picks the newest datapoints that carry a value.
#
# values - Array of datapoint pairs (value first), oldest first, or nil.
# count  - maximum number of datapoints to return (default 1).
#
# Walks the series from newest to oldest, skipping pairs whose value is
# null and stopping at a missing (nil) entry. Returns an Array of pairs,
# newest first, or nil when `values` is nil.
#
# The earlier index-based loop kept decrementing past index 0 when fewer
# than `count` non-null points existed; negative indices then wrapped to
# the end of the array and the same datapoints were returned twice.
def get_last_metric(values, count = 1)
  return nil unless values
  return [] unless count > 0

  values.reverse
        .take_while { |point| !point.nil? }
        .select { |point| point[0] }
        .first(count)
end
259
+
260
# Averages the most recent values of every series matching a query.
#
# target - Graphite query string.
# count  - number of recent datapoints to average per series (default 1).
#
# Uses Array#mean, which is not core Ruby (the file requires 'array_stats').
# A series whose datapoints are missing maps to nil instead of raising
# NoMethodError.
#
# Returns a Hash of series name => mean of the selected values.
def last_graphite_value(target, count = 1)
  recent = last_graphite_metric(target, count) || {}

  recent.each_with_object({}) do |(target_name, metrics), last_values|
    last_values[target_name] = metrics && metrics.map { |metric| metric[0] }.mean
  end
end
270
+
271
# Reports series whose newest datapoint is older than a cut-off time.
#
# target        - Graphite query string.
# time          - cut-off; compared via #to_i against the second element
#                 of the newest datapoint pair.
# updated_since - number of seconds, used only in the message text.
#
# #last_graphite_metric yields a list of datapoint pairs per series, so the
# timestamp is the second element of the first pair. The previous code read
# `value[1]` on that list, which is a (missing) second datapoint, and then
# raised NoMethodError on `nil > Integer`. A bare [value, timestamp] pair is
# tolerated as well.
#
# Returns an Array of warning Strings; series without any datapoint are
# reported as not updated.
def been_updated_since(target, time, updated_since)
  cutoff = time.to_i
  latest = last_graphite_metric(target) || {}

  latest.each_with_object([]) do |(target_name, points), warnings|
    points = Array(points)
    newest = points.first.is_a?(Array) ? points.first : points
    timestamp = newest[1]
    next if timestamp && timestamp > cutoff

    warnings << "The metric #{target_name} has not been updated in #{updated_since} seconds"
  end
end
282
+
283
# Word used in alert messages to describe the comparison direction:
# 'greater' when config[:greater_than] is set, 'less' otherwise.
def greater_less
  config[:greater_than] ? 'greater' : 'less'
end
287
+
288
# Verifies that each series' latest value has not dropped below its earlier
# maximum by more than config[:acceptable_diff_percentage] percent.
#
# target - Graphite query string.
#
# Changes from the earlier version:
# * The "no value" warning is now emitted when Graphite returns no series.
#   Before, the branch only ran when a helper returned a non-Hash, which the
#   helpers in this file never do.
# * The critical message names the individual series rather than the query,
#   so wildcard queries point at the offending metric.
#
# Unless config[:ignore_nulls] is set, staleness warnings from
# #been_updated_since are appended.
#
# Returns [warnings, critical_errors, fatals]; fatals is always empty here.
def check_increasing(target)
  updated_since = config[:updated_since].to_i
  time_to_be_updated_since = Time.now - updated_since
  critical_errors = []
  warnings = []
  max_gv = max_graphite_value target
  last_gv = last_graphite_value target
  tolerance = 1 + config[:acceptable_diff_percentage].to_f / 100

  if last_gv.is_a?(Hash) && max_gv.is_a?(Hash) && !last_gv.empty?
    last_gv.each do |target_name, last|
      max = max_gv[target_name]
      next unless last && max
      next unless max > last * tolerance

      critical_errors << "The metric #{target_name} with last value #{last} is less than max value #{max} during #{config[:period]} period"
    end
  else
    warnings << "Could not found any value in Graphite for metric #{target}, see #{graphite_url(target)}"
  end

  unless config[:ignore_nulls]
    warnings.concat(been_updated_since(target, time_to_be_updated_since, updated_since))
  end
  [warnings, critical_errors, []]
end
315
+
316
# Compares each series' recent value with its average over the period.
#
# target      - Graphite query string.
# max_values  - Hash of 'warning' / 'error' / 'fatal' => threshold
#               (as produced by #get_levels).
# data_points - number of recent datapoints averaged into the "last" value.
#
# For every series, `percent` is last_value / average. With
# config[:greater_than] an alert fires when percent > threshold, otherwise
# when threshold > percent. Levels are tried from most to least severe; with
# config[:short_output] only the first matching level is recorded.
#
# NOTE(review): `percent` is a plain ratio (1.5, not 150) although the
# message and option text label it "%". Left as is, because changing it
# would shift thresholds for existing users. Confirm the intended unit.
#
# Returns [warnings, criticals, fatal].
def check_average_percent(target, max_values, data_points = 1)
  values = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless values

  buckets = { 'warning' => [], 'error' => [], 'fatal' => [] }

  values.each do |data|
    series = data[:target]
    values_array = data[:datapoints].select(&:first).map(&:first)
    # Empty series: nil.to_f / 0 gives NaN, which never passes a comparison.
    avg_value = values_array.inject(:+).to_f / values_array.size
    last_value = last_values[series]
    next if last_value.nil?
    next if values_array.empty? && config[:ignore_nulls]

    percent = last_value / avg_value

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? percent > limit : limit > percent
      next unless exceeded

      buckets[type] << "The last value of metric #{series} is #{percent}% #{greater_less} than allowed #{max_value}% of the average value #{avg_value}"
      break if config[:short_output]
    end
  end

  [buckets['warning'], buckets['error'], buckets['fatal']]
end
355
+
356
# Compares each series' average over the period with the given thresholds.
#
# target     - Graphite query string.
# max_values - Hash of 'warning' / 'error' / 'fatal' => threshold
#              (as produced by #get_levels).
#
# With config[:greater_than] an alert fires when average > threshold,
# otherwise when threshold > average. Levels are tried from most to least
# severe; with config[:short_output] only the first matching level is
# recorded. Series without any non-null value are skipped when
# config[:ignore_nulls] is set.
#
# Returns [warnings, criticals, fatal].
def check_average(target, max_values)
  values = get_graphite_values target
  return [[], [], []] unless values

  buckets = { 'warning' => [], 'error' => [], 'fatal' => [] }

  values.each do |data|
    series = data[:target]
    values_array = data[:datapoints].select(&:first).map(&:first)
    next if values_array.empty? && config[:ignore_nulls]

    # Empty series: nil.to_f / 0 gives NaN, which never passes a comparison.
    avg_value = values_array.inject(:+).to_f / values_array.size

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? avg_value > limit : limit > avg_value
      next unless exceeded

      buckets[type] << "The average value of metric #{series} is #{avg_value} that is #{greater_less} than allowed average of #{max_value}"
      break if config[:short_output]
    end
  end

  [buckets['warning'], buckets['error'], buckets['fatal']]
end
392
+
393
# Compares each series' recent value with a percentile of its values.
#
# target      - Graphite query string.
# max_values  - Hash of 'warning' / 'error' / 'fatal' => threshold
#               (as produced by #get_levels).
# percentile  - percentile to compute, passed to Array#percentile (not core
#               Ruby; the file requires 'array_stats').
# data_points - number of recent datapoints averaged into the "last" value.
#
# `percent` is last_value / percentile_value and is compared with each
# threshold in the direction selected by config[:greater_than].
#
# Changes from the earlier version:
# * A series without a last value is skipped. Before, only a nil percentile
#   was guarded and a nil ratio reached the comparison and raised.
# * The alert text is a single line. The literal used to span two source
#   lines and embedded a newline plus indentation in the output.
#
# Returns [warnings, criticals, fatal].
def check_percentile(target, max_values, percentile, data_points = 1)
  values = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless values

  buckets = { 'warning' => [], 'error' => [], 'fatal' => [] }

  values.each do |data|
    series = data[:target]
    values_array = data[:datapoints].select(&:first).map(&:first)
    percentile_value = values_array.percentile(percentile)
    last_value = last_values[series]
    next if last_value.nil? || percentile_value.nil?

    percent = last_value / percentile_value

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? percent > limit : limit > percent
      next unless exceeded

      buckets[type] << "The percentile value of metric #{series} (#{last_value}) is #{greater_less} than the " \
                       "#{percentile}th percentile (#{percentile_value}) by more than #{max_value}%"
      break if config[:short_output]
    end
  end

  [buckets['warning'], buckets['error'], buckets['fatal']]
end
432
+
433
# Compares the newest value of each series with the given thresholds.
#
# target     - Graphite query string.
# max_values - Hash of 'warning' / 'error' / 'fatal' => threshold
#              (as produced by #get_levels).
#
# #last_graphite_metric yields a list of datapoint pairs per series, so the
# value is the first element of the first pair. The previous code compared
# the whole pair (an Array) with a Float and raised NoMethodError. A bare
# [value, timestamp] pair is tolerated as well.
#
# With config[:greater_than] an alert fires when value > threshold,
# otherwise when threshold > value. Levels are tried from most to least
# severe; with config[:short_output] only the first matching level is
# recorded. Series without a value are skipped.
#
# Returns [warnings, criticals, fatal].
def check_last(target, max_values)
  last_targets = last_graphite_metric target
  return [[], [], []] unless last_targets

  buckets = { 'warning' => [], 'error' => [], 'fatal' => [] }

  last_targets.each do |target_name, last|
    newest = Array(last).first
    last_value = newest.is_a?(Array) ? newest.first : newest
    next if last_value.nil?

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? last_value > limit : limit > last_value
      next unless exceeded

      buckets[type] << "The metric #{target_name} is #{last_value} that is #{greater_less} than max allowed #{max_value}"
      break if config[:short_output]
    end
  end

  [buckets['warning'], buckets['error'], buckets['fatal']]
end
468
+
469
# Entry point of the check. Runs every enabled check against each
# comma separated target, gathers the findings and exits through the
# matching status helper.
#
# Exit precedence: fatal findings and error findings are both reported as
# CRITICAL (fatal first), then warnings as WARNING, otherwise OK. With
# config[:concat_output] the lower-severity findings are appended to the
# message of the status that is reported.
def run # rubocop:disable all
  @period = config[:period]
  warnings = []
  critical_errors = []
  fatals = []

  # Folds one check result ([warnings, criticals, fatals]) into the totals.
  collect = lambda do |result|
    found_warnings, found_criticals, found_fatals = result
    warnings.concat(found_warnings)
    critical_errors.concat(found_criticals)
    fatals.concat(found_fatals)
  end

  config[:target].split(',').each do |target|
    collect.call(check_increasing(target)) if config[:check_function_increasing]

    if config[:check_last]
      collect.call(check_last(target, get_levels(config[:check_last])))
    end
    if config[:check_average]
      collect.call(check_average(target, get_levels(config[:check_average])))
    end
    if config[:check_average_percent]
      levels = get_levels config[:check_average_percent]
      collect.call(check_average_percent(target, levels, config[:data_points].to_i))
    end
    if config[:check_percentile]
      levels = get_levels config[:check_percentile]
      collect.call(check_percentile(target, levels, config[:percentile].to_i, config[:data_points].to_i))
    end
  end

  fatals_string = fatals.join("\n")
  criticals_string = critical_errors.join("\n")
  warnings_string = warnings.join("\n")

  if config[:concat_output]
    fatals_string = fatals_string + "\n" + criticals_string if critical_errors.size > 0
    fatals_string = fatals_string + "\nGraphite WARNING: " + warnings_string if warnings.size > 0
    criticals_string = criticals_string + "\nGraphite WARNING: " + warnings_string if warnings.size > 0
  end

  # Each helper terminates the process, so only the first match is reported.
  critical fatals_string if fatals.size > 0
  critical criticals_string if critical_errors.size > 0
  warning warnings_string if warnings.size > 0
  ok
end
530
+ end