sensu-plugins-graphite-donotuse 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,158 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-stats
4
+ #
5
+ # DESCRIPTION:
6
+ # Checks metrics in graphite, averaged over a period of time.
7
+ #
8
+ # The fired sensu event will only be critical if a stat is
9
+ # above the critical threshold. Otherwise, the event will be warning,
10
+ # if a stat is above the warning threshold.
11
+ #
12
+ # Multiple stats will be checked if * are used
13
+ # in the "target" query.
14
+ #
15
+ # OUTPUT:
16
+ # plain text
17
+ #
18
+ # PLATFORMS:
19
+ # Linux
20
+ #
21
+ # DEPENDENCIES:
22
+ # gem: sensu-plugin
23
+ #
24
+ # USAGE:
25
+ # example commands
26
+ #
27
+ # NOTES:
28
+ #
29
+ # LICENSE:
30
+ # Alan Smith (alan@asmith.me)
31
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
32
+ # for details.
33
+ #
34
+
35
+ require 'json'
36
+ require 'net/http'
37
+ require 'sensu-plugin/check/cli'
38
+
39
+ class CheckGraphiteStat < Sensu::Plugin::Check::CLI
40
# Command-line options, declared through the `option` DSL inherited from
# Sensu::Plugin::Check::CLI. Values are read back through `config[...]`.

# Host (optionally host:port) that is interpolated into the render URL.
option :host,
       short: '-h HOST',
       long: '--host HOST',
       description: 'graphite hostname',
       proc: proc(&:to_s),
       default: 'graphite'

# Passed verbatim as the `from` parameter of the render request.
option :period,
       short: '-p PERIOD',
       long: '--period PERIOD',
       description: 'The period back in time to extract from Graphite. Use -24hours, -2days, -15mins, etc, same format as in Graphite',
       proc: proc(&:to_s),
       required: true

option :target,
       short: '-t TARGET',
       long: '--target TARGET',
       description: 'The graphite metric name. Can include * to query multiple metrics',
       proc: proc(&:to_s),
       required: true

# Thresholds are converted to Float; when omitted they stay nil and the
# corresponding comparison is skipped.
option :warn,
       short: '-w WARN',
       long: '--warn WARN',
       description: 'Warning level',
       proc: proc(&:to_f),
       required: false

# Placeholder spelled CRIT so the short and long forms agree in the help text.
option :crit,
       short: '-c CRIT',
       long: '--crit CRIT',
       description: 'Critical level',
       proc: proc(&:to_f),
       required: false

option :unknown_ignore,
       short: '-u',
       long: '--unknown-ignore',
       description: "Do nothing for UNKNOWN status (when you wildcard-match a ton of metrics at once and you don't care for a few missing data)",
       boolean: true,
       default: false

option :reverse_scale,
       short: '-r',
       long: '--reverse-scale',
       description: 'Reverse the warning/crit scale (if value is less than instead of greater than)',
       boolean: true,
       default: false
88
+
89
# Arithmetic mean of a collection.
#
# Each element is coerced with #to_f before being added, so numeric strings
# are accepted as well as numbers.
#
# @param a [#to_a] values to average
# @return [Float, nil] the mean, or nil when there is nothing to average
#   (previously an empty collection raised ZeroDivisionError)
def average(a)
  samples = a.to_a
  return nil if samples.empty?

  samples.reduce(0.0) { |total, sample| total + sample.to_f } / samples.length
end
95
+
96
# Classifies one series of the render response against the thresholds.
#
# The first element of every datapoint pair is taken as the value; nil
# values are dropped and the remainder is averaged. The average is compared
# with config[:crit] first and config[:warn] second. By default a breach
# means "average above threshold"; config[:reverse_scale] turns that into
# "average below threshold". A threshold that is nil is skipped.
#
# @param metric [Hash] reads metric['target'] and metric['datapoints']
# @return [Array] [status, message]: 2 critical, 1 warning, 3 no datapoints
#   (only when config[:unknown_ignore] is not set), otherwise [0, nil]
def danger(metric)
  datapoints = metric['datapoints'].map(&:first).compact

  if datapoints.empty?
    return [0, nil] if config[:unknown_ignore]
    return [3, "#{metric['target']} has no datapoints"]
  end

  avg = average(datapoints)
  comparison = config[:reverse_scale] ? :< : :>

  # Most severe level first so a critical breach is never reported as warning.
  [[:crit, 2], [:warn, 1]].each do |level, status|
    threshold = config[level]
    next if threshold.nil?
    return [status, "#{metric['target']} is #{avg}"] if avg.public_send(comparison, threshold)
  end

  [0, nil]
end
120
+
121
# Entry point: queries the Graphite render endpoint, classifies every
# returned series with #danger and reports the highest status found.
#
# The query string is built with URI.encode_www_form so that characters
# such as spaces, '&', '+' or braces in the target are escaped. (URI.encode,
# used previously, does not exist in Ruby 3.0 and later.)
def run
  body =
    begin
      query = URI.encode_www_form(format: 'json', target: config[:target], from: config[:period])
      res = Net::HTTP.get_response(URI.parse("http://#{config[:host]}/render?#{query}"))
      res.body
    rescue => e
      warning "Failed to query graphite: #{e.inspect}"
    end

  data =
    begin
      JSON.parse(body)
    rescue JSON::ParserError, TypeError
      # Unparseable or missing body is treated the same as "no series".
      []
    end

  unknown 'No data from graphite' if data.empty?

  results = data.map { |metric| danger(metric) }

  # The numerically largest status wins.
  # NOTE(review): this means UNKNOWN (3) outranks CRITICAL (2) when both
  # occur in one run -- confirm that this precedence is intended.
  status = results.map(&:first).max || 0
  message = results.reject { |s, _| s == 0 }.map { |_, msg| "#{msg} " }.join

  case status
  when 2 then critical message
  when 1 then warning message
  when 3 then unknown message
  end
  ok
end
158
+ end
@@ -0,0 +1,521 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # <script name>
4
+ #
5
+ # DESCRIPTION:
6
+ # Get time series values from Graphite and create events based on values
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: array_stats
17
+ #
18
+ # USAGE:
19
+ # #YELLOW
20
+ #
21
+ # NOTES:
22
+ #
23
+ # LICENSE:
24
+ # Copyright 2012 Ulf Mansson @ Recorded Future
25
+ # Modifications by Chris Jansen to support wildcard targets
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'json'
32
+ require 'net/http'
33
+ require 'socket'
34
+ require 'array_stats'
35
+
36
+ class Graphite < Sensu::Plugin::Check::CLI
37
# Command-line options, declared through the `option` DSL inherited from
# Sensu::Plugin::Check::CLI. Values are read back through `config[...]`.

option :host,
       short: '-h HOST',
       long: '--host HOST',
       description: 'Graphite host to connect to, include port',
       required: true

option :target,
       description: 'The graphite metric name. Could be a comma separated list of metric names.',
       short: '-t TARGET',
       long: '--target TARGET',
       required: true

option :period,
       description: 'The period back in time to extract from Graphite and compare with. Use 24hours,2days etc, same format as in Graphite',
       short: '-p PERIOD',
       long: '--period PERIOD',
       default: '2hours'

option :updated_since,
       description: 'The graphite value should have been updated within UPDATED_SINCE seconds, default to 600 seconds',
       short: '-u UPDATED_SINCE',
       long: '--updated_since UPDATED_SINCE',
       default: 600

# Short flag is -D: -d is also declared by :data_points further down, and
# two options cannot both own the same short flag. The long form is unchanged.
option :acceptable_diff_percentage,
       description: 'The acceptable diff from max values in percentage, used in check_function_increasing',
       short: '-D ACCEPTABLE_DIFF_PERCENTAGE',
       long: '--acceptable_diff_percentage ACCEPTABLE_DIFF_PERCENTAGE',
       default: 0

# NOTE(review): the long flag says "decreasing" while the option key and the
# description say "increasing". Left as is because renaming the flag would
# break existing check definitions -- confirm before changing.
option :check_function_increasing,
       description: 'Check that value is increasing or equal over time (use acceptable_diff_percentage if it should allow to be lower)',
       short: '-i',
       long: '--check_function_decreasing',
       default: false,
       boolean: true

option :greater_than,
       description: 'Change whether value is greater than or less than check',
       short: '-g',
       long: '--greater_than',
       default: false

# VALUE for the threshold options below is split on ',' by #get_levels into
# warning, error and fatal levels, in that order.
option :check_last,
       description: 'Check that the last value in GRAPHITE is greater/less than VALUE',
       short: '-l VALUE',
       long: '--last VALUE',
       default: nil

option :ignore_nulls,
       description: 'Do not error on null values, used in check_function_increasing',
       short: '-n',
       long: '--ignore_nulls',
       default: false,
       boolean: true

option :concat_output,
       description: 'Include warning messages in output even if overall status is critical',
       short: '-c',
       long: '--concat_output',
       default: false,
       boolean: true

option :short_output,
       description: 'Report only the highest status per series in output',
       short: '-s',
       long: '--short_output',
       default: false,
       boolean: true

option :check_average,
       description: 'MAX_VALUE should be greater than the average of Graphite values from PERIOD',
       short: '-a MAX_VALUE',
       long: '--average_value MAX_VALUE'

option :data_points,
       description: 'Number of data points to include in average check (smooths out spikes)',
       short: '-d VALUE',
       long: '--data_points VALUE',
       default: 1

option :check_average_percent,
       description: 'MAX_VALUE% should be greater than the average of Graphite values from PERIOD',
       short: '-b MAX_VALUE',
       long: '--average_percent_value MAX_VALUE'

option :percentile,
       description: 'Percentile value, should be used in conjunction with percentile_value, defaults to 90',
       long: '--percentile PERCENTILE',
       default: 90

option :check_percentile,
       description: 'Values should not be greater than the VALUE of Graphite values from PERIOD',
       long: '--percentile_value VALUE'

# Basic authentication is only applied when both user and password are given
# (see #get_graphite_values).
option :http_user,
       description: 'Basic HTTP authentication user',
       short: '-U USER',
       long: '--http-user USER',
       default: nil

option :http_password,
       description: 'Basic HTTP authentication password',
       short: '-P PASSWORD',
       long: '--http-password PASSWORD',
       default: nil
143
+
144
# Runs the inherited setup, then creates the (initially empty) cache in
# which #get_graphite_values stores the series fetched per query target.
def initialize
  super
  @graphite_cache = Hash.new
end
148
+
149
# Returns the cached series for +target+ that were fetched for the current
# @period.
#
# @param target [String, nil] key under which the series were cached
# @return [Array<Hash>, nil] matching cache entries, or nil when the target
#   was never cached or no entry belongs to the current period
def graphite_cache(target = nil)
  return unless @graphite_cache.key?(target)

  for_period = @graphite_cache[target].select { |entry| entry[:period] == @period }
  for_period.empty? ? nil : for_period
end
156
+
157
+ # Create a graphite url from params
158
+ #
159
+ #
160
# Builds the URI of the render endpoint on config[:host].
#
# 'http://' is prepended unless the configured host already starts with
# 'http' (so an explicit 'https://' host is kept). When +target+ is given it
# is appended as a form-encoded `target` query parameter; without encoding,
# targets containing spaces or characters like '{' made URI.parse raise.
#
# @param target [String, nil] optional metric expression
# @return [URI] parsed render URL
def graphite_url(target = nil)
  url = "#{config[:host]}/render/"
  url = "http://#{url}" unless url.start_with?('http')
  url = "#{url}?target=#{URI.encode_www_form_component(target)}" if target
  URI.parse(url)
end
167
+
168
# Maps a comma separated threshold string onto severity names.
#
# The first field becomes 'warning', the second 'error', the third 'fatal';
# fields beyond the third are ignored and missing trailing fields simply
# produce no key.
#
# @param config_param [String] e.g. "5,10,20"
# @return [Hash{String=>String}] severity name => threshold (still a String)
def get_levels(config_param)
  thresholds = config_param.split(',')
  %w(warning error fatal).each_with_index.each_with_object({}) do |(severity, position), levels|
    threshold = thresholds[position]
    levels[severity] = threshold if threshold
  end
end
178
+
179
# Fetches the series for +target+ over the current @period, using the cache
# when the same target was already fetched for that period.
#
# The request is a form-encoded POST to the render endpoint with `target`,
# `from` ("-" followed by @period) and `format=json`. Basic authentication
# is added only when both config[:http_user] and config[:http_password] are
# set. TLS is enabled when the configured host uses the https scheme;
# previously such hosts were contacted without TLS.
#
# @param target [String] metric expression
# @return [Array<Hash>, nil] entries with :target, :period and :datapoints,
#   or nil when the response contained no series
def get_graphite_values(target)
  cached = graphite_cache target
  return cached if cached

  uri = graphite_url
  req = Net::HTTP::Post.new(uri.path)

  # If the basic http authentication credentials have been provided, then use them
  if !config[:http_user].nil? && !config[:http_password].nil?
    req.basic_auth(config[:http_user], config[:http_password])
  end

  req.set_form_data(target: target, from: "-#{@period}", format: 'json')
  resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') { |http| http.request(req) }
  data = JSON.parse(resp.body)

  @graphite_cache[target] = data.map do |series|
    { target: series['target'], period: @period, datapoints: series['datapoints'] }
  end
  # Yields nil for an empty response because the cache lookup finds no entry.
  graphite_cache target
end
204
+
205
+ # Will give max values for [0..-2]
206
# Collects, for every series returned for +target+, the maximum computed by
# #get_max_value.
#
# @param target [String] metric expression
# @return [Hash] series name => maximum; empty when nothing was fetched
def max_graphite_value(target)
  series_list = get_graphite_values(target) || []
  series_list.each_with_object({}) do |series, maxima|
    maxima[series[:target]] = get_max_value(series[:datapoints])
  end
end
217
+
218
# Largest value among all datapoints except the final one.
#
# The first element of each datapoint is the value; a nil/false value counts
# as 0. The last datapoint is excluded from the comparison.
#
# @param values [Array<Array>, nil] datapoint pairs
# @return [Numeric, nil] the maximum, or nil when +values+ is nil or holds
#   fewer than two datapoints
def get_max_value(values)
  return unless values

  readings = values.map { |point| point[0] || 0 }
  readings[0..-2].max
end
223
+
224
# Collects, for every series returned for +target+, the result of
# #get_last_metric on its datapoints.
#
# @param target [String] metric expression
# @param count [Integer] forwarded to #get_last_metric
# @return [Hash] series name => #get_last_metric result; empty when nothing
#   was fetched
def last_graphite_metric(target, count = 1)
  series_list = get_graphite_values(target) || []
  series_list.each_with_object({}) do |series, latest|
    latest[series[:target]] = get_last_metric(series[:datapoints], count)
  end
end
235
+
236
# Returns up to +count+ of the most recent datapoints that carry a value.
#
# Datapoints are scanned from the end; those whose value (first element) is
# nil are skipped. The scan stops at the start of the list -- the previous
# index-based loop kept going with negative indexes, which wrap around, and
# so returned the same datapoints twice when fewer than +count+ non-null
# points existed.
#
# @param values [Array<Array>, nil] datapoint pairs, oldest first
# @param count [Integer] maximum number of datapoints to return
# @return [Array<Array>, nil] datapoints, newest first; nil when +values+
#   is nil
def get_last_metric(values, count = 1)
  return unless values

  recent = []
  values.reverse_each do |point|
    # A nil entry ends the scan, as it did before.
    break if recent.size >= count || point.nil?
    recent << point if point[0]
  end
  recent
end
250
+
251
# Averages the values of the most recent datapoints of every series.
#
# Takes the datapoints reported by #last_graphite_metric, keeps their first
# element and reduces them with Array#mean (not core Ruby; presumably
# supplied by the array_stats gem this file requires).
#
# @param target [String] metric expression
# @param count [Integer] how many recent datapoints to average
# @return [Hash] series name => mean of the selected values
def last_graphite_value(target, count = 1)
  recent_by_series = last_graphite_metric(target, count) || {}
  recent_by_series.each_with_object({}) do |(series_name, points), averages|
    averages[series_name] = points.map { |point| point[0] }.mean
  end
end
261
+
262
# Reports every series whose newest non-null datapoint is not newer than
# +time+.
#
# #last_graphite_metric yields, per series, a list of [value, timestamp]
# pairs (newest first). The timestamp is therefore the second element of the
# first pair; indexing the list itself with [1], as was done before, picks
# the second pair (nil for a single datapoint) and raised NoMethodError.
# A series without any non-null datapoint is reported as not updated.
#
# @param target [String] metric expression
# @param time [#to_i] cut-off; datapoints must be strictly newer
# @param updated_since [Integer] seconds, used only in the message
# @return [Array<String>] one warning per stale series
def been_updated_since(target, time, updated_since)
  cutoff = time.to_i
  latest_by_series = last_graphite_metric(target) || {}

  latest_by_series.each_with_object([]) do |(target_name, points), warnings|
    newest = points && points.first
    next if newest && newest[1] > cutoff

    warnings << "The metric #{target_name} has not been updated in #{updated_since} seconds"
  end
end
273
+
274
# Word describing the comparison direction in alert messages.
#
# @return [String] 'greater' when config[:greater_than] is set, else 'less'
def greater_less
  config[:greater_than] ? 'greater' : 'less'
end
278
+
279
# Checks that the latest value of every series is not below the series'
# earlier maximum by more than config[:acceptable_diff_percentage].
#
# A series is critical when max > last * (1 + percentage / 100). When
# Graphite returned nothing for +target+ a warning is added; that branch
# used to be unreachable because both helpers always return a Hash. Unless
# config[:ignore_nulls] is set, staleness warnings from #been_updated_since
# are appended as well.
#
# @param target [String] metric expression
# @return [Array] [warnings, critical_errors, fatals]; fatals is always empty
def check_increasing(target)
  updated_since = config[:updated_since].to_i
  time_to_be_updated_since = Time.now - updated_since
  critical_errors = []
  warnings = []
  max_gv = max_graphite_value target
  last_gv = last_graphite_value target

  no_data = [last_gv, max_gv].any? { |result| !result.is_a?(Hash) || result.empty? }
  if no_data
    link =
      begin
        graphite_url(target)
      rescue URI::InvalidURIError
        # The hint must never turn a missing-data warning into a crash.
        "#{config[:host]}/render/"
      end
    warnings << "Could not find any value in Graphite for metric #{target}, see #{link}"
  else
    tolerance = 1 + config[:acceptable_diff_percentage].to_f / 100
    last_gv.each do |target_name, last|
      max = max_gv[target_name]
      next unless last && max
      next unless max > last * tolerance

      # Name the concrete series, not the (possibly wildcard) query.
      critical_errors << "The metric #{target_name} with last value #{last} is less than max value #{max} during #{config[:period]} period"
    end
  end

  unless config[:ignore_nulls]
    warnings.concat(been_updated_since(target, time_to_be_updated_since, updated_since))
  end
  [warnings, critical_errors, []]
end
306
+
307
# Compares, per series, the recent value with the average of the period.
#
# For each series the non-null values are averaged and the value reported
# by #last_graphite_value is divided by that average. The quotient is
# checked against every level present in +max_values+, most severe first:
# with config[:greater_than] it alerts when the quotient exceeds the level,
# otherwise when the level exceeds the quotient. A series without values is
# skipped when config[:ignore_nulls] is set. config[:short_output] stops at
# the first (most severe) level that fired.
#
# @param target [String] metric expression
# @param max_values [Hash] 'warning'/'error'/'fatal' => threshold
# @param data_points [Integer] forwarded to #last_graphite_value
# @return [Array] [warnings, criticals, fatal], each an Array of messages
def check_average_percent(target, max_values, data_points = 1)
  series_list = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless series_list

  by_level = { 'warning' => [], 'error' => [], 'fatal' => [] }
  series_list.each do |series|
    name = series[:target]
    samples = series[:datapoints].select(&:first).map(&:first)
    avg_value = samples.reduce(:+).to_f / samples.size
    last_value = last_values[name]
    next if last_value.nil?

    percent = last_value / avg_value
    next if samples.empty? && config[:ignore_nulls]

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      breached = config[:greater_than] ? percent > limit : limit > percent
      next unless breached

      by_level[type] << "The last value of metric #{name} is #{percent}% #{greater_less} than allowed #{max_value}% of the average value #{avg_value}"
      break if config[:short_output]
    end
  end
  by_level.values_at('warning', 'error', 'fatal')
end
346
+
347
# Compares, per series, the average of the period with fixed thresholds.
#
# The non-null values of each series are averaged and checked against every
# level present in +max_values+, most severe first: with
# config[:greater_than] it alerts when the average exceeds the level,
# otherwise when the level exceeds the average. A series without values is
# skipped when config[:ignore_nulls] is set. config[:short_output] stops at
# the first (most severe) level that fired.
#
# @param target [String] metric expression
# @param max_values [Hash] 'warning'/'error'/'fatal' => threshold
# @return [Array] [warnings, criticals, fatal], each an Array of messages
def check_average(target, max_values)
  series_list = get_graphite_values target
  return [[], [], []] unless series_list

  by_level = { 'warning' => [], 'error' => [], 'fatal' => [] }
  series_list.each do |series|
    name = series[:target]
    samples = series[:datapoints].select(&:first).map(&:first)
    avg_value = samples.reduce(:+).to_f / samples.size

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      breached = config[:greater_than] ? avg_value > limit : limit > avg_value
      next unless breached && (!samples.empty? || !config[:ignore_nulls])

      by_level[type] << "The average value of metric #{name} is #{avg_value} that is #{greater_less} than allowed average of #{max_value}"
      break if config[:short_output]
    end
  end
  by_level.values_at('warning', 'error', 'fatal')
end
383
+
384
# Compares, per series, the recent value with a percentile of the period.
#
# For each series the requested percentile of the non-null values is
# computed with Array#percentile (not core Ruby; presumably supplied by the
# array_stats gem this file requires) and the value reported by
# #last_graphite_value is divided by it. The quotient is checked against
# every level present in +max_values+, most severe first: with
# config[:greater_than] it alerts when the quotient exceeds the level,
# otherwise when the level exceeds the quotient. config[:short_output] stops
# at the first (most severe) level that fired.
#
# A series is skipped when either the percentile or the recent value is
# missing (previously only the percentile was checked before comparing).
# The message is now a single line; the former literal spanned two source
# lines and so embedded a line break in the output.
#
# @param target [String] metric expression
# @param max_values [Hash] 'warning'/'error'/'fatal' => threshold
# @param percentile [Integer] percentile to compute, e.g. 90
# @param data_points [Integer] forwarded to #last_graphite_value
# @return [Array] [warnings, criticals, fatal], each an Array of messages
def check_percentile(target, max_values, percentile, data_points = 1)
  values = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless values

  by_level = { 'warning' => [], 'error' => [], 'fatal' => [] }
  values.each do |data|
    series_name = data[:target]
    values_array = data[:datapoints].map(&:first).compact
    percentile_value = values_array.percentile(percentile)
    last_value = last_values[series_name]
    next if last_value.nil? || percentile_value.nil?

    percent = last_value / percentile_value
    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      breached = config[:greater_than] ? percent > limit : limit > percent
      next unless breached

      by_level[type] << "The percentile value of metric #{series_name} (#{last_value}) is #{greater_less} than the " \
                        "#{percentile}th percentile (#{percentile_value}) by more than #{max_value}%"
      break if config[:short_output]
    end
  end
  by_level.values_at('warning', 'error', 'fatal')
end
423
+
424
# Compares the most recent value of every series with fixed thresholds.
#
# #last_graphite_metric yields, per series, a list of [value, timestamp]
# pairs (newest first), so the value to compare is the first element of the
# first pair. Taking only `.first` of the list, as was done before, handed a
# whole pair to the numeric comparison, which raises. Series without a
# non-null datapoint are skipped.
#
# Every level present in +max_values+ is checked, most severe first: with
# config[:greater_than] it alerts when the value exceeds the level,
# otherwise when the level exceeds the value. config[:short_output] stops at
# the first (most severe) level that fired.
#
# @param target [String] metric expression
# @param max_values [Hash] 'warning'/'error'/'fatal' => threshold
# @return [Array] [warnings, criticals, fatal], each an Array of messages
def check_last(target, max_values)
  last_targets = last_graphite_metric target
  return [[], [], []] unless last_targets

  by_level = { 'warning' => [], 'error' => [], 'fatal' => [] }
  last_targets.each do |target_name, points|
    newest = points && points.first
    last_value = newest && newest.first
    next if last_value.nil?

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)

      max_value = max_values[type]
      limit = max_value.to_f
      breached = config[:greater_than] ? last_value > limit : limit > last_value
      next unless breached

      by_level[type] << "The metric #{target_name} is #{last_value} that is #{greater_less} than max allowed #{max_value}"
      break if config[:short_output]
    end
  end
  by_level.values_at('warning', 'error', 'fatal')
end
459
+
460
# Entry point: runs every enabled check for each comma separated target and
# reports the combined result.
#
# Each check returns [warnings, criticals, fatals]; the three lists are
# accumulated over all targets and checks. Fatal and critical findings are
# both reported through `critical`, warnings through `warning`, and `ok`
# is called last. With config[:concat_output] the less severe messages are
# appended to the more severe ones before reporting.
def run
  targets = config[:target].split(',')
  @period = config[:period]
  warnings = []
  critical_errors = []
  fatals = []

  targets.each do |target|
    outcomes = []
    outcomes << check_increasing(target) if config[:check_function_increasing]
    outcomes << check_last(target, get_levels(config[:check_last])) if config[:check_last]
    outcomes << check_average(target, get_levels(config[:check_average])) if config[:check_average]
    if config[:check_average_percent]
      levels = get_levels config[:check_average_percent]
      outcomes << check_average_percent(target, levels, config[:data_points].to_i)
    end
    if config[:check_percentile]
      levels = get_levels config[:check_percentile]
      outcomes << check_percentile(target, levels, config[:percentile].to_i, config[:data_points].to_i)
    end

    outcomes.each do |found_warnings, found_criticals, found_fatals|
      warnings.concat(found_warnings)
      critical_errors.concat(found_criticals)
      fatals.concat(found_fatals)
    end
  end

  fatals_string = fatals.join("\n")
  criticals_string = critical_errors.join("\n")
  warnings_string = warnings.join("\n")

  if config[:concat_output]
    fatals_string = fatals_string + "\n" + criticals_string unless critical_errors.empty?
    unless warnings.empty?
      fatals_string = fatals_string + "\nGraphite WARNING: " + warnings_string
      criticals_string = criticals_string + "\nGraphite WARNING: " + warnings_string
    end
  end

  critical fatals_string unless fatals.empty?
  critical criticals_string unless critical_errors.empty?
  warning warnings_string unless warnings.empty?
  ok
end
521
+ end