sensu-plugins-graphite-donotuse 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,158 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-stats
4
+ #
5
+ # DESCRIPTION:
6
+ # Checks metrics in graphite, averaged over a period of time.
7
+ #
8
+ # The fired sensu event will only be critical if a stat is
9
+ # above the critical threshold. Otherwise, the event will be warning,
10
+ # if a stat is above the warning threshold.
11
+ #
12
+ # Multiple stats will be checked if * are used
13
+ # in the "target" query.
14
+ #
15
+ # OUTPUT:
16
+ # plain text
17
+ #
18
+ # PLATFORMS:
19
+ # Linux
20
+ #
21
+ # DEPENDENCIES:
22
+ # gem: sensu-plugin
23
+ #
24
+ # USAGE:
25
+ # example commands
26
+ #
27
+ # NOTES:
28
+ #
29
+ # LICENSE:
30
+ # Alan Smith (alan@asmith.me)
31
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
32
+ # for details.
33
+ #
34
+
35
+ require 'json'
36
+ require 'net/http'
37
+ require 'sensu-plugin/check/cli'
38
+
39
# Sensu check that fetches every series matching --target from Graphite's
# render endpoint, averages each series over --period and compares the
# average with the optional --warn / --crit thresholds.
class CheckGraphiteStat < Sensu::Plugin::Check::CLI
  # Ranks the status codes returned by #danger from least to most severe.
  # UNKNOWN (3) deliberately ranks below CRITICAL (2): one series without
  # datapoints must not hide another series that breached its critical
  # threshold.
  SEVERITY_RANK = { 0 => 0, 1 => 1, 3 => 2, 2 => 3 }.freeze

  option :host,
         short: '-h HOST',
         long: '--host HOST',
         description: 'graphite hostname',
         proc: proc(&:to_s),
         default: 'graphite'

  option :period,
         short: '-p PERIOD',
         long: '--period PERIOD',
         description: 'The period back in time to extract from Graphite. Use -24hours, -2days, -15mins, etc, same format as in Graphite',
         proc: proc(&:to_s),
         required: true

  option :target,
         short: '-t TARGET',
         long: '--target TARGET',
         description: 'The graphite metric name. Can include * to query multiple metrics',
         proc: proc(&:to_s),
         required: true

  option :warn,
         short: '-w WARN',
         long: '--warn WARN',
         description: 'Warning level',
         proc: proc(&:to_f),
         required: false

  option :crit,
         short: '-c Crit',
         long: '--crit CRIT',
         description: 'Critical level',
         proc: proc(&:to_f),
         required: false

  option :unknown_ignore,
         short: '-u',
         long: '--unknown-ignore',
         description: "Do nothing for UNKNOWN status (when you wildcard-match a ton of metrics at once and you don't care for a few missing data)",
         boolean: true,
         default: false

  option :reverse_scale,
         short: '-r',
         long: '--reverse-scale',
         description: 'Reverse the warning/crit scale (if value is less than instead of greater than)',
         boolean: true,
         default: false

  # Arithmetic mean of a list of numbers.
  #
  # @param a [#to_a] values; each element is converted with #to_f
  # @return [Float, nil] the mean, or nil when the list is empty (the
  #   original raised ZeroDivisionError in that case)
  def average(a)
    values = a.to_a
    return nil if values.empty?

    values.inject(0.0) { |total, i| total + i.to_f } / values.length
  end

  # Evaluates a single series of the render response.
  #
  # @param metric [Hash] one element of the JSON response; reads the
  #   'target' and 'datapoints' keys, where the first item of every
  #   datapoint is taken as the value
  # @return [Array(Integer, String|nil)] status code (0 ok, 1 warning,
  #   2 critical, 3 unknown) and the message to report, nil when ok
  def danger(metric)
    datapoints = metric['datapoints'].map(&:first).compact

    if datapoints.empty?
      return [0, nil] if config[:unknown_ignore]
      return [3, "#{metric['target']} has no datapoints"]
    end

    avg = average(datapoints)
    text = "#{metric['target']} is #{avg}"
    # Critical is tested first so it wins when both thresholds are crossed.
    return [2, text] if breached?(avg, config[:crit])
    return [1, text] if breached?(avg, config[:warn])

    [0, nil]
  end

  # Queries Graphite, evaluates every returned series and exits through the
  # Sensu status helper that matches the most severe result.
  def run
    # URI.encode was removed in Ruby 3.0 and never escaped '&', '=' or '+';
    # encode_www_form escapes every query component properly.
    query = URI.encode_www_form([['format', 'json'],
                                 ['target', config[:target]],
                                 ['from', config[:period]]])
    body =
      begin
        Net::HTTP.get_response(URI.parse("http://#{config[:host]}/render?#{query}")).body
      rescue => e
        warning "Failed to query graphite: #{e.inspect}"
      end

    data =
      begin
        JSON.parse(body)
      rescue StandardError
        # Best effort: an unparsable body is reported as "no data" below.
        []
      end

    # A non-array payload (e.g. a JSON error object) cannot be iterated as a
    # list of series, so it is treated like an empty response.
    unknown 'No data from graphite' unless data.is_a?(Array) && !data.empty?

    status = 0
    message = ''
    data.each do |metric|
      s, msg = danger(metric)

      message += "#{msg} " unless s == 0
      status = s if SEVERITY_RANK[s] > SEVERITY_RANK[status]
    end

    case status
    when 2 then critical message
    when 1 then warning message
    when 3 then unknown message
    end
    ok
  end

  private

  # True when +avg+ is on the wrong side of +threshold+. A nil threshold
  # (option not given) never triggers. With --reverse-scale the comparison
  # is "less than" instead of "greater than".
  def breached?(avg, threshold)
    return false if threshold.nil?

    config[:reverse_scale] ? avg < threshold : avg > threshold
  end
end
@@ -0,0 +1,521 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # <script name>
4
+ #
5
+ # DESCRIPTION:
6
+ # Get time series values from Graphite and create events based on values
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: array_stats
17
+ #
18
+ # USAGE:
19
+ # #YELLOW
20
+ #
21
+ # NOTES:
22
+ #
23
+ # LICENSE:
24
+ # Copyright 2012 Ulf Mansson @ Recorded Future
25
+ # Modifications by Chris Jansen to support wildcard targets
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'json'
32
+ require 'net/http'
33
+ require 'socket'
34
+ require 'array_stats'
35
+
36
# Sensu check that reads time series from Graphite's render endpoint and
# collects warning / critical / fatal messages from the checks selected on
# the command line (increasing function, last value, average, average
# percent, percentile).
#
# Threshold options take a comma separated list "WARNING,ERROR,FATAL";
# "error" and "fatal" messages are both reported with critical status.
class Graphite < Sensu::Plugin::Check::CLI
  # Threshold names in the positional order used by the threshold options.
  LEVEL_NAMES = %w(warning error fatal).freeze

  # Same names, most severe first; checks walk them in this order so that
  # --short_output keeps only the highest level reached per series.
  LEVELS_BY_SEVERITY = LEVEL_NAMES.reverse.freeze

  option :host,
         short: '-h HOST',
         long: '--host HOST',
         description: 'Graphite host to connect to, include port',
         required: true

  option :target,
         description: 'The graphite metric name. Could be a comma separated list of metric names.',
         short: '-t TARGET',
         long: '--target TARGET',
         required: true

  option :period,
         description: 'The period back in time to extract from Graphite and compare with. Use 24hours,2days etc, same format as in Graphite',
         short: '-p PERIOD',
         long: '--period PERIOD',
         default: '2hours'

  option :updated_since,
         description: 'The graphite value should have been updated within UPDATED_SINCE seconds, default to 600 seconds',
         short: '-u UPDATED_SINCE',
         long: '--updated_since UPDATED_SINCE',
         default: 600

  # NOTE(review): the short flag -d is also declared by :data_points below,
  # so only one of the two can actually be reached through -d. Left as is to
  # keep the command line interface unchanged; prefer the long flags.
  option :acceptable_diff_percentage,
         description: 'The acceptable diff from max values in percentage, used in check_function_increasing',
         short: '-d ACCEPTABLE_DIFF_PERCENTAGE',
         long: '--acceptable_diff_percentage ACCEPTABLE_DIFF_PERCENTAGE',
         default: 0

  # NOTE(review): the long flag says "decreasing" while the option name and
  # description say "increasing". Left as is because renaming the flag would
  # break existing check definitions.
  option :check_function_increasing,
         description: 'Check that value is increasing or equal over time (use acceptable_diff_percentage if it should allow to be lower)',
         short: '-i',
         long: '--check_function_decreasing',
         default: false,
         boolean: true

  option :greater_than,
         description: 'Change whether value is greater than or less than check',
         short: '-g',
         long: '--greater_than',
         default: false

  option :check_last,
         description: 'Check that the last value in GRAPHITE is greater/less than VALUE',
         short: '-l VALUE',
         long: '--last VALUE',
         default: nil

  option :ignore_nulls,
         description: 'Do not error on null values, used in check_function_increasing',
         short: '-n',
         long: '--ignore_nulls',
         default: false,
         boolean: true

  option :concat_output,
         description: 'Include warning messages in output even if overall status is critical',
         short: '-c',
         long: '--concat_output',
         default: false,
         boolean: true

  option :short_output,
         description: 'Report only the highest status per series in output',
         short: '-s',
         long: '--short_output',
         default: false,
         boolean: true

  option :check_average,
         description: 'MAX_VALUE should be greater than the average of Graphite values from PERIOD',
         short: '-a MAX_VALUE',
         long: '--average_value MAX_VALUE'

  option :data_points,
         description: 'Number of data points to include in average check (smooths out spikes)',
         short: '-d VALUE',
         long: '--data_points VALUE',
         default: 1

  option :check_average_percent,
         description: 'MAX_VALUE% should be greater than the average of Graphite values from PERIOD',
         short: '-b MAX_VALUE',
         long: '--average_percent_value MAX_VALUE'

  option :percentile,
         description: 'Percentile value, should be used in conjunction with percentile_value, defaults to 90',
         long: '--percentile PERCENTILE',
         default: 90

  option :check_percentile,
         description: 'Values should not be greater than the VALUE of Graphite values from PERIOD',
         long: '--percentile_value VALUE'

  option :http_user,
         description: 'Basic HTTP authentication user',
         short: '-U USER',
         long: '--http-user USER',
         default: nil

  # The placeholder used to read USER, which mislabelled the help output.
  option :http_password,
         description: 'Basic HTTP authentication password',
         short: '-P PASSWORD',
         long: '--http-password PASSWORD',
         default: nil

  # Forwards any arguments to the parent constructor and sets up the
  # per-run response cache, keyed by target string.
  def initialize(*args)
    super
    @graphite_cache = {}
  end

  # Looks up cached series for +target+ fetched with the current @period.
  #
  # @return [Array<Hash>, nil] cached entries ({target:, period:, datapoints:})
  #   or nil when nothing usable is cached
  def graphite_cache(target = nil)
    return unless @graphite_cache.key?(target)

    cached = @graphite_cache[target].select { |value| value[:period] == @period }
    cached unless cached.empty?
  end

  # Builds the render URL for the configured host.
  #
  # @param target [String, nil] when given, appended as an escaped
  #   "target" query parameter so that targets containing characters such as
  #   spaces, braces or '&' still produce a parsable URI
  # @return [URI]
  def graphite_url(target = nil)
    url = "#{config[:host]}/render/"
    # Only a real scheme counts; a host name that merely starts with "http"
    # (e.g. "http-graphite:8080") still needs the prefix.
    url = 'http://' + url unless url =~ %r{\Ahttps?://}
    url += "?#{URI.encode_www_form([['target', target]])}" if target
    URI.parse(url)
  end

  # Splits a "WARNING,ERROR,FATAL" threshold string into a hash.
  #
  # @param config_param [String] comma separated thresholds
  # @return [Hash{String=>String}] keys 'warning', 'error', 'fatal' for the
  #   positions that were supplied
  def get_levels(config_param)
    values = config_param.split(',')
    levels = {}
    LEVEL_NAMES.each_with_index do |type, i|
      levels[type] = values[i] if values[i]
    end
    levels
  end

  # Fetches the series for +target+ over @period, using the cache when
  # possible.
  #
  # @return [Array<Hash>, nil] one {target:, period:, datapoints:} hash per
  #   returned series, or nil when Graphite returned no series
  def get_graphite_values(target)
    cache_value = graphite_cache target
    return cache_value if cache_value

    params = {
      target: target,
      from: "-#{@period}",
      format: 'json'
    }

    uri = graphite_url
    req = Net::HTTP::Post.new(uri.path)

    # If the basic http authentication credentials have been provided, then use them
    if !config[:http_user].nil? && !config[:http_password].nil?
      req.basic_auth(config[:http_user], config[:http_password])
    end

    req.set_form_data(params)
    http = Net::HTTP.new(uri.host, uri.port)
    # Without this an https:// host would be contacted in plain text.
    http.use_ssl = uri.scheme == 'https'
    resp = http.start { |connection| connection.request(req) }
    data = JSON.parse(resp.body)
    @graphite_cache[target] = []
    return if data.empty?

    data.each { |d| @graphite_cache[target] << { target: d['target'], period: @period, datapoints: d['datapoints'] } }
    graphite_cache target
  end

  # Maximum value per series, ignoring each series' most recent datapoint.
  #
  # @return [Hash{String=>Numeric,nil}] series name => max value
  def max_graphite_value(target)
    max_values = {}
    (get_graphite_values(target) || []).each do |val|
      max_values[val[:target]] = get_max_value(val[:datapoints])
    end
    max_values
  end

  # Maximum of all datapoint values except the last one; null values count
  # as 0.
  #
  # @param values [Array<Array>, nil] datapoints whose first item is the value
  # @return [Numeric, nil] nil when +values+ is nil or has fewer than two items
  def get_max_value(values)
    return unless values

    values[0..-2].map { |point| point[0] || 0 }.max
  end

  # Most recent non-null datapoints per series.
  #
  # @return [Hash{String=>Array<Array>,nil}] series name => datapoints, newest first
  def last_graphite_metric(target, count = 1)
    last_values = {}
    (get_graphite_values(target) || []).each do |val|
      last_values[val[:target]] = get_last_metric(val[:datapoints], count)
    end
    last_values
  end

  # Picks up to +count+ datapoints with a non-null value, scanning from the
  # newest datapoint backwards and stopping at a nil entry.
  #
  # The previous index-based loop kept decrementing past index 0; negative
  # indices wrap around in Ruby, so the newest datapoints were returned a
  # second time whenever fewer than +count+ non-null values existed.
  #
  # @param values [Array<Array>, nil] datapoints whose first item is the value
  # @return [Array<Array>, nil] selected datapoints, newest first; nil when
  #   +values+ is nil
  def get_last_metric(values, count = 1)
    return unless values

    wanted = [count, 0].max
    values.reverse_each
          .take_while { |point| !point.nil? }
          .select { |point| point[0] }
          .first(wanted)
  end

  # Mean of the most recent +count+ non-null values per series.
  #
  # @return [Hash{String=>Numeric,nil}] series name => mean, nil when the
  #   series has no non-null value
  def last_graphite_value(target, count = 1)
    last_values = {}
    last_graphite_metric(target, count).each do |target_name, metrics|
      recent = (metrics || []).map { |metric| metric[0] }
      # Report "no value" explicitly instead of depending on what the
      # array_stats #mean extension yields for an empty array.
      last_values[target_name] = recent.empty? ? nil : recent.mean
    end
    last_values
  end

  # Warns for every series whose newest non-null datapoint is not newer
  # than +time+.
  #
  # #last_graphite_metric yields a list of datapoints per series, so the
  # timestamp is the second item of the first (newest) datapoint. The old
  # code indexed the list itself and hit nil. A series without any non-null
  # datapoint is reported as not updated.
  #
  # @param time [Time] oldest acceptable update time
  # @param updated_since [Integer] seconds, used in the message only
  # @return [Array<String>] warning messages
  def been_updated_since(target, time, updated_since)
    warnings = []
    last_graphite_metric(target).each do |target_name, metrics|
      newest = metrics && metrics.first
      next if newest && newest[1] > time.to_i

      warnings << "The metric #{target_name} has not been updated in #{updated_since} seconds"
    end
    warnings
  end

  # Word describing the comparison direction selected by --greater_than.
  def greater_less
    config[:greater_than] ? 'greater' : 'less'
  end

  # Checks that each series' last value has not dropped below the series'
  # earlier maximum by more than --acceptable_diff_percentage.
  #
  # @return [Array(Array<String>, Array<String>, Array)] warnings,
  #   criticals, fatals (always empty here)
  def check_increasing(target)
    updated_since = config[:updated_since].to_i
    time_to_be_updated_since = Time.now - updated_since
    critical_errors = []
    warnings = []
    max_gv = max_graphite_value target
    last_gv = last_graphite_value target
    # Both helpers always return a Hash, so the former is_a?(Hash) test
    # could never reach the "no value" warning; emptiness is what signals
    # that Graphite returned nothing.
    if last_gv.empty? || max_gv.empty?
      warnings << "Could not find any value in Graphite for metric #{target}, see #{graphite_url(target)}"
    else
      tolerance = 1 + config[:acceptable_diff_percentage].to_f / 100
      last_gv.each do |target_name, last|
        max = max_gv[target_name]
        next unless last && max
        next unless max > last * tolerance

        # Name the series, not the (possibly wildcard) query, so that
        # several failing series can be told apart.
        critical_errors << "The metric #{target_name} with last value #{last} is less than max value #{max} during #{config[:period]} period"
      end
    end
    unless config[:ignore_nulls]
      warnings.concat(been_updated_since(target, time_to_be_updated_since, updated_since))
    end
    [warnings, critical_errors, []]
  end

  # Compares the ratio "mean of the last +data_points+ values / average of
  # the whole period" with the thresholds.
  #
  # @param max_values [Hash{String=>String}] thresholds from #get_levels
  # @return [Array(Array<String>, Array<String>, Array<String>)] warnings,
  #   criticals, fatals
  def check_average_percent(target, max_values, data_points = 1)
    values = get_graphite_values target
    return [[], [], []] unless values

    last_values = last_graphite_value(target, data_points)
    warnings = []
    criticals = []
    fatal = []
    values.each do |data|
      name = data[:target]
      values_array = series_values(data)
      avg_value = values_array.reduce(:+).to_f / values_array.size
      last_value = last_values[name]
      next if last_value.nil?
      next unless values_array.size > 0 || !config[:ignore_nulls]

      percent = last_value / avg_value
      LEVELS_BY_SEVERITY.each do |type|
        next unless max_values.key?(type)

        max_value = max_values[type]
        next unless threshold_breached?(percent, max_value)

        text = "The last value of metric #{name} is #{percent}% #{greater_less} than allowed #{max_value}% of the average value #{avg_value}"
        record_alert(type, text, warnings, criticals, fatal)
        break if config[:short_output]
      end
    end
    [warnings, criticals, fatal]
  end

  # Compares the average of each series over the period with the thresholds.
  #
  # @param max_values [Hash{String=>String}] thresholds from #get_levels
  # @return [Array(Array<String>, Array<String>, Array<String>)] warnings,
  #   criticals, fatals
  def check_average(target, max_values)
    values = get_graphite_values target
    return [[], [], []] unless values

    warnings = []
    criticals = []
    fatal = []
    values.each do |data|
      name = data[:target]
      values_array = series_values(data)
      # For a series without values this is NaN, which never breaches.
      avg_value = values_array.reduce(:+).to_f / values_array.size
      next unless values_array.size > 0 || !config[:ignore_nulls]

      LEVELS_BY_SEVERITY.each do |type|
        next unless max_values.key?(type)

        max_value = max_values[type]
        next unless threshold_breached?(avg_value, max_value)

        text = "The average value of metric #{name} is #{avg_value} that is #{greater_less} than allowed average of #{max_value}"
        record_alert(type, text, warnings, criticals, fatal)
        break if config[:short_output]
      end
    end
    [warnings, criticals, fatal]
  end

  # Compares the ratio "mean of the last +data_points+ values / percentile
  # of the period" with the thresholds.
  #
  # A series is skipped when either the last value or the percentile is
  # missing; the old guard only looked at the percentile and then compared
  # a nil ratio.
  #
  # @param percentile [Integer] percentile passed to the array_stats
  #   #percentile extension
  # @return [Array(Array<String>, Array<String>, Array<String>)] warnings,
  #   criticals, fatals
  def check_percentile(target, max_values, percentile, data_points = 1)
    values = get_graphite_values target
    return [[], [], []] unless values

    last_values = last_graphite_value(target, data_points)
    warnings = []
    criticals = []
    fatal = []
    values.each do |data|
      name = data[:target]
      values_array = series_values(data)
      percentile_value = values_array.percentile(percentile)
      last_value = last_values[name]
      next if last_value.nil? || percentile_value.nil?

      # to_f keeps this a float division even if both operands are integers.
      percent = last_value.to_f / percentile_value
      LEVELS_BY_SEVERITY.each do |type|
        next unless max_values.key?(type)

        max_value = max_values[type]
        next unless threshold_breached?(percent, max_value)

        # Single-line message: the literal used to span two source lines
        # and leaked a newline plus indentation into the alert text.
        text = "The percentile value of metric #{name} (#{last_value}) is #{greater_less} than the " \
               "#{percentile}th percentile (#{percentile_value}) by more than #{max_value}%"
        record_alert(type, text, warnings, criticals, fatal)
        break if config[:short_output]
      end
    end
    [warnings, criticals, fatal]
  end

  # Compares the newest non-null value of each series with the thresholds.
  #
  # #last_graphite_metric yields datapoints, so the value is the first item
  # of the newest datapoint; comparing the datapoint itself (an Array) with
  # a Float raised as soon as any data was present.
  #
  # @param max_values [Hash{String=>String}] thresholds from #get_levels
  # @return [Array(Array<String>, Array<String>, Array<String>)] warnings,
  #   criticals, fatals
  def check_last(target, max_values)
    last_targets = last_graphite_metric target
    return [[], [], []] unless last_targets

    warnings = []
    criticals = []
    fatal = []
    last_targets.each do |target_name, last|
      newest = last && last.first
      next if newest.nil?

      last_value = newest[0]
      LEVELS_BY_SEVERITY.each do |type|
        next unless max_values.key?(type)

        max_value = max_values[type]
        next unless threshold_breached?(last_value, max_value)

        text = "The metric #{target_name} is #{last_value} that is #{greater_less} than max allowed #{max_value}"
        record_alert(type, text, warnings, criticals, fatal)
        break if config[:short_output]
      end
    end
    [warnings, criticals, fatal]
  end

  # Runs every selected check against every target (the --target value is
  # split on commas) and exits with the most severe status found.
  def run
    targets = config[:target].split(',')
    @period = config[:period]
    critical_errors = []
    warnings = []
    fatals = []
    targets.each do |target|
      results = []
      results << check_increasing(target) if config[:check_function_increasing]
      if config[:check_last]
        results << check_last(target, get_levels(config[:check_last]))
      end
      if config[:check_average]
        results << check_average(target, get_levels(config[:check_average]))
      end
      if config[:check_average_percent]
        results << check_average_percent(target, get_levels(config[:check_average_percent]), config[:data_points].to_i)
      end
      if config[:check_percentile]
        results << check_percentile(target, get_levels(config[:check_percentile]), config[:percentile].to_i, config[:data_points].to_i)
      end
      results.each do |found_warnings, found_criticals, found_fatals|
        warnings.concat(found_warnings)
        critical_errors.concat(found_criticals)
        fatals.concat(found_fatals)
      end
    end
    fatals_string = fatals.join("\n")
    criticals_string = critical_errors.join("\n")
    warnings_string = warnings.join("\n")

    if config[:concat_output]
      # Append the less severe messages to the more severe outputs.
      fatals_string = fatals_string + "\n" + criticals_string unless critical_errors.empty?
      fatals_string = fatals_string + "\nGraphite WARNING: " + warnings_string unless warnings.empty?
      criticals_string = criticals_string + "\nGraphite WARNING: " + warnings_string unless warnings.empty?
    end

    # Each status helper terminates the check, so only the first match fires.
    critical fatals_string unless fatals.empty?
    critical criticals_string unless critical_errors.empty?
    warning warnings_string unless warnings.empty?
    ok
  end

  private

  # Non-null values of one cached series, in datapoint order.
  def series_values(data)
    data[:datapoints].select(&:first).map(&:first)
  end

  # True when +value+ is on the wrong side of +max_value+: above it with
  # --greater_than, below it otherwise. +max_value+ is converted with #to_f.
  def threshold_breached?(value, max_value)
    limit = max_value.to_f
    config[:greater_than] ? value > limit : limit > value
  end

  # Appends +text+ to the message list that belongs to threshold +type+.
  def record_alert(type, text, warnings, criticals, fatal)
    case type
    when 'warning'
      warnings << text
    when 'error'
      criticals << text
    when 'fatal'
      fatal << text
    else
      fail "Unknown type #{type}"
    end
  end
end