sensu-plugins-graphite 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-stats
4
+ #
5
+ # DESCRIPTION:
6
+ # Checks metrics in graphite, averaged over a period of time.
7
+ #
8
+ # The fired sensu event will only be critical if a stat is
9
+ # above the critical threshold. Otherwise, the event will be warning,
10
+ # if a stat is above the warning threshold.
11
+ #
12
+ # Multiple stats will be checked if * are used
13
+ # in the "target" query.
14
+ #
15
+ # OUTPUT:
16
+ # plain text
17
+ #
18
+ # PLATFORMS:
19
+ # Linux
20
+ #
21
+ # DEPENDENCIES:
22
+ # gem: sensu-plugin
23
+ # gem: <?>
24
+ #
25
+ # USAGE:
26
+ # example commands
27
+ #
28
+ # NOTES:
29
+ #
30
+ # LICENSE:
31
+ # Alan Smith (alan@asmith.me)
32
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
33
+ # for details.
34
+ #
35
+
36
+ require 'json'
37
+ require 'net/http'
38
+ require 'sensu-plugin/check/cli'
39
+
40
+ class CheckGraphiteStat < Sensu::Plugin::Check::CLI
41
# Command-line options. Each declaration becomes an entry in `config`.

# Graphite host used to build the render URL.
option :host,
       description: 'graphite hostname',
       long: '--host HOST',
       short: '-h HOST',
       default: 'graphite',
       proc: :to_s.to_proc

# Look-back window, passed to Graphite as the `from` parameter.
option :period,
       description: 'The period back in time to extract from Graphite. Use -24hours, -2days, -15mins, etc, same format as in Graphite',
       long: '--period PERIOD',
       short: '-p PERIOD',
       required: true,
       proc: :to_s.to_proc

# Metric (or wildcard expression) passed to Graphite as `target`.
option :target,
       description: 'The graphite metric name. Can include * to query multiple metrics',
       long: '--target TARGET',
       short: '-t TARGET',
       required: true,
       proc: :to_s.to_proc

# Optional warning threshold, converted to a Float.
option :warn,
       description: 'Warning level',
       long: '--warn WARN',
       short: '-w WARN',
       required: false,
       proc: :to_f.to_proc

# Optional critical threshold, converted to a Float.
option :crit,
       description: 'Critical level',
       long: '--crit CRIT',
       short: '-c Crit',
       required: false,
       proc: :to_f.to_proc

# When set, series without datapoints are not reported as UNKNOWN.
option :unknown_ignore,
       description: "Do nothing for UNKNOWN status (when you wildcard-match a ton of metrics at once and you don't care for a few missing data)",
       long: '--unknown-ignore',
       short: '-u',
       default: false,
       boolean: true
82
+
83
# Arithmetic mean of a collection of numeric-like values.
#
# The argument is materialised with #to_a, so any Enumerable (or nil) is
# accepted, and every element is converted with #to_f before summing.
#
# a - values to average
#
# Returns the mean as a Float, or 0.0 when there are no values.
def average(a)
  values = a.to_a
  # Handle the empty case explicitly instead of dividing by zero.
  return 0.0 if values.empty?

  values.reduce(0.0) { |total, value| total + value.to_f } / values.length
end
89
+
90
# Classify one Graphite series against the configured thresholds.
#
# The non-null datapoint values are averaged and compared with
# config[:crit] and config[:warn] (strictly greater than).
#
# metric - Hash with a 'target' name and 'datapoints', a list of
#          [value, timestamp] pairs; null values are ignored
#
# Returns [status, message]:
#   [2, text]  average is above the critical threshold
#   [1, text]  average is above the warning threshold
#   [3, text]  no usable datapoints and config[:unknown_ignore] is not set
#   [0, nil]   otherwise
def danger(metric)
  # Array() lets a series without a 'datapoints' key count as "no datapoints".
  datapoints = Array(metric['datapoints']).map(&:first).compact

  if datapoints.empty?
    return [0, nil] if config[:unknown_ignore]
    return [3, "#{metric['target']} has no datapoints"]
  end

  avg = average(datapoints)
  report = "#{metric['target']} is #{avg}"
  return [2, report] if !config[:crit].nil? && avg > config[:crit]
  return [1, report] if !config[:warn].nil? && avg > config[:warn]

  [0, nil]
end
107
+
108
# Entry point of the check: query Graphite's render endpoint, classify
# every returned series with #danger and report the highest status.
#
# The query parameters are form-encoded so that targets containing
# characters such as spaces, '&' or '#' produce a valid URL and cannot
# inject additional parameters.
def run
  body =
    begin
      query = URI.encode_www_form(format: 'json', target: config[:target], from: config[:period])
      uri = URI("http://#{config[:host]}/render?#{query}")
      Net::HTTP.get_response(uri).body
    rescue => e
      warning "Failed to query graphite: #{e.inspect}"
    end

  # Anything that is not parseable JSON is treated as "no data".
  data =
    begin
      JSON.parse(body)
    rescue StandardError
      []
    end

  unknown 'No data from graphite' if data.empty?

  status = 0
  problems = []
  data.each do |metric|
    s, msg = danger(metric)

    problems << msg unless s == 0
    # Keep the numerically highest status seen so far.
    status = s unless s < status
  end
  # Each message is followed by a single space, including the last one.
  message = problems.map { |text| "#{text} " }.join

  case status
  when 2 then critical message
  when 1 then warning message
  when 3 then unknown message
  end
  ok
end
145
+ end
@@ -0,0 +1,530 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # <script name>
4
+ #
5
+ # DESCRIPTION:
6
+ # Get time series values from Graphite and create events based on values
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: json
17
+ # gem: socket
18
+ # gem: array_stats
19
+ # gem: net/http
20
+ #
21
+ # USAGE:
22
+ # #YELLOW
23
+ #
24
+ # NOTES:
25
+ #
26
+ # LICENSE:
27
+ # Copyright 2012 Ulf Mansson @ Recorded Future
28
+ # Modifications by Chris Jansen to support wildcard targets
29
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
30
+ # for details.
31
+ #
32
+
33
+ require 'sensu-plugin/check/cli'
34
+ require 'json'
35
+ require 'net/http'
36
+ require 'socket'
37
+ require 'array_stats'
38
+
39
+ class Graphite < Sensu::Plugin::Check::CLI
40
# Command-line options. Each declaration becomes an entry in `config`.

option :host,
       short: '-h HOST',
       long: '--host HOST',
       description: 'Graphite host to connect to, include port',
       required: true

option :target,
       description: 'The graphite metric name. Could be a comma separated list of metric names.',
       short: '-t TARGET',
       long: '--target TARGET',
       required: true

option :period,
       description: 'The period back in time to extract from Graphite and compare with. Use 24hours,2days etc, same format as in Graphite',
       short: '-p PERIOD',
       long: '--period PERIOD',
       default: '2hours'

option :updated_since,
       description: 'The graphite value should have been updated within UPDATED_SINCE seconds, default to 600 seconds',
       short: '-u UPDATED_SINCE',
       long: '--updated_since UPDATED_SINCE',
       default: 600

# NOTE(review): this option and :data_points below both declare the short
# flag -d. Which one `-d` resolves to depends on the option parser; prefer
# the long flags until one of the short flags is renamed.
option :acceptable_diff_percentage,
       description: 'The acceptable diff from max values in percentage, used in check_function_increasing',
       short: '-d ACCEPTABLE_DIFF_PERCENTAGE',
       long: '--acceptable_diff_percentage ACCEPTABLE_DIFF_PERCENTAGE',
       default: 0

# NOTE(review): the long flag says "decreasing" although the option enables
# the *increasing* check. Left unchanged so existing command lines keep
# working.
option :check_function_increasing,
       description: 'Check that value is increasing or equal over time (use acceptable_diff_percentage if it should allow to be lower)',
       short: '-i',
       long: '--check_function_decreasing',
       default: false,
       boolean: true

option :greater_than,
       description: 'Change whether value is greater than or less than check',
       short: '-g',
       long: '--greater_than',
       default: false

option :check_last,
       description: 'Check that the last value in GRAPHITE is greater/less than VALUE',
       short: '-l VALUE',
       long: '--last VALUE',
       default: nil

option :ignore_nulls,
       description: 'Do not error on null values, used in check_function_increasing',
       short: '-n',
       long: '--ignore_nulls',
       default: false,
       boolean: true

option :concat_output,
       description: 'Include warning messages in output even if overall status is critical',
       short: '-c',
       long: '--concat_output',
       default: false,
       boolean: true

option :short_output,
       description: 'Report only the highest status per series in output',
       short: '-s',
       long: '--short_output',
       default: false,
       boolean: true

option :check_average,
       description: 'MAX_VALUE should be greater than the average of Graphite values from PERIOD',
       short: '-a MAX_VALUE',
       long: '--average_value MAX_VALUE'

# NOTE(review): shares the short flag -d with :acceptable_diff_percentage.
option :data_points,
       description: 'Number of data points to include in average check (smooths out spikes)',
       short: '-d VALUE',
       long: '--data_points VALUE',
       default: 1

option :check_average_percent,
       description: 'MAX_VALUE% should be greater than the average of Graphite values from PERIOD',
       short: '-b MAX_VALUE',
       long: '--average_percent_value MAX_VALUE'

option :percentile,
       description: 'Percentile value, should be used in conjunction with percentile_value, defaults to 90',
       long: '--percentile PERCENTILE',
       default: 90

option :check_percentile,
       description: 'Values should not be greater than the VALUE of Graphite values from PERIOD',
       long: '--percentile_value VALUE'

option :http_user,
       description: 'Basic HTTP authentication user',
       short: '-U USER',
       long: '--http-user USER',
       default: nil

# The argument placeholder is PASSWORD (it previously read USER, which made
# the generated help text misleading).
option :http_password,
       description: 'Basic HTTP authentication password',
       short: '-P PASSWORD',
       long: '--http-password PASSWORD',
       default: nil
146
+
147
# Run the parent initializer, then start with an empty response cache.
# The cache is a Hash keyed by the target expression that was queried.
def initialize
  super
  @graphite_cache = {}
end
151
+
152
# Look up previously fetched series for a target.
#
# Only entries recorded for the current @period are returned.
#
# target - target expression used as the cache key
#
# Returns an Array of cached series hashes, or nil when nothing usable is
# cached for the target.
def graphite_cache(target = nil)
  return unless @graphite_cache.key?(target)

  cached = @graphite_cache[target].select { |entry| entry[:period] == @period }
  cached unless cached.empty?
end
159
+
160
# Build the URL of Graphite's render endpoint from config[:host].
#
# "http://" is prepended unless the configured host already starts with
# "http". When a target is given it is appended as a form-encoded `target`
# query parameter, so targets containing spaces, '&' and similar characters
# still yield a parseable URL.
#
# target - optional target expression to include in the query string
#
# Returns a URI object.
def graphite_url(target = nil)
  url = "#{config[:host]}/render/"
  url = "http://#{url}" unless url.start_with?('http')
  url = "#{url}?#{URI.encode_www_form(target: target)}" if target
  URI.parse(url)
end
170
+
171
# Split a comma separated threshold list into named levels.
#
# The first value is the 'warning' level, the second 'error' and the third
# 'fatal'. Missing or blank positions are left out, so "5,,20" defines a
# warning and a fatal level but no error level. Values beyond the third are
# ignored.
#
# config_param - threshold list such as "5,10,20"
#
# Returns a Hash mapping level name to the threshold text as given.
def get_levels(config_param)
  values = config_param.to_s.split(',')
  %w(warning error fatal).zip(values).each_with_object({}) do |(type, value), levels|
    next if value.nil? || value.strip.empty?
    levels[type] = value
  end
end
181
+
182
# Fetch the series for a target from Graphite, using the cache when possible.
#
# The render endpoint is queried with a form-encoded POST for the window
# "-<@period>" in JSON format. HTTP basic authentication is applied when
# both config[:http_user] and config[:http_password] are set, and TLS is
# enabled when the configured URL uses the https scheme.
#
# target - target expression to query
#
# Returns an Array of { target:, period:, datapoints: } hashes, or nil when
# Graphite returned no series.
def get_graphite_values(target)
  cache_value = graphite_cache target
  return cache_value if cache_value

  # Build the URL once and reuse it for the path, host and port.
  url = graphite_url
  req = Net::HTTP::Post.new(url.path)

  # If the basic http authentication credentials have been provided, then use them
  if !config[:http_user].nil? && !config[:http_password].nil?
    req.basic_auth(config[:http_user], config[:http_password])
  end
  req.set_form_data(target: target, from: "-#{@period}", format: 'json')

  http = Net::HTTP.new(url.host, url.port)
  http.use_ssl = (url.scheme == 'https')
  resp = http.start { |conn| conn.request(req) }

  data = JSON.parse(resp.body)
  @graphite_cache[target] = data.map do |series|
    { target: series['target'], period: @period, datapoints: series['datapoints'] }
  end
  # graphite_cache returns nil for an empty entry, which covers "no data".
  graphite_cache target
end
209
+
210
# Largest value of every series returned for +target+, computed by
# get_max_value (which leaves the most recent datapoint out, i.e. [0..-2]).
#
# Returns a Hash of series name => maximum; empty when nothing was fetched.
def max_graphite_value(target)
  series = get_graphite_values(target) || []
  series.each_with_object({}) do |entry, maxima|
    maxima[entry[:target]] = get_max_value(entry[:datapoints])
  end
end
222
+
223
# Maximum of a datapoint list, ignoring its final entry.
#
# values - list of [value, timestamp] pairs; a null value counts as 0
#
# Returns the maximum, or nil when +values+ is nil or has fewer than two
# entries.
def get_max_value(values)
  return nil unless values

  values.map { |point| point[0] || 0 }[0..-2].max
end
230
+
231
# Most recent datapoints of every series returned for +target+.
#
# count - how many datapoints to ask get_last_metric for
#
# Returns a Hash of series name => result of get_last_metric; empty when
# nothing was fetched.
def last_graphite_metric(target, count = 1)
  series = get_graphite_values(target) || []
  series.each_with_object({}) do |entry, latest|
    latest[entry[:target]] = get_last_metric(entry[:datapoints], count)
  end
end
242
+
243
# Collect the most recent datapoints that carry a value.
#
# The list is walked from its end towards its start. Datapoints whose value
# is null are skipped, and the walk stops at the start of the list, so no
# datapoint is ever returned twice.
#
# values - list of [value, timestamp] pairs, oldest first
# count  - maximum number of datapoints to return
#
# Returns an Array of datapoints, newest first (possibly empty), or nil when
# +values+ is nil.
def get_last_metric(values, count = 1)
  return nil unless values

  recent = []
  return recent if count <= 0

  values.reverse_each do |point|
    # A missing entry ends the scan.
    break if point.nil?
    next unless point[0]

    recent << point
    break if recent.size >= count
  end
  recent
end
259
+
260
# Mean of the most recent values of every series returned for +target+.
#
# count - how many recent datapoints to average per series
#
# Returns a Hash of series name => mean of the datapoint values, computed
# with Array#mean.
def last_graphite_value(target, count = 1)
  recent = last_graphite_metric(target, count)
  return {} unless recent

  recent.each_with_object({}) do |(series_name, points), averages|
    averages[series_name] = points.map { |point| point[0] }.mean
  end
end
270
+
271
# Report the series whose newest datapoint is not newer than +time+.
#
# last_graphite_metric yields, per series, a list of [value, timestamp]
# datapoints (newest first); the timestamp of the newest one is compared
# with +time+. A series without any usable datapoint is reported as well.
#
# target        - target expression to inspect
# time          - cut-off; anything responding to #to_i (e.g. a Time)
# updated_since - number of seconds, only used in the message text
#
# Returns an Array of warning messages (empty when everything is fresh).
def been_updated_since(target, time, updated_since)
  latest_by_series = last_graphite_metric(target) || {}
  threshold = time.to_i

  latest_by_series.each_with_object([]) do |(target_name, datapoints), warnings|
    points = Array(datapoints)
    # Accept a list of datapoints as well as a single bare datapoint.
    newest = points.first.is_a?(Array) ? points.first : points
    timestamp = newest[1]
    next if timestamp && timestamp > threshold

    warnings << "The metric #{target_name} has not been updated in #{updated_since} seconds"
  end
end
282
+
283
# Word used in alert messages for the comparison direction:
# 'greater' when config[:greater_than] is set, 'less' otherwise.
def greater_less
  config[:greater_than] ? 'greater' : 'less'
end
287
+
288
# Check that every series for +target+ is still at (or near) its maximum.
#
# A critical error is recorded for a series when its earlier maximum exceeds
# the last value scaled by (1 + config[:acceptable_diff_percentage] / 100).
# When Graphite returned no values at all a warning is recorded instead;
# both helpers return an empty Hash in that case, so emptiness is what is
# tested here. Unless config[:ignore_nulls] is set, the staleness warnings
# from been_updated_since are appended.
#
# Returns [warnings, critical_errors, fatals]; fatals is always empty.
def check_increasing(target)
  updated_since = config[:updated_since].to_i
  time_to_be_updated_since = Time.now - updated_since
  critical_errors = []
  warnings = []
  max_gv = max_graphite_value target
  last_gv = last_graphite_value target

  have_data = last_gv.is_a?(Hash) && max_gv.is_a?(Hash) && !last_gv.empty? && !max_gv.empty?
  if have_data
    tolerance = 1 + config[:acceptable_diff_percentage].to_f / 100
    last_gv.each do |target_name, last|
      max = max_gv[target_name]
      next unless last && max
      next unless max > last * tolerance

      # Name the individual series so wildcard targets give distinct messages.
      critical_errors << "The metric #{target_name} with last value #{last} is less than max value #{max} during #{config[:period]} period"
    end
  else
    warnings << "Could not find any value in Graphite for metric #{target}, see #{graphite_url(target)}"
  end

  unless config[:ignore_nulls]
    warnings.concat(been_updated_since(target, time_to_be_updated_since, updated_since))
  end
  [warnings, critical_errors, []]
end
315
+
316
# Compare, per series, the ratio "last value / average value" with the
# configured levels.
#
# target      - target expression to query
# max_values  - Hash of level name ('warning', 'error', 'fatal') => limit
# data_points - number of recent datapoints averaged into the last value
#
# With config[:greater_than] a level fires when the ratio exceeds its limit,
# otherwise when the limit exceeds the ratio. Levels are evaluated from
# fatal down to warning; with config[:short_output] only the first level
# that fires is reported per series.
#
# Returns [warnings, criticals, fatal], each an Array of messages.
def check_average_percent(target, max_values, data_points = 1)
  values = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless values

  alerts = { 'warning' => [], 'error' => [], 'fatal' => [] }
  values.each do |series|
    target = series[:target]
    # Keep only the values of datapoints that are not null.
    samples = series[:datapoints].select(&:first).map(&:first)
    avg_value = samples.reduce(:+).to_f / samples.size
    last_value = last_values[target]
    percent = last_value / avg_value unless last_value.nil? || avg_value.nil?

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)
      max_value = max_values[type]
      limit = max_value.to_f
      next if percent.nil?
      exceeded = config[:greater_than] ? percent > limit : limit > percent
      next unless exceeded && (!samples.empty? || !config[:ignore_nulls])

      alerts[type] << "The last value of metric #{target} is #{percent}% #{greater_less} than allowed #{max_value}% of the average value #{avg_value}"
      break if config[:short_output]
    end
  end
  alerts.values_at('warning', 'error', 'fatal')
end
355
+
356
# Compare, per series, the average of all non-null values with the
# configured levels.
#
# target     - target expression to query
# max_values - Hash of level name ('warning', 'error', 'fatal') => limit
#
# With config[:greater_than] a level fires when the average exceeds its
# limit, otherwise when the limit exceeds the average. Levels are evaluated
# from fatal down to warning; with config[:short_output] only the first
# level that fires is reported per series.
#
# Returns [warnings, criticals, fatal], each an Array of messages.
def check_average(target, max_values)
  values = get_graphite_values target
  return [[], [], []] unless values

  alerts = { 'warning' => [], 'error' => [], 'fatal' => [] }
  values.each do |series|
    target = series[:target]
    # Keep only the values of datapoints that are not null.
    samples = series[:datapoints].select(&:first).map(&:first)
    avg_value = samples.reduce(:+).to_f / samples.size

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)
      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? avg_value > limit : limit > avg_value
      next unless exceeded && (!samples.empty? || !config[:ignore_nulls])

      alerts[type] << "The average value of metric #{target} is #{avg_value} that is #{greater_less} than allowed average of #{max_value}"
      break if config[:short_output]
    end
  end
  alerts.values_at('warning', 'error', 'fatal')
end
392
+
393
# Compare, per series, the ratio "last value / percentile value" with the
# configured levels.
#
# target      - target expression to query
# max_values  - Hash of level name ('warning', 'error', 'fatal') => limit
# percentile  - percentile passed to Array#percentile
# data_points - number of recent datapoints averaged into the last value
#
# A series is skipped when it has no last value or no percentile value, so
# a missing value can never reach the comparison. Levels are evaluated from
# fatal down to warning; with config[:short_output] only the first level
# that fires is reported per series.
#
# Returns [warnings, criticals, fatal], each an Array of messages.
def check_percentile(target, max_values, percentile, data_points = 1)
  values = get_graphite_values target
  last_values = last_graphite_value(target, data_points)
  return [[], [], []] unless values

  warnings = []
  criticals = []
  fatal = []
  values.each do |data|
    series_name = data[:target]
    # Keep only the values of datapoints that are not null.
    values_array = data[:datapoints].select(&:first).map(&:first)
    percentile_value = values_array.percentile(percentile)
    last_value = last_values[series_name]
    next if last_value.nil? || percentile_value.nil?

    percent = last_value / percentile_value
    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)
      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? percent > limit : limit > percent
      next unless exceeded

      # Single-line message: no line break or indentation inside the text.
      text = "The percentile value of metric #{series_name} (#{last_value}) is #{greater_less} than the " \
             "#{percentile}th percentile (#{percentile_value}) by more than #{max_value}%"
      case type
      when 'warning' then warnings << text
      when 'error' then criticals << text
      when 'fatal' then fatal << text
      end
      break if config[:short_output]
    end
  end
  [warnings, criticals, fatal]
end
432
+
433
# Compare the most recent value of every series with the configured levels.
#
# last_graphite_metric yields, per series, a list of [value, timestamp]
# datapoints (newest first); the value of the newest datapoint is what gets
# compared. Series without a usable datapoint are skipped.
#
# target     - target expression to query
# max_values - Hash of level name ('warning', 'error', 'fatal') => limit
#
# With config[:greater_than] a level fires when the value exceeds its limit,
# otherwise when the limit exceeds the value. Levels are evaluated from
# fatal down to warning; with config[:short_output] only the first level
# that fires is reported per series.
#
# Returns [warnings, criticals, fatal], each an Array of messages.
def check_last(target, max_values)
  last_targets = last_graphite_metric target
  return [[], [], []] unless last_targets

  warnings = []
  criticals = []
  fatal = []
  last_targets.each do |target_name, last|
    # Unwrap the newest datapoint; a bare value is accepted as well.
    newest = Array(last).first
    last_value = newest.is_a?(Array) ? newest[0] : newest
    next if last_value.nil?

    %w(fatal error warning).each do |type|
      next unless max_values.key?(type)
      max_value = max_values[type]
      limit = max_value.to_f
      exceeded = config[:greater_than] ? last_value > limit : limit > last_value
      next unless exceeded

      text = "The metric #{target_name} is #{last_value} that is #{greater_less} than max allowed #{max_value}"
      case type
      when 'warning' then warnings << text
      when 'error' then criticals << text
      when 'fatal' then fatal << text
      end
      break if config[:short_output]
    end
  end
  [warnings, criticals, fatal]
end
468
+
469
# Entry point of the check.
#
# Every comma separated target is passed through each check enabled in
# config (increasing, last, average, average percent, percentile, in that
# order). Their warnings, criticals and fatals are accumulated and reported
# through the Sensu exit helpers: fatals and criticals as CRITICAL, warnings
# as WARNING, otherwise OK. With config[:concat_output] lower-severity
# messages are appended to the higher-severity output.
def run # rubocop:disable all
  @period = config[:period]
  warnings = []
  critical_errors = []
  fatals = []

  # Fold one [warnings, criticals, fatals] triple into the running totals.
  collect = lambda do |(found_warnings, found_criticals, found_fatals)|
    warnings.concat(found_warnings)
    critical_errors.concat(found_criticals)
    fatals.concat(found_fatals)
  end

  config[:target].split(',').each do |target|
    collect.call(check_increasing(target)) if config[:check_function_increasing]
    collect.call(check_last(target, get_levels(config[:check_last]))) if config[:check_last]
    collect.call(check_average(target, get_levels(config[:check_average]))) if config[:check_average]
    if config[:check_average_percent]
      levels = get_levels config[:check_average_percent]
      collect.call(check_average_percent(target, levels, config[:data_points].to_i))
    end
    if config[:check_percentile]
      levels = get_levels config[:check_percentile]
      collect.call(check_percentile(target, levels, config[:percentile].to_i, config[:data_points].to_i))
    end
  end

  fatals_string = fatals.join("\n")
  criticals_string = critical_errors.join("\n")
  warnings_string = warnings.join("\n")

  if config[:concat_output]
    warning_suffix = warnings.empty? ? '' : "\nGraphite WARNING: #{warnings_string}"
    # Criticals are appended to the fatal output before either gets the suffix.
    fatals_string = "#{fatals_string}\n#{criticals_string}" unless critical_errors.empty?
    fatals_string += warning_suffix
    criticals_string += warning_suffix
  end

  critical fatals_string unless fatals.empty?
  critical criticals_string unless critical_errors.empty?
  warning warnings_string unless warnings.empty?
  ok
end
530
+ end