sensu-plugins-mesos 0.1.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
YzMwZWNjZmU0OTRjYTU5NmZjZTYzYzVlNjViMzRlN2QyOGYzNjEwMg==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 21411bface3e1d88fc6c4041e0b5dac80044a364
|
4
|
+
data.tar.gz: 806ab66415a37de8c8e724078d74ca66696607ca
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
MzY1NjM5MGVkMGU1YzVkNjcyOTg2ZDc4MGZkOWU5MTEyMTI2OThjMmVkZWJm
|
11
|
-
MTQ0ZDU3N2Y4ZGJjNTJiMzE0N2I5MmE0ZTJhNDJmYWRjNDdhNTM=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MTc3N2U5OWFiM2E1NTI3Njk4MDU1NzQyMDIyNWRlY2E5NzJjZTJmMzhlNTUy
|
14
|
-
YTVkMTI1ZmFiZTliOGExM2FmNTg0NzA5MGQxNmQ0YWQ0MDVjMGVlNjM3NGZm
|
15
|
-
ZDliYzNjMTdhYzQwY2U5OTdjZTk4MjJiODg1ODk1ZDlkYTNkYmE=
|
6
|
+
metadata.gz: c5a02b94924115f6fed7d4fc57d4cd960624c0caf674b834375cf1733ba1275105c6cb9af26c6893eac04e95a5ca8cde46516fa0ebc390fd17e7118132653fd9
|
7
|
+
data.tar.gz: fab71f56c0672988d61d098e40e61ae47ebcdeca229e61588f02877630b57d72643cefe493e5298f7e46d22d7cc171d0aa4de54a6ab28257dfa9dbac305096eb
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,26 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
This CHANGELOG follows the format listed at [Keep A Changelog](http://keepachangelog.com/)
|
5
5
|
|
6
6
|
## [Unreleased]
|
7
|
+
## [1.0.0] - 2017-05-05
|
8
|
+
### Breaking Change
|
9
|
+
- check-mesos.rb: removed the `mode` parameter (@luisdavim)
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- check-metronome.rb: Check if Metronome is running (@luisdavim)
|
13
|
+
- check-mesos-cpu-balance.rb: Check for imbalanced use of CPU across mesos agents (@luisdavim)
|
14
|
+
- check-mesos-gpu-balance.rb: Check for imbalanced use of GPU across mesos agents (@luisdavim)
|
15
|
+
- check-mesos-disk-balance.rb: Check for imbalanced use of disk across mesos agents (@luisdavim)
|
16
|
+
- check-mesos-mem-balance.rb: Check for imbalanced use of memory across mesos agents (@luisdavim)
|
17
|
+
|
18
|
+
### Changed
|
19
|
+
- check-marathon-task.rb: Use the health check results to verify that a task is running. (@andrelaszlo)
|
20
|
+
- check-marathon-task.rb: Rename incorrect "state" parameter to "status". (@andrelaszlo)
|
21
|
+
- Add https support and authentication to marathon plugins: (thanks to Erasys GmbH)
|
22
|
+
- Add "protocol" option to check-marathon and metrics-marathon
|
23
|
+
- Add "protocol", "username" and "password" options to check-marathon-task
|
24
|
+
- All checks now have a configurable API endpoint using --uri or -u (@luisdavim)
|
25
|
+
- Support the latest Mesos API (@luisdavim)
|
26
|
+
- Dropped support for Ruby 1.9.3 (@luisdavim)
|
7
27
|
|
8
28
|
## [0.1.1] - 2016-03-04
|
9
29
|
### Added
|
@@ -41,7 +61,8 @@ This CHANGELOG follows the format listed at [Keep A Changelog](http://keepachang
|
|
41
61
|
### Added
|
42
62
|
- initial release
|
43
63
|
|
44
|
-
[Unreleased]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.
|
64
|
+
[Unreleased]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/1.0.0...HEAD
|
65
|
+
[1.0.0]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.1.1...1.0.0
|
45
66
|
[0.1.1]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.1.0...0.1.1
|
46
67
|
[0.1.0]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.0.4...0.1.0
|
47
68
|
[0.0.4]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.0.2...0.0.4
|
data/LICENSE
CHANGED
File without changes
|
data/README.md
CHANGED
@@ -10,8 +10,13 @@
|
|
10
10
|
|
11
11
|
## Files
|
12
12
|
* bin/check-chronos.rb
|
13
|
+
* bin/check-metronome.rb
|
13
14
|
* bin/check-marathon.rb
|
14
15
|
* bin/check-mesos.rb
|
16
|
+
* bin/check-mesos-cpu-balance.rb
|
17
|
+
* bin/check-mesos-disk-balance.rb
|
18
|
+
* bin/check-mesos-gpu-balance.rb
|
19
|
+
* bin/check-mesos-mem-balance.rb
|
15
20
|
* bin/metrics-marathon.rb
|
16
21
|
* bin/metrics-mesos.rb
|
17
22
|
|
data/bin/check-chronos.rb
CHANGED
@@ -42,6 +42,12 @@ class ChronosNodeStatus < Sensu::Plugin::Check::CLI
|
|
42
42
|
long: '--port PORT',
|
43
43
|
default: '80'
|
44
44
|
|
45
|
+
option :uri,
|
46
|
+
description: 'Endpoint URI',
|
47
|
+
short: '-u URI',
|
48
|
+
long: '--uri URI',
|
49
|
+
default: '/scheduler/jobs'
|
50
|
+
|
45
51
|
option :timeout,
|
46
52
|
description: 'timeout in seconds',
|
47
53
|
short: '-t TIMEOUT',
|
@@ -51,10 +57,11 @@ class ChronosNodeStatus < Sensu::Plugin::Check::CLI
|
|
51
57
|
|
52
58
|
def run
|
53
59
|
servers = config[:server]
|
60
|
+
uri = config[:uri]
|
54
61
|
failures = []
|
55
62
|
servers.split(',').each do |server|
|
56
63
|
begin
|
57
|
-
r = RestClient::Resource.new("http://#{server}:#{config[:port]}
|
64
|
+
r = RestClient::Resource.new("http://#{server}:#{config[:port]}#{uri}", timeout: config[:timeout]).get
|
58
65
|
if r.code != 200
|
59
66
|
failures << "Chronos on #{server} is not responding"
|
60
67
|
end
|
data/bin/check-marathon-task.rb
CHANGED
@@ -33,48 +33,144 @@ require 'sensu-plugin/check/cli'
|
|
33
33
|
require 'net/http'
|
34
34
|
require 'json'
|
35
35
|
|
36
|
+
# This plugin checks that the given Mesos/Marathon task is running properly.
|
37
|
+
#
|
38
|
+
# This means that all of the following is true:
|
39
|
+
# 1. There are N tasks for the app, as defined by the --instances parameter
|
40
|
+
# 2. Each task's state is running
|
41
|
+
# 3. No task is unhealthy, as defined in Marathon
|
42
|
+
#
|
43
|
+
# A task is seen as **unhealthy** by Marathon if any of the health checks for
|
44
|
+
# the task is not **alive**. Alive means that a check has a last success that
|
45
|
+
# is more recent than last failure. It's not alive if the last failure is more
|
46
|
+
# recent than the last success, or if the last success doesn't exist at all.
|
36
47
|
class MarathonTaskCheck < Sensu::Plugin::Check::CLI
|
37
48
|
check_name 'CheckMarathonTask'
|
38
49
|
|
39
|
-
option :server,
|
40
|
-
|
41
|
-
|
42
|
-
|
50
|
+
option :server,
|
51
|
+
short: '-s SERVER',
|
52
|
+
long: '--server SERVER',
|
53
|
+
required: true
|
54
|
+
|
55
|
+
option :port,
|
56
|
+
short: '-p PORT',
|
57
|
+
long: '--port PORT',
|
58
|
+
default: 8080
|
59
|
+
|
60
|
+
option :uri,
|
61
|
+
description: 'Endpoint URI',
|
62
|
+
short: '-u URI',
|
63
|
+
long: '--uri URI',
|
64
|
+
default: '/v2/tasks?status=running'
|
65
|
+
|
66
|
+
option :task,
|
67
|
+
short: '-t TASK',
|
68
|
+
long: '--task TASK',
|
69
|
+
required: true
|
70
|
+
|
71
|
+
option :instances,
|
72
|
+
short: '-i INSTANCES',
|
73
|
+
long: '--instances INSTANCES',
|
74
|
+
required: true,
|
75
|
+
proc: proc(&:to_i)
|
76
|
+
|
77
|
+
option :protocol,
|
78
|
+
short: '-P PROTOCOL',
|
79
|
+
long: '--protocol PROTOCOL',
|
80
|
+
required: false,
|
81
|
+
default: 'http'
|
82
|
+
|
83
|
+
option :username,
|
84
|
+
short: '-u USERNAME',
|
85
|
+
long: '--username USERNAME',
|
86
|
+
required: false
|
87
|
+
|
88
|
+
option :password,
|
89
|
+
long: '--password PASSWORD',
|
90
|
+
required: false
|
43
91
|
|
44
92
|
def run
|
45
|
-
if config[:instances]
|
93
|
+
if config[:instances].zero?
|
46
94
|
unknown 'number of instances should be an integer'
|
47
95
|
end
|
48
96
|
|
97
|
+
if !config[:username].nil? && config[:password].nil? ||
|
98
|
+
config[:username].nil? && !config[:password].nil?
|
99
|
+
unknown 'You must provide both username and password'
|
100
|
+
end
|
101
|
+
|
49
102
|
failures = []
|
103
|
+
uri = config[:uri]
|
50
104
|
config[:server].split(',').each do |s|
|
51
105
|
begin
|
52
|
-
url = URI.parse("
|
106
|
+
url = URI.parse("#{config[:protocol]}://#{s}:#{config[:port]}#{uri}")
|
53
107
|
req = Net::HTTP::Get.new(url)
|
54
108
|
req.add_field('Accept', 'application/json')
|
55
|
-
|
109
|
+
if !config[:username].nil? && !config[:password].nil?
|
110
|
+
req.basic_auth(config[:username], config[:password])
|
111
|
+
end
|
112
|
+
r = Net::HTTP.start(url.host, url.port,
|
113
|
+
use_ssl: config[:protocol] == 'https') do |h|
|
56
114
|
h.request(req)
|
57
115
|
end
|
58
116
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
117
|
+
ok_count, unhealthy = check_tasks r.body
|
118
|
+
|
119
|
+
message = "#{ok_count}/#{config[:instances]} #{config[:task]} tasks running"
|
63
120
|
|
64
|
-
|
121
|
+
if unhealthy.any?
|
122
|
+
message << ":\n" << unhealthy.join("\n")
|
123
|
+
end
|
65
124
|
|
66
|
-
if
|
125
|
+
if unhealthy.any? || ok_count < config[:instances]
|
67
126
|
critical message
|
68
127
|
end
|
69
128
|
|
70
129
|
ok message
|
71
130
|
rescue Errno::ECONNREFUSED, SocketError
|
72
131
|
failures << "Marathon on #{s} could not be reached"
|
73
|
-
rescue
|
74
|
-
failures << "error caught trying to reach Marathon on #{s}"
|
132
|
+
rescue => err
|
133
|
+
failures << "error caught trying to reach Marathon on #{s}: #{err}"
|
75
134
|
end
|
76
135
|
end
|
77
136
|
|
78
137
|
unknown "marathon task state could not be retrieved:\n" << failures.join("\n")
|
79
138
|
end
|
139
|
+
|
140
|
+
# Parses JSON data as returned from Marathon's tasks API
|
141
|
+
# @param data [String] Server response
|
142
|
+
# @return [Numeric, [String]] Number of running tasks and a list of error
|
143
|
+
# messages from unhealthy tasks
|
144
|
+
def check_tasks(data)
|
145
|
+
begin
|
146
|
+
tasks = JSON.parse(data)['tasks']
|
147
|
+
rescue JSON::ParserError
|
148
|
+
raise "Could not parse JSON response: #{data}"
|
149
|
+
end
|
150
|
+
|
151
|
+
if tasks.nil?
|
152
|
+
raise "No tasks in server response: #{data}"
|
153
|
+
end
|
154
|
+
|
155
|
+
tasks.select! do |t|
|
156
|
+
t['appId'] == "/#{config[:task]}"
|
157
|
+
end
|
158
|
+
|
159
|
+
unhealthy = []
|
160
|
+
|
161
|
+
# Collect last error message for all health checks that are not alive
|
162
|
+
tasks.each do |task|
|
163
|
+
checks = task['healthCheckResults'] || []
|
164
|
+
checks.each do |check|
|
165
|
+
if check['alive']
|
166
|
+
next
|
167
|
+
end
|
168
|
+
message = check['lastFailureCause'] ||
|
169
|
+
'Health check not alive'
|
170
|
+
unhealthy << message
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
[tasks.length, unhealthy]
|
175
|
+
end
|
80
176
|
end
|
data/bin/check-marathon.rb
CHANGED
@@ -43,6 +43,19 @@ class MarathonNodeStatus < Sensu::Plugin::Check::CLI
|
|
43
43
|
required: false,
|
44
44
|
default: '8080'
|
45
45
|
|
46
|
+
option :protocol,
|
47
|
+
description: 'Marathon protocol [http/https]',
|
48
|
+
short: '-P PROTOCOL',
|
49
|
+
long: '--protocol PROTOCOL',
|
50
|
+
required: false,
|
51
|
+
default: 'http'
|
52
|
+
|
53
|
+
option :uri,
|
54
|
+
description: 'Endpoint URI',
|
55
|
+
short: '-u URI',
|
56
|
+
long: '--uri URI',
|
57
|
+
default: '/ping'
|
58
|
+
|
46
59
|
option :timeout,
|
47
60
|
description: 'timeout in seconds',
|
48
61
|
short: '-t TIMEOUT',
|
@@ -55,7 +68,7 @@ class MarathonNodeStatus < Sensu::Plugin::Check::CLI
|
|
55
68
|
failures = []
|
56
69
|
servers.split(',').each do |server|
|
57
70
|
begin
|
58
|
-
r = RestClient::Resource.new("
|
71
|
+
r = RestClient::Resource.new("#{config[:protocol]}://#{server}:#{config[:port]}#{config[:uri]}", timeout: config[:timeout]).get
|
59
72
|
if r.code != 200
|
60
73
|
failures << "Marathon Service on #{server} is not responding"
|
61
74
|
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-cpu-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less CPU imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
class MesosCpuBalanceCheck < Sensu::Plugin::Check::CLI
|
34
|
+
check_name 'MesosCpuBalanceCheck'
|
35
|
+
@metrics_name = 'slaves'.freeze
|
36
|
+
CHECK_TYPE = 'cpus'.freeze
|
37
|
+
|
38
|
+
class << self
|
39
|
+
attr_reader :metrics_name
|
40
|
+
end
|
41
|
+
|
42
|
+
option :server,
|
43
|
+
description: 'Mesos server',
|
44
|
+
short: '-s SERVER',
|
45
|
+
long: '--server SERVER',
|
46
|
+
default: 'localhost'
|
47
|
+
|
48
|
+
option :port,
|
49
|
+
description: 'port (default 5050)',
|
50
|
+
short: '-p PORT',
|
51
|
+
long: '--port PORT',
|
52
|
+
default: 5050,
|
53
|
+
required: false
|
54
|
+
|
55
|
+
option :uri,
|
56
|
+
description: 'Endpoint URI',
|
57
|
+
short: '-u URI',
|
58
|
+
long: '--uri URI',
|
59
|
+
default: '/master/slaves'
|
60
|
+
|
61
|
+
option :timeout,
|
62
|
+
description: 'timeout in seconds',
|
63
|
+
short: '-t TIMEOUT',
|
64
|
+
long: '--timeout TIMEOUT',
|
65
|
+
proc: proc(&:to_i),
|
66
|
+
default: 5
|
67
|
+
|
68
|
+
option :crit,
|
69
|
+
description: 'Critical value to check against',
|
70
|
+
short: '-c VALUE',
|
71
|
+
long: '--critical VALUE',
|
72
|
+
proc: proc(&:to_i),
|
73
|
+
default: 0,
|
74
|
+
required: false
|
75
|
+
|
76
|
+
option :warn,
|
77
|
+
description: 'Warning value to check against',
|
78
|
+
short: '-w VALUE',
|
79
|
+
long: '--warning VALUE',
|
80
|
+
proc: proc(&:to_i),
|
81
|
+
default: 0,
|
82
|
+
required: false
|
83
|
+
|
84
|
+
def run
|
85
|
+
if config[:crit] < 0 || config[:warn] < 0
|
86
|
+
unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
|
87
|
+
end
|
88
|
+
|
89
|
+
server = config[:server]
|
90
|
+
port = config[:port]
|
91
|
+
uri = config[:uri]
|
92
|
+
timeout = config[:timeout]
|
93
|
+
crit = config[:crit]
|
94
|
+
warn = config[:warn]
|
95
|
+
|
96
|
+
begin
|
97
|
+
server = get_leader_url server, port
|
98
|
+
r = RestClient::Resource.new("#{server}#{uri}", timeout).get
|
99
|
+
compare = get_check_diff(get_slaves(r))
|
100
|
+
if compare['diff'] >= crit
|
101
|
+
critical "There is a CPU diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
|
102
|
+
end
|
103
|
+
if compare['diff'] >= warn
|
104
|
+
warning "There is a CPU diff of #{compare['diff']} bigger than #{warn} " + compare['msg']
|
105
|
+
end
|
106
|
+
rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
|
107
|
+
unknown "Mesos #{server} is not responding"
|
108
|
+
rescue RestClient::RequestTimeout
|
109
|
+
unknown "Mesos #{server} connection timed out"
|
110
|
+
end
|
111
|
+
ok
|
112
|
+
end
|
113
|
+
|
114
|
+
# Redirects server call to discover the Leader
|
115
|
+
# @param server [String] Server address
|
116
|
+
# @param port [Number] api port
|
117
|
+
# @return [Url] Url representing the Leader
|
118
|
+
|
119
|
+
def get_leader_url(server, port)
|
120
|
+
RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Parses JSON data as returned from Mesos's metrics API
|
124
|
+
# @param data [String] Server response
|
125
|
+
# @return [Array] Entries found under the 'slaves' key of the parsed response
|
126
|
+
def get_slaves(data)
|
127
|
+
begin
|
128
|
+
slaves = JSON.parse(data)[MesosCpuBalanceCheck.metrics_name]
|
129
|
+
rescue JSON::ParserError
|
130
|
+
raise "Could not parse JSON response: #{data}"
|
131
|
+
end
|
132
|
+
|
133
|
+
if slaves.nil?
|
134
|
+
raise "No metrics for [#{MesosCpuBalanceCheck.metrics_name}] in server response: #{data}"
|
135
|
+
end
|
136
|
+
|
137
|
+
slaves
|
138
|
+
end
|
139
|
+
|
140
|
+
def get_check_diff(slavelist)
|
141
|
+
begin
|
142
|
+
usages = {}
|
143
|
+
check_diff = {}
|
144
|
+
slavelist.each do |slaveinfo|
|
145
|
+
usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / slaveinfo['resources'][CHECK_TYPE])
|
146
|
+
end
|
147
|
+
sorted = usages.sort_by { |_hostname, total| total }
|
148
|
+
max = usages.length - 1
|
149
|
+
check_diff['diff'] = sorted[max][1] - sorted[0][1]
|
150
|
+
check_diff['msg'] = "Hostname #{sorted[0][0]} uses #{sorted[0][1]}% and Hostname #{sorted[max][0]} uses #{sorted[max][1]}%"
|
151
|
+
end
|
152
|
+
check_diff
|
153
|
+
end
|
154
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-disk-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less disk usage imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
class MesosDiskBalanceCheck < Sensu::Plugin::Check::CLI
|
34
|
+
check_name 'MesosDiskBalanceCheck'
|
35
|
+
@metrics_name = 'slaves'.freeze
|
36
|
+
CHECK_TYPE = 'disk'.freeze
|
37
|
+
|
38
|
+
class << self
|
39
|
+
attr_reader :metrics_name
|
40
|
+
end
|
41
|
+
|
42
|
+
option :server,
|
43
|
+
description: 'Mesos server',
|
44
|
+
short: '-s SERVER',
|
45
|
+
long: '--server SERVER',
|
46
|
+
default: 'localhost'
|
47
|
+
|
48
|
+
option :port,
|
49
|
+
description: 'port (default 5050)',
|
50
|
+
short: '-p PORT',
|
51
|
+
long: '--port PORT',
|
52
|
+
default: 5050,
|
53
|
+
required: false
|
54
|
+
|
55
|
+
option :uri,
|
56
|
+
description: 'Endpoint URI',
|
57
|
+
short: '-u URI',
|
58
|
+
long: '--uri URI',
|
59
|
+
default: '/master/slaves'
|
60
|
+
|
61
|
+
option :timeout,
|
62
|
+
description: 'timeout in seconds',
|
63
|
+
short: '-t TIMEOUT',
|
64
|
+
long: '--timeout TIMEOUT',
|
65
|
+
proc: proc(&:to_i),
|
66
|
+
default: 5
|
67
|
+
|
68
|
+
option :crit,
|
69
|
+
description: 'Critical value to check against',
|
70
|
+
short: '-c VALUE',
|
71
|
+
long: '--critical VALUE',
|
72
|
+
proc: proc(&:to_i),
|
73
|
+
default: 0,
|
74
|
+
required: false
|
75
|
+
|
76
|
+
option :warn,
|
77
|
+
description: 'Warning value to check against',
|
78
|
+
short: '-w VALUE',
|
79
|
+
long: '--warning VALUE',
|
80
|
+
proc: proc(&:to_i),
|
81
|
+
default: 0,
|
82
|
+
required: false
|
83
|
+
|
84
|
+
def run
|
85
|
+
if config[:crit] < 0 || config[:warn] < 0
|
86
|
+
unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
|
87
|
+
end
|
88
|
+
|
89
|
+
server = config[:server]
|
90
|
+
port = config[:port]
|
91
|
+
uri = config[:uri]
|
92
|
+
timeout = config[:timeout]
|
93
|
+
crit = config[:crit]
|
94
|
+
warn = config[:warn]
|
95
|
+
|
96
|
+
begin
|
97
|
+
server = get_leader_url server, port
|
98
|
+
r = RestClient::Resource.new("#{server}#{uri}", timeout).get
|
99
|
+
compare = get_check_diff(get_slaves(r))
|
100
|
+
if compare['diff'] >= crit
|
101
|
+
critical "There is a disk usage diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
|
102
|
+
end
|
103
|
+
if compare['diff'] >= warn
|
104
|
+
warning "There is a disk usage diff of #{compare['diff']} bigger than #{warn} " + compare['msg']
|
105
|
+
end
|
106
|
+
rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
|
107
|
+
unknown "Mesos #{server} is not responding"
|
108
|
+
rescue RestClient::RequestTimeout
|
109
|
+
unknown "Mesos #{server} connection timed out"
|
110
|
+
end
|
111
|
+
ok
|
112
|
+
end
|
113
|
+
|
114
|
+
# Redirects server call to discover the Leader
|
115
|
+
# @param server [String] Server address
|
116
|
+
# @param port [Number] api port
|
117
|
+
# @return [Url] Url representing the Leader
|
118
|
+
|
119
|
+
def get_leader_url(server, port)
|
120
|
+
RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Parses JSON data as returned from Mesos's metrics API
|
124
|
+
# @param data [String] Server response
|
125
|
+
# @return [Array] Entries found under the 'slaves' key of the parsed response
|
126
|
+
def get_slaves(data)
|
127
|
+
begin
|
128
|
+
slaves = JSON.parse(data)[MesosDiskBalanceCheck.metrics_name]
|
129
|
+
rescue JSON::ParserError
|
130
|
+
raise "Could not parse JSON response: #{data}"
|
131
|
+
end
|
132
|
+
|
133
|
+
if slaves.nil?
|
134
|
+
raise "No metrics for [#{MesosDiskBalanceCheck.metrics_name}] in server response: #{data}"
|
135
|
+
end
|
136
|
+
|
137
|
+
slaves
|
138
|
+
end
|
139
|
+
|
140
|
+
def get_check_diff(slavelist)
|
141
|
+
begin
|
142
|
+
usages = {}
|
143
|
+
check_diff = {}
|
144
|
+
slavelist.each do |slaveinfo|
|
145
|
+
usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / slaveinfo['resources'][CHECK_TYPE])
|
146
|
+
end
|
147
|
+
sorted = usages.sort_by { |_hostname, total| total }
|
148
|
+
max = usages.length - 1
|
149
|
+
check_diff['diff'] = sorted[max][1] - sorted[0][1]
|
150
|
+
check_diff['msg'] = "Hostname #{sorted[0][0]} uses #{sorted[0][1]}% and Hostname #{sorted[max][0]} uses #{sorted[max][1]}%"
|
151
|
+
end
|
152
|
+
check_diff
|
153
|
+
end
|
154
|
+
end
|