sensu-plugins-mesos 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
YzMwZWNjZmU0OTRjYTU5NmZjZTYzYzVlNjViMzRlN2QyOGYzNjEwMg==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 21411bface3e1d88fc6c4041e0b5dac80044a364
|
4
|
+
data.tar.gz: 806ab66415a37de8c8e724078d74ca66696607ca
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
MzY1NjM5MGVkMGU1YzVkNjcyOTg2ZDc4MGZkOWU5MTEyMTI2OThjMmVkZWJm
|
11
|
-
MTQ0ZDU3N2Y4ZGJjNTJiMzE0N2I5MmE0ZTJhNDJmYWRjNDdhNTM=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MTc3N2U5OWFiM2E1NTI3Njk4MDU1NzQyMDIyNWRlY2E5NzJjZTJmMzhlNTUy
|
14
|
-
YTVkMTI1ZmFiZTliOGExM2FmNTg0NzA5MGQxNmQ0YWQ0MDVjMGVlNjM3NGZm
|
15
|
-
ZDliYzNjMTdhYzQwY2U5OTdjZTk4MjJiODg1ODk1ZDlkYTNkYmE=
|
6
|
+
metadata.gz: c5a02b94924115f6fed7d4fc57d4cd960624c0caf674b834375cf1733ba1275105c6cb9af26c6893eac04e95a5ca8cde46516fa0ebc390fd17e7118132653fd9
|
7
|
+
data.tar.gz: fab71f56c0672988d61d098e40e61ae47ebcdeca229e61588f02877630b57d72643cefe493e5298f7e46d22d7cc171d0aa4de54a6ab28257dfa9dbac305096eb
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,26 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
This CHANGELOG follows the format listed at [Keep A Changelog](http://keepachangelog.com/)
|
5
5
|
|
6
6
|
## [Unreleased]
|
7
|
+
## [1.0.0] - 2017-05-05
|
8
|
+
### Breaking Change
|
9
|
+
- check-mesos.rb: removed the `mode` parameter (@luisdavim)
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- check-metronome.rb: Check if Metronome is running (@luisdavim)
|
13
|
+
- check-mesos-cpu-balance.rb: Check for imbalanced use of CPU across mesos agents (@luisdavim)
|
14
|
+
- check-mesos-gpu-balance.rb: Check for imbalanced use of GPU across mesos agents (@luisdavim)
|
15
|
+
- check-mesos-disk-balance.rb: Check for imbalanced use of disk across mesos agents (@luisdavim)
|
16
|
+
- check-mesos-mem-balance.rb: Check for imbalanced use of memory across mesos agents (@luisdavim)
|
17
|
+
|
18
|
+
### Changed
|
19
|
+
- check-marathon-task.rb: Use the health check results to verify that a task is running. (@andrelaszlo)
|
20
|
+
- check-marathon-task.rb: Rename incorrect "state" parameter to "status". (@andrelaszlo)
|
21
|
+
- Add https support and authentication to marathon plugins: (thanks to Erasys GmbH)
|
22
|
+
- Add "protocol" option to check-marathon and metrics-marathon
|
23
|
+
- Add "protocol", "username" and "password" options to check-marathon-task
|
24
|
+
- All checks now have a configurable API endpoint using --uri or -u (@luisdavim)
|
25
|
+
- Support the latest Mesos API (@luisdavim)
|
26
|
+
- Dropped support for Ruby 1.9.3 (@luisdavim)
|
7
27
|
|
8
28
|
## [0.1.1] - 2016-03-04
|
9
29
|
### Added
|
@@ -41,7 +61,8 @@ This CHANGELOG follows the format listed at [Keep A Changelog](http://keepachang
|
|
41
61
|
### Added
|
42
62
|
- initial release
|
43
63
|
|
44
|
-
[Unreleased]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.1.1...HEAD
|
64
|
+
[Unreleased]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/1.0.0...HEAD
|
65
|
+
[1.0.0]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.1.1...1.0.0
|
45
66
|
[0.1.1]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.1.0...0.1.1
|
46
67
|
[0.1.0]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.0.4...0.1.0
|
47
68
|
[0.0.4]: https://github.com/sensu-plugins/sensu-plugins-mesos/compare/0.0.2...0.0.4
|
data/LICENSE
CHANGED
File without changes
|
data/README.md
CHANGED
@@ -10,8 +10,13 @@
|
|
10
10
|
|
11
11
|
## Files
|
12
12
|
* bin/check-chronos.rb
|
13
|
+
* bin/check-metronome.rb
|
13
14
|
* bin/check-marathon.rb
|
14
15
|
* bin/check-mesos.rb
|
16
|
+
* bin/check-mesos-cpu-balance.rb
|
17
|
+
* bin/check-mesos-disk-balance.rb
|
18
|
+
* bin/check-mesos-gpu-balance.rb
|
19
|
+
* bin/check-mesos-mem-balance.rb
|
15
20
|
* bin/metrics-marathon.rb
|
16
21
|
* bin/metrics-mesos.rb
|
17
22
|
|
data/bin/check-chronos.rb
CHANGED
@@ -42,6 +42,12 @@ class ChronosNodeStatus < Sensu::Plugin::Check::CLI
|
|
42
42
|
long: '--port PORT',
|
43
43
|
default: '80'
|
44
44
|
|
45
|
+
option :uri,
|
46
|
+
description: 'Endpoint URI',
|
47
|
+
short: '-u URI',
|
48
|
+
long: '--uri URI',
|
49
|
+
default: '/scheduler/jobs'
|
50
|
+
|
45
51
|
option :timeout,
|
46
52
|
description: 'timeout in seconds',
|
47
53
|
short: '-t TIMEOUT',
|
@@ -51,10 +57,11 @@ class ChronosNodeStatus < Sensu::Plugin::Check::CLI
|
|
51
57
|
|
52
58
|
def run
|
53
59
|
servers = config[:server]
|
60
|
+
uri = config[:uri]
|
54
61
|
failures = []
|
55
62
|
servers.split(',').each do |server|
|
56
63
|
begin
|
57
|
-
r = RestClient::Resource.new("http://#{server}:#{config[:port]}
|
64
|
+
r = RestClient::Resource.new("http://#{server}:#{config[:port]}#{uri}", timeout: config[:timeout]).get
|
58
65
|
if r.code != 200
|
59
66
|
failures << "Chronos on #{server} is not responding"
|
60
67
|
end
|
data/bin/check-marathon-task.rb
CHANGED
@@ -33,48 +33,144 @@ require 'sensu-plugin/check/cli'
|
|
33
33
|
require 'net/http'
|
34
34
|
require 'json'
|
35
35
|
|
36
|
+
# This plugin checks that the given Mesos/Marathon task is running properly.
|
37
|
+
#
|
38
|
+
# This means that all of the following is true:
|
39
|
+
# 1. There are N tasks for the app, as defined by the --instances parameter
|
40
|
+
# 2. Each task's state is running
|
41
|
+
# 3. No task is unhealthy, as defined in Marathon
|
42
|
+
#
|
43
|
+
# A task is seen as **unhealthy** by Marathon if any of the health checks for
|
44
|
+
# the task is not **alive**. Alive means that a check has a last success that
|
45
|
+
# is more recent than last failure. It's not alive if the last failure is more
|
46
|
+
# recent than the last success, or if the last success doesn't exist at all.
|
36
47
|
class MarathonTaskCheck < Sensu::Plugin::Check::CLI
|
37
48
|
check_name 'CheckMarathonTask'
|
38
49
|
|
39
|
-
option :server,
|
40
|
-
|
41
|
-
|
42
|
-
|
50
|
+
option :server,
|
51
|
+
short: '-s SERVER',
|
52
|
+
long: '--server SERVER',
|
53
|
+
required: true
|
54
|
+
|
55
|
+
option :port,
|
56
|
+
short: '-p PORT',
|
57
|
+
long: '--port PORT',
|
58
|
+
default: 8080
|
59
|
+
|
60
|
+
option :uri,
|
61
|
+
description: 'Endpoint URI',
|
62
|
+
short: '-u URI',
|
63
|
+
long: '--uri URI',
|
64
|
+
default: '/v2/tasks?status=running'
|
65
|
+
|
66
|
+
option :task,
|
67
|
+
short: '-t TASK',
|
68
|
+
long: '--task TASK',
|
69
|
+
required: true
|
70
|
+
|
71
|
+
option :instances,
|
72
|
+
short: '-i INSTANCES',
|
73
|
+
long: '--instances INSTANCES',
|
74
|
+
required: true,
|
75
|
+
proc: proc(&:to_i)
|
76
|
+
|
77
|
+
option :protocol,
|
78
|
+
short: '-P PROTOCOL',
|
79
|
+
long: '--protocol PROTOCOL',
|
80
|
+
required: false,
|
81
|
+
default: 'http'
|
82
|
+
|
83
|
+
option :username,
|
84
|
+
short: '-u USERNAME',
|
85
|
+
long: '--username USERNAME',
|
86
|
+
required: false
|
87
|
+
|
88
|
+
option :password,
|
89
|
+
long: '--password PASSWORD',
|
90
|
+
required: false
|
43
91
|
|
44
92
|
def run
|
45
|
-
if config[:instances]
|
93
|
+
if config[:instances].zero?
|
46
94
|
unknown 'number of instances should be an integer'
|
47
95
|
end
|
48
96
|
|
97
|
+
if !config[:username].nil? && config[:password].nil? ||
|
98
|
+
config[:username].nil? && !config[:password].nil?
|
99
|
+
unknown 'You must provide both username and password'
|
100
|
+
end
|
101
|
+
|
49
102
|
failures = []
|
103
|
+
uri = config[:uri]
|
50
104
|
config[:server].split(',').each do |s|
|
51
105
|
begin
|
52
|
-
url = URI.parse("
|
106
|
+
url = URI.parse("#{config[:protocol]}://#{s}:#{config[:port]}#{uri}")
|
53
107
|
req = Net::HTTP::Get.new(url)
|
54
108
|
req.add_field('Accept', 'application/json')
|
55
|
-
|
109
|
+
if !config[:username].nil? && !config[:password].nil?
|
110
|
+
req.basic_auth(config[:username], config[:password])
|
111
|
+
end
|
112
|
+
r = Net::HTTP.start(url.host, url.port,
|
113
|
+
use_ssl: config[:protocol] == 'https') do |h|
|
56
114
|
h.request(req)
|
57
115
|
end
|
58
116
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
117
|
+
ok_count, unhealthy = check_tasks r.body
|
118
|
+
|
119
|
+
message = "#{ok_count}/#{config[:instances]} #{config[:task]} tasks running"
|
63
120
|
|
64
|
-
|
121
|
+
if unhealthy.any?
|
122
|
+
message << ":\n" << unhealthy.join("\n")
|
123
|
+
end
|
65
124
|
|
66
|
-
if
|
125
|
+
if unhealthy.any? || ok_count < config[:instances]
|
67
126
|
critical message
|
68
127
|
end
|
69
128
|
|
70
129
|
ok message
|
71
130
|
rescue Errno::ECONNREFUSED, SocketError
|
72
131
|
failures << "Marathon on #{s} could not be reached"
|
73
|
-
rescue
|
74
|
-
failures << "error caught trying to reach Marathon on #{s}"
|
132
|
+
rescue => err
|
133
|
+
failures << "error caught trying to reach Marathon on #{s}: #{err}"
|
75
134
|
end
|
76
135
|
end
|
77
136
|
|
78
137
|
unknown "marathon task state could not be retrieved:\n" << failures.join("\n")
|
79
138
|
end
|
139
|
+
|
140
|
+
# Parses JSON data as returned from Marathon's tasks API
|
141
|
+
# @param data [String] Server response
|
142
|
+
# @return [Numeric, [String]] Number of running tasks and a list of error
|
143
|
+
# messages from unhealthy tasks
|
144
|
+
def check_tasks(data)
|
145
|
+
begin
|
146
|
+
tasks = JSON.parse(data)['tasks']
|
147
|
+
rescue JSON::ParserError
|
148
|
+
raise "Could not parse JSON response: #{data}"
|
149
|
+
end
|
150
|
+
|
151
|
+
if tasks.nil?
|
152
|
+
raise "No tasks in server response: #{data}"
|
153
|
+
end
|
154
|
+
|
155
|
+
tasks.select! do |t|
|
156
|
+
t['appId'] == "/#{config[:task]}"
|
157
|
+
end
|
158
|
+
|
159
|
+
unhealthy = []
|
160
|
+
|
161
|
+
# Collect last error message for all health checks that are not alive
|
162
|
+
tasks.each do |task|
|
163
|
+
checks = task['healthCheckResults'] || []
|
164
|
+
checks.each do |check|
|
165
|
+
if check['alive']
|
166
|
+
next
|
167
|
+
end
|
168
|
+
message = check['lastFailureCause'] ||
|
169
|
+
'Health check not alive'
|
170
|
+
unhealthy << message
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
[tasks.length, unhealthy]
|
175
|
+
end
|
80
176
|
end
|
data/bin/check-marathon.rb
CHANGED
@@ -43,6 +43,19 @@ class MarathonNodeStatus < Sensu::Plugin::Check::CLI
|
|
43
43
|
required: false,
|
44
44
|
default: '8080'
|
45
45
|
|
46
|
+
option :protocol,
|
47
|
+
description: 'Marathon protocol [http/https]',
|
48
|
+
short: '-P PROTOCOL',
|
49
|
+
long: '--protocol PROTOCOL',
|
50
|
+
required: false,
|
51
|
+
default: 'http'
|
52
|
+
|
53
|
+
option :uri,
|
54
|
+
description: 'Endpoint URI',
|
55
|
+
short: '-u URI',
|
56
|
+
long: '--uri URI',
|
57
|
+
default: '/ping'
|
58
|
+
|
46
59
|
option :timeout,
|
47
60
|
description: 'timeout in seconds',
|
48
61
|
short: '-t TIMEOUT',
|
@@ -55,7 +68,7 @@ class MarathonNodeStatus < Sensu::Plugin::Check::CLI
|
|
55
68
|
failures = []
|
56
69
|
servers.split(',').each do |server|
|
57
70
|
begin
|
58
|
-
r = RestClient::Resource.new("
|
71
|
+
r = RestClient::Resource.new("#{config[:protocol]}://#{server}:#{config[:port]}#{config[:uri]}", timeout: config[:timeout]).get
|
59
72
|
if r.code != 200
|
60
73
|
failures << "Marathon Service on #{server} is not responding"
|
61
74
|
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-cpu-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less CPU imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
class MesosCpuBalanceCheck < Sensu::Plugin::Check::CLI
|
34
|
+
check_name 'MesosCpuBalanceCheck'
|
35
|
+
@metrics_name = 'slaves'.freeze
|
36
|
+
CHECK_TYPE = 'cpus'.freeze
|
37
|
+
|
38
|
+
class << self
|
39
|
+
attr_reader :metrics_name
|
40
|
+
end
|
41
|
+
|
42
|
+
option :server,
|
43
|
+
description: 'Mesos server',
|
44
|
+
short: '-s SERVER',
|
45
|
+
long: '--server SERVER',
|
46
|
+
default: 'localhost'
|
47
|
+
|
48
|
+
option :port,
|
49
|
+
description: 'port (default 5050)',
|
50
|
+
short: '-p PORT',
|
51
|
+
long: '--port PORT',
|
52
|
+
default: 5050,
|
53
|
+
required: false
|
54
|
+
|
55
|
+
option :uri,
|
56
|
+
description: 'Endpoint URI',
|
57
|
+
short: '-u URI',
|
58
|
+
long: '--uri URI',
|
59
|
+
default: '/master/slaves'
|
60
|
+
|
61
|
+
option :timeout,
|
62
|
+
description: 'timeout in seconds',
|
63
|
+
short: '-t TIMEOUT',
|
64
|
+
long: '--timeout TIMEOUT',
|
65
|
+
proc: proc(&:to_i),
|
66
|
+
default: 5
|
67
|
+
|
68
|
+
option :crit,
|
69
|
+
description: 'Critical value to check against',
|
70
|
+
short: '-c VALUE',
|
71
|
+
long: '--critical VALUE',
|
72
|
+
proc: proc(&:to_i),
|
73
|
+
default: 0,
|
74
|
+
required: false
|
75
|
+
|
76
|
+
option :warn,
|
77
|
+
description: 'Warning value to check against',
|
78
|
+
short: '-w VALUE',
|
79
|
+
long: '--warning VALUE',
|
80
|
+
proc: proc(&:to_i),
|
81
|
+
default: 0,
|
82
|
+
required: false
|
83
|
+
|
84
|
+
def run
|
85
|
+
if config[:crit] < 0 || config[:warn] < 0
|
86
|
+
unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
|
87
|
+
end
|
88
|
+
|
89
|
+
server = config[:server]
|
90
|
+
port = config[:port]
|
91
|
+
uri = config[:uri]
|
92
|
+
timeout = config[:timeout]
|
93
|
+
crit = config[:crit]
|
94
|
+
warn = config[:warn]
|
95
|
+
|
96
|
+
begin
|
97
|
+
server = get_leader_url server, port
|
98
|
+
r = RestClient::Resource.new("#{server}#{uri}", timeout).get
|
99
|
+
compare = get_check_diff(get_slaves(r))
|
100
|
+
if compare['diff'] >= crit
|
101
|
+
critical "There is a CPU diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
|
102
|
+
end
|
103
|
+
if compare['diff'] >= warn
|
104
|
+
warning "There is a CPU diff of #{compare['diff']} bigger than #{warn} " + compare['msg']
|
105
|
+
end
|
106
|
+
rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
|
107
|
+
unknown "Mesos #{server} is not responding"
|
108
|
+
rescue RestClient::RequestTimeout
|
109
|
+
unknown "Mesos #{server} connection timed out"
|
110
|
+
end
|
111
|
+
ok
|
112
|
+
end
|
113
|
+
|
114
|
+
# Redirects server call to discover the Leader
|
115
|
+
# @param server [String] Server address
|
116
|
+
# @param port [Number] api port
|
117
|
+
# @return [Url] Url representing the Leader
|
118
|
+
|
119
|
+
def get_leader_url(server, port)
|
120
|
+
RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Parses JSON data as returned from Mesos's metrics API
|
124
|
+
# @param data [String] Server response
|
125
|
+
# @return [Object] Value stored under the "slaves" key of the parsed response
|
126
|
+
def get_slaves(data)
|
127
|
+
begin
|
128
|
+
slaves = JSON.parse(data)[MesosCpuBalanceCheck.metrics_name]
|
129
|
+
rescue JSON::ParserError
|
130
|
+
raise "Could not parse JSON response: #{data}"
|
131
|
+
end
|
132
|
+
|
133
|
+
if slaves.nil?
|
134
|
+
raise "No metrics for [#{MesosCpuBalanceCheck.metrics_name}] in server response: #{data}"
|
135
|
+
end
|
136
|
+
|
137
|
+
slaves
|
138
|
+
end
|
139
|
+
|
140
|
+
def get_check_diff(slavelist)
|
141
|
+
begin
|
142
|
+
usages = {}
|
143
|
+
check_diff = {}
|
144
|
+
slavelist.each do |slaveinfo|
|
145
|
+
usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / slaveinfo['resources'][CHECK_TYPE])
|
146
|
+
end
|
147
|
+
sorted = usages.sort_by { |_hostname, total| total }
|
148
|
+
max = usages.length - 1
|
149
|
+
check_diff['diff'] = sorted[max][1] - sorted[0][1]
|
150
|
+
check_diff['msg'] = "Hostname #{sorted[0][0]} uses #{sorted[0][1]}% and Hostname #{sorted[max][0]} uses #{sorted[max][1]}%"
|
151
|
+
end
|
152
|
+
check_diff
|
153
|
+
end
|
154
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-disk-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less disk usage imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
class MesosDiskBalanceCheck < Sensu::Plugin::Check::CLI
|
34
|
+
check_name 'MesosDiskBalanceCheck'
|
35
|
+
@metrics_name = 'slaves'.freeze
|
36
|
+
CHECK_TYPE = 'disk'.freeze
|
37
|
+
|
38
|
+
class << self
|
39
|
+
attr_reader :metrics_name
|
40
|
+
end
|
41
|
+
|
42
|
+
option :server,
|
43
|
+
description: 'Mesos server',
|
44
|
+
short: '-s SERVER',
|
45
|
+
long: '--server SERVER',
|
46
|
+
default: 'localhost'
|
47
|
+
|
48
|
+
option :port,
|
49
|
+
description: 'port (default 5050)',
|
50
|
+
short: '-p PORT',
|
51
|
+
long: '--port PORT',
|
52
|
+
default: 5050,
|
53
|
+
required: false
|
54
|
+
|
55
|
+
option :uri,
|
56
|
+
description: 'Endpoint URI',
|
57
|
+
short: '-u URI',
|
58
|
+
long: '--uri URI',
|
59
|
+
default: '/master/slaves'
|
60
|
+
|
61
|
+
option :timeout,
|
62
|
+
description: 'timeout in seconds',
|
63
|
+
short: '-t TIMEOUT',
|
64
|
+
long: '--timeout TIMEOUT',
|
65
|
+
proc: proc(&:to_i),
|
66
|
+
default: 5
|
67
|
+
|
68
|
+
option :crit,
|
69
|
+
description: 'Critical value to check against',
|
70
|
+
short: '-c VALUE',
|
71
|
+
long: '--critical VALUE',
|
72
|
+
proc: proc(&:to_i),
|
73
|
+
default: 0,
|
74
|
+
required: false
|
75
|
+
|
76
|
+
option :warn,
|
77
|
+
description: 'Warning value to check against',
|
78
|
+
short: '-w VALUE',
|
79
|
+
long: '--warning VALUE',
|
80
|
+
proc: proc(&:to_i),
|
81
|
+
default: 0,
|
82
|
+
required: false
|
83
|
+
|
84
|
+
def run
|
85
|
+
if config[:crit] < 0 || config[:warn] < 0
|
86
|
+
unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
|
87
|
+
end
|
88
|
+
|
89
|
+
server = config[:server]
|
90
|
+
port = config[:port]
|
91
|
+
uri = config[:uri]
|
92
|
+
timeout = config[:timeout]
|
93
|
+
crit = config[:crit]
|
94
|
+
warn = config[:warn]
|
95
|
+
|
96
|
+
begin
|
97
|
+
server = get_leader_url server, port
|
98
|
+
r = RestClient::Resource.new("#{server}#{uri}", timeout).get
|
99
|
+
compare = get_check_diff(get_slaves(r))
|
100
|
+
if compare['diff'] >= crit
|
101
|
+
critical "There is a disk usage diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
|
102
|
+
end
|
103
|
+
if compare['diff'] >= warn
|
104
|
+
warning "There is a disk usage diff of #{compare['diff']} bigger than #{warn} " + compare['msg']
|
105
|
+
end
|
106
|
+
rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
|
107
|
+
unknown "Mesos #{server} is not responding"
|
108
|
+
rescue RestClient::RequestTimeout
|
109
|
+
unknown "Mesos #{server} connection timed out"
|
110
|
+
end
|
111
|
+
ok
|
112
|
+
end
|
113
|
+
|
114
|
+
# Redirects server call to discover the Leader
|
115
|
+
# @param server [String] Server address
|
116
|
+
# @param port [Number] api port
|
117
|
+
# @return [Url] Url representing the Leader
|
118
|
+
|
119
|
+
def get_leader_url(server, port)
|
120
|
+
RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
|
121
|
+
end
|
122
|
+
|
123
|
+
# Parses JSON data as returned from Mesos's metrics API
|
124
|
+
# @param data [String] Server response
|
125
|
+
# @return [Object] Value stored under the "slaves" key of the parsed response
|
126
|
+
def get_slaves(data)
|
127
|
+
begin
|
128
|
+
slaves = JSON.parse(data)[MesosDiskBalanceCheck.metrics_name]
|
129
|
+
rescue JSON::ParserError
|
130
|
+
raise "Could not parse JSON response: #{data}"
|
131
|
+
end
|
132
|
+
|
133
|
+
if slaves.nil?
|
134
|
+
raise "No metrics for [#{MesosDiskBalanceCheck.metrics_name}] in server response: #{data}"
|
135
|
+
end
|
136
|
+
|
137
|
+
slaves
|
138
|
+
end
|
139
|
+
|
140
|
+
def get_check_diff(slavelist)
|
141
|
+
begin
|
142
|
+
usages = {}
|
143
|
+
check_diff = {}
|
144
|
+
slavelist.each do |slaveinfo|
|
145
|
+
usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / slaveinfo['resources'][CHECK_TYPE])
|
146
|
+
end
|
147
|
+
sorted = usages.sort_by { |_hostname, total| total }
|
148
|
+
max = usages.length - 1
|
149
|
+
check_diff['diff'] = sorted[max][1] - sorted[0][1]
|
150
|
+
check_diff['msg'] = "Hostname #{sorted[0][0]} uses #{sorted[0][1]}% and Hostname #{sorted[max][0]} uses #{sorted[max][1]}%"
|
151
|
+
end
|
152
|
+
check_diff
|
153
|
+
end
|
154
|
+
end
|