sensu-plugins-mesos 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-mem-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less Memory imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
# Sensu check that compares the memory utilisation (in percent) of the
# least-loaded and most-loaded slave reported by a Mesos master and alerts
# when the difference reaches the configured warning/critical thresholds.
class MesosMemBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosMemBalanceCheck'
  # Key of the slave list inside the JSON document returned by the endpoint.
  @metrics_name = 'slaves'.freeze
  # Resource key read from each slave's 'resources' / 'used_resources' hash.
  CHECK_TYPE = 'mem'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point of the check: validates the thresholds, queries the leading
  # master and reports critical/warning/ok depending on the usage diff.
  def run
    crit = config[:crit]
    warn_threshold = config[:warn]

    if crit < 0 || warn_threshold < 0
      unknown "Thresholds cannot be negative, crit: #{crit}, warn: #{warn_threshold}"
    end

    server = config[:server]

    begin
      server = get_leader_url server, config[:port]
      # The timeout has to be passed as an options hash; a bare positional
      # value is interpreted by rest-client as a basic-auth user name.
      r = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      compare = get_check_diff(get_slaves(r))
      # NOTE: the diff is never negative, so a threshold of 0 (the default)
      # always matches.
      if compare['diff'] >= crit
        critical "There is a Memory usage diff of #{compare['diff']} bigger than #{crit} #{compare['msg']}"
      end
      if compare['diff'] >= warn_threshold
        warning "There is a Memory usage diff of #{compare['diff']} bigger than #{warn_threshold} #{compare['msg']}"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [String] URL of the request that answered the /redirect call
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect", timeout: config[:timeout]).get.request.url
  end

  # Parses the JSON document returned by the slaves endpoint
  # @param data [String] Server response
  # @return [Array<Hash>] the value stored under the 'slaves' key
  # @raise [RuntimeError] if the response is not valid JSON or has no
  #   'slaves' entry
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosMemBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosMemBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the spread between the lowest and highest memory usage.
  # @param slavelist [Array<Hash>] slave entries; each is read for its
  #   'hostname', 'used_resources' and 'resources' keys
  # @return [Hash] 'diff' => highest usage minus lowest usage (percent),
  #   'msg' => human readable description of both extremes
  # @raise [RuntimeError] if no slave reports a non-zero amount of memory
  def get_check_diff(slavelist)
    usages = {}
    slavelist.each do |slaveinfo|
      total = slaveinfo['resources'][CHECK_TYPE]
      # A slave without any memory cannot be expressed as a percentage
      # (division by zero), so it is left out of the comparison.
      next if total.nil? || total.zero?

      usages.store(slaveinfo['hostname'], slaveinfo['used_resources'][CHECK_TYPE] * 100 / total)
    end

    if usages.empty?
      raise "No slave reporting [#{CHECK_TYPE}] resources found, cannot compute balance"
    end

    sorted = usages.sort_by { |_hostname, percent| percent }
    lowest = sorted.first
    highest = sorted.last

    {
      'diff' => highest[1] - lowest[1],
      'msg' => "Hostname #{lowest[0]} uses #{lowest[1]}% and Hostname #{highest[0]} uses #{highest[1]}%"
    }
  end
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-running-tasks
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there are running tasks on a mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
26
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
27
|
+
# for details.
|
28
|
+
#
|
29
|
+
|
30
|
+
require 'sensu-plugin/check/cli'
|
31
|
+
require 'rest-client'
|
32
|
+
require 'json'
|
33
|
+
require 'daybreak'
|
34
|
+
|
35
|
+
# Sensu check that reads the 'master/tasks_running' metric from the leading
# Mesos master and compares it (or, with --delta, its change since the
# previous run) against a value or a range.
class MesosRunningTaskCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosRunningTask'
  # Key of the metric inside the JSON document returned by the endpoint.
  @metrics_name = 'master/tasks_running'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :mode,
         description: 'eq ne lt gt or rg',
         short: '-m MODE',
         long: '--mode MODE',
         required: true

  option :min,
         description: 'min value on range',
         short: '-l VALUE',
         long: '--low VALUE',
         required: false,
         proc: proc(&:to_i),
         default: 0

  option :max,
         description: 'max value on range',
         short: '-h VALUE',
         long: '--high VALUE',
         required: false,
         proc: proc(&:to_i),
         default: 1

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point of the check: fetches the metric from the leading master,
  # evaluates it and reports ok when no comparison raised an alert.
  def run
    server = config[:server]

    begin
      server = get_leader_url server, config[:port]
      # The timeout has to be passed as an options hash; a bare positional
      # value is interpreted by rest-client as a basic-auth user name.
      r = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      metric_value = check_tasks(r)
      check_mesos_tasks(metric_value, config[:mode], config[:value], config[:min], config[:max])
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok "Found #{metric_value} tasks running"
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [String] URL of the request that answered the /redirect call
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect", timeout: config[:timeout]).get.request.url
  end

  # Parses JSON data as returned from Mesos API
  # @param data [String] Server response
  # @return [Numeric] Number of running tasks, rounded
  # @raise [RuntimeError] if the response is not valid JSON or lacks the
  #   'master/tasks_running' entry
  def check_tasks(data)
    begin
      running_tasks = JSON.parse(data)[MesosRunningTaskCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if running_tasks.nil?
      raise "No tasks in server response: #{data}"
    end

    running_tasks.round
  end

  # Compares the metric against the configured expectation and raises a
  # critical alert when the condition selected by +mode+ is met.
  # @param metric_value [Numeric] current number of running tasks
  # @param mode [String] one of 'eq', 'ne', 'lt', 'gt' or 'rg'
  # @param value [Integer] reference value for eq/ne/lt/gt
  # @param min [Integer, nil] lower bound for 'rg'
  # @param max [Integer, nil] upper bound for 'rg'
  def check_mesos_tasks(metric_value, mode, value, min, max)
    if config[:delta]
      db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
      begin
        prev_value = db['task_running']
        db.lock do
          db['task_running'] = metric_value
        end
        # From here on the comparison runs on the change since the last run.
        metric_value -= prev_value
        db.flush
        db.compact
      ensure
        # Close the database even when one of the operations above raised.
        db.close
      end
    end

    case mode
    when 'eq'
      # Value equality; `equal?` would compare object identity instead.
      critical "The number of running tasks cluster is equal to #{value}!" if metric_value == value
    when 'ne'
      critical "The number of running tasks cluster is not equal to #{value}!" if metric_value != value
    when 'lt'
      critical "The number of running tasks cluster is lower than #{value}!" if metric_value < value
    when 'gt'
      critical "The number of running tasks cluster is greater than #{value}!" if metric_value > value
    when 'rg'
      unless (min.to_i..max.to_i).cover? metric_value
        critical "The number of running tasks in cluster is not in #{min} - #{max} value range!"
      end
    else
      # An unrecognised mode would otherwise skip every comparison and
      # report OK without having checked anything.
      unknown "Unknown mode '#{mode}', expected one of: eq ne lt gt rg"
    end
  end
end
|
data/bin/check-mesos.rb
CHANGED
@@ -29,10 +29,6 @@
|
|
29
29
|
require 'sensu-plugin/check/cli'
|
30
30
|
require 'rest-client'
|
31
31
|
|
32
|
-
# Mesos default ports are defined here: http://mesos.apache.org/documentation/latest/configuration
|
33
|
-
MASTER_DEFAULT_PORT = '5050'.freeze
|
34
|
-
SLAVE_DEFAULT_PORT = '5051'.freeze
|
35
|
-
|
36
32
|
class MesosNodeStatus < Sensu::Plugin::Check::CLI
|
37
33
|
option :server,
|
38
34
|
description: 'Mesos servers, comma separated',
|
@@ -40,18 +36,19 @@ class MesosNodeStatus < Sensu::Plugin::Check::CLI
|
|
40
36
|
long: '--server SERVER1,SERVER2,...',
|
41
37
|
default: 'localhost'
|
42
38
|
|
43
|
-
option :mode,
|
44
|
-
description: 'master or slave',
|
45
|
-
short: '-m MODE',
|
46
|
-
long: '--mode MODE',
|
47
|
-
required: true
|
48
|
-
|
49
39
|
option :port,
|
50
|
-
description:
|
40
|
+
description: 'port (default 5050, use 5051 for slaves)',
|
51
41
|
short: '-p PORT',
|
52
42
|
long: '--port PORT',
|
43
|
+
default: 5050,
|
53
44
|
required: false
|
54
45
|
|
46
|
+
option :uri,
|
47
|
+
description: 'Endpoint URI',
|
48
|
+
short: '-u URI',
|
49
|
+
long: '--uri URI',
|
50
|
+
default: '/health'
|
51
|
+
|
55
52
|
option :timeout,
|
56
53
|
description: 'timeout in seconds',
|
57
54
|
short: '-t TIMEOUT',
|
@@ -60,31 +57,24 @@ class MesosNodeStatus < Sensu::Plugin::Check::CLI
|
|
60
57
|
default: 5
|
61
58
|
|
62
59
|
def run
|
63
|
-
mode = config[:mode]
|
64
60
|
servers = config[:server]
|
65
|
-
|
66
|
-
|
67
|
-
port = config[:port] || MASTER_DEFAULT_PORT
|
68
|
-
uri = '/master/health'
|
69
|
-
when 'slave'
|
70
|
-
port = config[:port] || SLAVE_DEFAULT_PORT
|
71
|
-
uri = '/slave(1)/health'
|
72
|
-
end
|
61
|
+
uri = config[:uri]
|
62
|
+
port = config[:port]
|
73
63
|
failures = []
|
74
64
|
servers.split(',').each do |server|
|
75
65
|
begin
|
76
66
|
r = RestClient::Resource.new("http://#{server}:#{port}#{uri}", timeout: config[:timeout]).get
|
77
67
|
if r.code != 200
|
78
|
-
failures << "
|
68
|
+
failures << "Mesos on #{server} is not responding"
|
79
69
|
end
|
80
70
|
rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
|
81
|
-
failures << "Mesos
|
71
|
+
failures << "Mesos on #{server} is not responding"
|
82
72
|
rescue RestClient::RequestTimeout
|
83
|
-
failures << "Mesos
|
73
|
+
failures << "Mesos on #{server} connection timed out"
|
84
74
|
end
|
85
75
|
end
|
86
76
|
if failures.empty?
|
87
|
-
ok "Mesos
|
77
|
+
ok "Mesos is running on #{servers}"
|
88
78
|
else
|
89
79
|
critical failures.join("\n")
|
90
80
|
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-metronome
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that Metronome can query the existing job graph.
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
#
|
18
|
+
# USAGE:
|
19
|
+
#
|
20
|
+
#
|
21
|
+
# NOTES:
|
22
|
+
#
|
23
|
+
# LICENSE:
|
24
|
+
# Copyright 2017, PTC (www.ptc.com)
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
|
32
|
+
# Sensu check that issues an HTTP GET against every listed Metronome host
# and reports critical if any of them fails to answer with HTTP 200.
class MetronomeNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Metronome hosts, comma separated',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'Metronome port',
         short: '-p PORT',
         long: '--port PORT',
         default: '9942'

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/v1/jobs'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point of the check: probes each host in turn, collects one failure
  # message per unhealthy host and reports ok only when there are none.
  def run
    servers = config[:server]
    uri = config[:uri]
    failures = []

    servers.split(',').each do |server|
      begin
        r = RestClient::Resource.new("http://#{server}:#{config[:port]}#{uri}", timeout: config[:timeout]).get
        failures << "Metronome on #{server} is not responding" if r.code != 200
      rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
        failures << "Metronome on #{server} is not responding"
      rescue RestClient::RequestTimeout
        failures << "Metronome on #{server} connection timed out"
      rescue RestClient::Exception => e
        # Any other rest-client error (e.g. an HTTP 5xx answer) is recorded
        # for this host so the remaining hosts are still checked.
        failures << "Metronome on #{server} returned an error: #{e.message}"
      end
    end

    if failures.empty?
      ok "Metronome is running on #{servers}"
    else
      critical failures.join("\n")
    end
  end
end
|