sensu-plugins-mesos 0.1.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-failed-tasks
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the number of failed tasks on a Mesos cluster does not exceed the provided value
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
26
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
27
|
+
# for details.
|
28
|
+
#
|
29
|
+
|
30
|
+
require 'sensu-plugin/check/cli'
|
31
|
+
require 'rest-client'
|
32
|
+
require 'json'
|
33
|
+
require 'daybreak'
|
34
|
+
|
35
|
+
# Sensu check that goes CRITICAL when the Mesos leader reports more failed
# tasks than the configured threshold.
#
# The counter is read from the 'master/tasks_failed' entry of the JSON
# returned by the leader's metrics endpoint. With --delta the value compared
# is the difference to the value stored by the previous run.
class MesosFailedTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosFailedTasks'
  @metrics_name = 'master/tasks_failed'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point of the check: resolves the leader, fetches the metrics and
  # compares the failed-task count (or its delta) with --value.
  def run
    value = config[:value].to_i
    unknown 'Number of failed tasks cannot be negative' if value < 0

    server = config[:server]
    timeout = config[:timeout].to_i

    begin
      server = get_leader_url server, config[:port]
      # rest-client expects an options Hash as second argument; a bare
      # integer is not interpreted as a timeout.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: timeout).get
      tasks_failed = check_tasks(response)
      tasks_failed = delta_since_last_run(tasks_failed) if config[:delta]

      # Strictly greater: the message says "bigger than", and with >= the
      # default threshold of 0 would be CRITICAL even with no failed tasks.
      if tasks_failed > value
        critical "The number of FAILED tasks [#{tasks_failed}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of failed tasks in Mesos
  # @raise [RuntimeError] if the response is not valid JSON or the metric is missing
  def check_tasks(data)
    begin
      tasks_failed = JSON.parse(data)[MesosFailedTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_failed.nil?
      raise "No metrics for [#{MesosFailedTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_failed.round.to_i
  end

  private

  # Stores the current counter in the Daybreak database and returns how much
  # it changed since the previously stored value (0 when nothing was stored).
  # @param current [Integer] counter value just read from Mesos
  # @return [Integer] current minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosFailedTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and write under the same lock so a concurrent run cannot slip
      # in between fetching the old value and storing the new one.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Always release the database, even when an error is raised above.
      db.close
    end
    current - previous
  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-gpu-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less GPU imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
# Sensu check that compares the GPU usage percentage of the least and most
# loaded Mesos slaves and alerts when the spread reaches the warning or
# critical threshold.
class MesosGpuBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosGpuBalanceCheck'
  @metrics_name = 'slaves'.freeze
  CHECK_TYPE = 'gpus'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point of the check: resolves the leader, fetches the slave list and
  # compares the usage spread with the critical and warning thresholds.
  def run
    # Named *_limit so the locals do not shadow Kernel#warn.
    crit_limit = config[:crit]
    warn_limit = config[:warn]
    if crit_limit < 0 || warn_limit < 0
      unknown "Thresholds cannot be negative, crit: #{crit_limit}, warn: #{warn_limit}"
    end

    server = config[:server]

    begin
      server = get_leader_url server, config[:port]
      # rest-client expects an options Hash as second argument; a bare
      # integer is not interpreted as a timeout.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      compare = get_check_diff(get_slaves(response))
      if compare['diff'] >= crit_limit
        critical "There is a GPU usage diff of #{compare['diff']} bigger than #{crit_limit} #{compare['msg']}"
      end
      if compare['diff'] >= warn_limit
        warning "There is a GPU usage diff of #{compare['diff']} bigger than #{warn_limit} #{compare['msg']}"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from the Mesos slaves endpoint
  # @param data [String] Server response
  # @return [Array<Hash>] value stored under the 'slaves' key, one entry per slave
  # @raise [RuntimeError] if the response is not valid JSON or has no 'slaves' key
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosGpuBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosGpuBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the spread between the lowest and highest usage percentage.
  # Slaves that report no (or zero) total for the resource are skipped, since
  # a usage percentage cannot be computed for them.
  # @param slavelist [Array<Hash>] slaves with 'hostname', 'resources' and 'used_resources'
  # @return [Hash] 'diff' (usage spread) and 'msg' (human readable summary)
  def get_check_diff(slavelist)
    usages = {}
    slavelist.each do |slaveinfo|
      total = (slaveinfo['resources'] || {})[CHECK_TYPE]
      next if total.nil? || total.zero?
      used = (slaveinfo['used_resources'] || {})[CHECK_TYPE] || 0
      usages[slaveinfo['hostname']] = used * 100 / total
    end

    # Nothing to compare: report a zero spread instead of failing on nil.
    if usages.empty?
      return { 'diff' => 0, 'msg' => "No slaves report any #{CHECK_TYPE} resources" }
    end

    sorted = usages.sort_by { |_hostname, usage| usage }
    lowest_host, lowest_usage = sorted.first
    highest_host, highest_usage = sorted.last

    {
      'diff' => highest_usage - lowest_usage,
      'msg' => "Hostname #{lowest_host} uses #{lowest_usage}% and Hostname #{highest_host} uses #{highest_usage}%"
    }
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-leader-status
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the health url of the leader master returns 200 OK
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
#
|
18
|
+
# USAGE:
|
19
|
+
# #YELLOW
|
20
|
+
#
|
21
|
+
# NOTES:
|
22
|
+
#
|
23
|
+
# LICENSE:
|
24
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
|
32
|
+
# Sensu check that requests the configured endpoint (default '/redirect') on
# a Mesos master and goes CRITICAL when the request fails.
class MesosLeaderNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/redirect'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point of the check: performs the GET request and maps failures to
  # CRITICAL results; a successful response yields OK.
  def run
    server = config[:server]
    endpoint = "http://#{server}:#{config[:port]}#{config[:uri]}"

    begin
      RestClient::Resource.new(endpoint, timeout: config[:timeout]).get
    rescue RestClient::ServiceUnavailable
      # rest-client raises on a 503 response instead of returning it, so the
      # status has to be handled here rather than by inspecting the response.
      critical "Master on #{server} is not responding"
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      critical "Mesos on #{server} is not responding"
    rescue RestClient::RequestTimeout
      critical "Mesos on #{server} connection timed out"
    end
    ok
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-lost-tasks
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the number of lost tasks on a Mesos cluster does not exceed the provided value
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
26
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
27
|
+
# for details.
|
28
|
+
#
|
29
|
+
|
30
|
+
require 'sensu-plugin/check/cli'
|
31
|
+
require 'rest-client'
|
32
|
+
require 'json'
|
33
|
+
require 'daybreak'
|
34
|
+
|
35
|
+
# Sensu check that goes CRITICAL when the Mesos leader reports more lost
# tasks than the configured threshold.
#
# The counter is read from the 'master/tasks_lost' entry of the JSON returned
# by the leader's metrics endpoint. With --delta the value compared is the
# difference to the value stored by the previous run.
class MesosLostTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosLostTasks'
  @metrics_name = 'master/tasks_lost'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         default: 0,
         proc: proc(&:to_i),
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retreived value',
         boolean: true

  # Entry point of the check: resolves the leader, fetches the metrics and
  # compares the lost-task count (or its delta) with --value.
  def run
    value = config[:value]
    if value < 0
      unknown 'Number of lost tasks cannot be negative, please set --value to a number greater or equal to 0'
    end

    server = config[:server]

    begin
      server = get_leader_url server, config[:port]
      # remove comment for debugging purpose
      # puts(server)

      # rest-client expects an options Hash as second argument; a bare
      # integer is not interpreted as a timeout.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      tasks_lost = check_tasks(response)
      tasks_lost = delta_since_last_run(tasks_lost) if config[:delta]

      # Strictly greater: the message says "bigger than", and with >= the
      # default threshold of 0 would be CRITICAL even with no lost tasks.
      if tasks_lost > value
        critical "The number of LOST tasks [#{tasks_lost}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of lost tasks in Mesos
  # @raise [RuntimeError] if the response is not valid JSON or the metric is missing
  def check_tasks(data)
    begin
      tasks_lost = JSON.parse(data)[MesosLostTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_lost.nil?
      raise "No metrics for [#{MesosLostTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_lost.round.to_i
  end

  private

  # Stores the current counter in the Daybreak database and returns how much
  # it changed since the previously stored value (0 when nothing was stored).
  # @param current [Integer] counter value just read from Mesos
  # @return [Integer] current minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosLostTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and write under the same lock so a concurrent run cannot slip
      # in between fetching the old value and storing the new one.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Always release the database, even when an error is raised above.
      db.close
    end
    current - previous
  end
end
|