sensu-plugins-mesos 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-failed-tasks
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the number of failed tasks on a Mesos cluster does not exceed the provided value
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
26
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
27
|
+
# for details.
|
28
|
+
#
|
29
|
+
|
30
|
+
require 'sensu-plugin/check/cli'
|
31
|
+
require 'rest-client'
|
32
|
+
require 'json'
|
33
|
+
require 'daybreak'
|
34
|
+
|
35
|
+
# Sensu check that goes critical when the number of failed tasks reported by
# the leading Mesos master exceeds the configured value. With --delta the
# comparison is made against the growth since the previous run instead of the
# absolute counter.
class MesosFailedTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosFailedTasks'
  @metrics_name = 'master/tasks_failed'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point invoked by the Sensu plugin runner.
  # Emits unknown on bad arguments or connectivity problems, critical when the
  # (optionally delta-adjusted) failed task count is bigger than --value, and
  # ok otherwise.
  def run
    value = config[:value].to_i
    unknown 'Number of failed tasks cannot be negative' if value < 0

    server = config[:server]
    timeout = config[:timeout].to_i

    begin
      server = get_leader_url(server, config[:port], timeout)
      # The timeout must be given as an options hash: a bare second argument
      # is interpreted by rest-client as a basic-auth user name.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: timeout).get
      tasks_failed = check_tasks(response)
      tasks_failed = delta_since_last_run(tasks_failed) if config[:delta]

      # Strictly greater: a count equal to the provided value is acceptable.
      if tasks_failed > value
        critical "The number of FAILED tasks [#{tasks_failed}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @param timeout [Integer, nil] optional request timeout in seconds
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port, timeout = nil)
    options = timeout ? { timeout: timeout } : {}
    RestClient::Resource.new("http://#{server}:#{port}/redirect", options).get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of failed tasks in Mesos
  # @raise [RuntimeError] when the response is not JSON or lacks the metric
  def check_tasks(data)
    begin
      tasks_failed = JSON.parse(data)[MesosFailedTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_failed.nil?
      raise "No metrics for [#{MesosFailedTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_failed.round.to_i
  end

  private

  # Stores the current counter in the Daybreak database and returns how much
  # it changed since the previously stored value (0 when nothing was stored).
  # @param current [Integer] counter value just read from Mesos
  # @return [Integer] current minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosFailedTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and write under the same lock so concurrent checks sharing the
      # file cannot interleave between the read and the update.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Always release the file, even when the update raises.
      db.close
    end
    current - previous
  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-gpu-balance
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that there is less GPU imbalance than specified on a certain mesos cluster
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
require 'json'
|
32
|
+
|
33
|
+
# Sensu check that compares GPU utilisation (percent of GPUs in use) across
# the agents of a Mesos cluster and alerts when the gap between the least and
# the most used agent reaches the configured thresholds.
class MesosGpuBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosGpuBalanceCheck'
  @metrics_name = 'slaves'.freeze
  CHECK_TYPE = 'gpus'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point invoked by the Sensu plugin runner.
  # Emits unknown on bad thresholds or connectivity problems, critical or
  # warning when the usage difference reaches the respective threshold, and
  # ok otherwise.
  def run
    crit = config[:crit]
    warn_threshold = config[:warn]
    if crit < 0 || warn_threshold < 0
      unknown "Thresholds cannot be negative, crit: #{crit}, warn: #{warn_threshold}"
    end

    server = config[:server]

    begin
      server = get_leader_url server, config[:port]
      # The timeout must be given as an options hash: a bare second argument
      # is interpreted by rest-client as a basic-auth user name.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      compare = get_check_diff(get_slaves(response))
      if compare['diff'] >= crit
        critical "There is a GPU usage diff of #{compare['diff']} bigger than #{crit} " + compare['msg']
      end
      if compare['diff'] >= warn_threshold
        warning "There is a GPU usage diff of #{compare['diff']} bigger than #{warn_threshold} " + compare['msg']
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from the Mesos master and extracts the
  # agent list stored under the 'slaves' key.
  # @param data [String] Server response
  # @return [Array<Hash>] one entry per agent as found in the response
  # @raise [RuntimeError] when the response is not JSON or lacks the key
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosGpuBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosGpuBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the spread between the least and the most GPU-utilised agent.
  # Agents that advertise no GPUs (missing or zero total) are ignored, since a
  # usage percentage is undefined for them.
  # @param slavelist [Array<Hash>] agents with 'hostname', 'resources' and
  #   'used_resources' entries
  # @return [Hash] 'diff' (usage gap in percentage points) and 'msg'
  #   (human readable description of the two extremes)
  def get_check_diff(slavelist)
    usages = {}
    slavelist.each do |slaveinfo|
      total = slaveinfo['resources'][CHECK_TYPE]
      # Skip agents without GPUs to avoid dividing by zero.
      next if total.nil? || total.zero?
      used = slaveinfo['used_resources'][CHECK_TYPE] || 0
      usages.store(slaveinfo['hostname'], used * 100 / total)
    end

    # Nothing to compare: report a perfectly balanced (zero) difference.
    if usages.empty?
      return { 'diff' => 0, 'msg' => "No agents reporting #{CHECK_TYPE} resources" }
    end

    sorted = usages.sort_by { |_hostname, total| total }
    lowest = sorted.first
    highest = sorted.last
    {
      'diff' => highest[1] - lowest[1],
      'msg' => "Hostname #{lowest[0]} uses #{lowest[1]}% and Hostname #{highest[0]} uses #{highest[1]}%"
    }
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-leader-status
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the health url of the leader master returns 200 OK
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
#
|
18
|
+
# USAGE:
|
19
|
+
# #YELLOW
|
20
|
+
#
|
21
|
+
# NOTES:
|
22
|
+
#
|
23
|
+
# LICENSE:
|
24
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
25
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
26
|
+
# for details.
|
27
|
+
#
|
28
|
+
|
29
|
+
require 'sensu-plugin/check/cli'
|
30
|
+
require 'rest-client'
|
31
|
+
|
32
|
+
# Sensu check that requests the redirect endpoint of a Mesos master and goes
# critical when the request fails, times out or is answered with
# 503 Service Unavailable.
class MesosLeaderNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/redirect'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point invoked by the Sensu plugin runner.
  # Emits critical when the endpoint cannot be reached, times out or answers
  # 503, and ok otherwise.
  def run
    server = config[:server]
    endpoint = "http://#{server}:#{config[:port]}#{config[:uri]}"

    begin
      RestClient::Resource.new(endpoint, timeout: config[:timeout]).get
    rescue RestClient::ServiceUnavailable
      # rest-client raises for a 503 instead of returning the response, so the
      # status has to be handled here rather than by inspecting a return code.
      critical "Master on #{server} is not responding"
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      critical "Mesos on #{server} is not responding"
    rescue RestClient::RequestTimeout
      critical "Mesos on #{server} connection timed out"
    end
    ok
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mesos-lost-tasks
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# This plugin checks that the number of lost tasks on a Mesos cluster does not exceed the provided value
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: rest-client
|
17
|
+
# gem: json
|
18
|
+
#
|
19
|
+
# USAGE:
|
20
|
+
# #YELLOW
|
21
|
+
#
|
22
|
+
# NOTES:
|
23
|
+
#
|
24
|
+
# LICENSE:
|
25
|
+
# Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
|
26
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
27
|
+
# for details.
|
28
|
+
#
|
29
|
+
|
30
|
+
require 'sensu-plugin/check/cli'
|
31
|
+
require 'rest-client'
|
32
|
+
require 'json'
|
33
|
+
require 'daybreak'
|
34
|
+
|
35
|
+
# Sensu check that goes critical when the number of lost tasks reported by
# the leading Mesos master exceeds the configured value. With --delta the
# comparison is made against the growth since the previous run instead of the
# absolute counter.
class MesosLostTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosLostTasks'
  @metrics_name = 'master/tasks_lost'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         default: 0,
         proc: proc(&:to_i),
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retreived value',
         boolean: true

  # Entry point invoked by the Sensu plugin runner.
  # Emits unknown on bad arguments or connectivity problems, critical when the
  # (optionally delta-adjusted) lost task count is bigger than --value, and
  # ok otherwise.
  def run
    value = config[:value]
    if value < 0
      unknown 'Number of lost tasks cannot be negative, please set --value to a number greater or equal to 0'
    end

    server = config[:server]
    timeout = config[:timeout]

    begin
      server = get_leader_url(server, config[:port], timeout)
      # The timeout must be given as an options hash: a bare second argument
      # is interpreted by rest-client as a basic-auth user name.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: timeout).get
      tasks_lost = check_tasks(response)
      tasks_lost = delta_since_last_run(tasks_lost) if config[:delta]

      # Strictly greater: a count equal to the provided value is acceptable.
      if tasks_lost > value
        critical "The number of LOST tasks [#{tasks_lost}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @param timeout [Integer, nil] optional request timeout in seconds
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port, timeout = nil)
    options = timeout ? { timeout: timeout } : {}
    RestClient::Resource.new("http://#{server}:#{port}/redirect", options).get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of lost tasks in Mesos
  # @raise [RuntimeError] when the response is not JSON or lacks the metric
  def check_tasks(data)
    begin
      tasks_lost = JSON.parse(data)[MesosLostTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_lost.nil?
      raise "No metrics for [#{MesosLostTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_lost.round.to_i
  end

  private

  # Stores the current counter in the Daybreak database and returns how much
  # it changed since the previously stored value (0 when nothing was stored).
  # @param current [Integer] counter value just read from Mesos
  # @return [Integer] current minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosLostTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and write under the same lock so concurrent checks sharing the
      # file cannot interleave between the read and the update.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Always release the file, even when the update raises.
      db.close
    end
    current - previous
  end
end
|