sensu-plugins-mesos 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,146 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-failed-tasks
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the number of failed tasks on a Mesos cluster does not exceed the provided value
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'rest-client'
32
+ require 'json'
33
+ require 'daybreak'
34
+
35
# Sensu check that turns critical when the number of failed Mesos tasks
# (the `master/tasks_failed` metric) is bigger than a threshold.
#
# The leading master is discovered through the `/redirect` endpoint of the
# configured server and the metric is read from that master's metrics
# endpoint. With `--delta` the compared value is the change since the
# previous run; the last seen value is persisted in a Daybreak database.
class MesosFailedTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosFailedTasks'
  @metrics_name = 'master/tasks_failed'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point of the check: fetches the metric from the leading master and
  # reports critical when it is bigger than `--value`, unknown when the
  # master cannot be reached, and ok otherwise.
  def run
    if config[:value].to_i < 0
      unknown 'Number of failed tasks cannot be negative'
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout].to_i
    value = config[:value].to_i

    begin
      server = get_leader_url server, port, timeout
      # The second argument of Resource.new is an options hash; a bare
      # integer would be interpreted as a basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      tasks_failed = check_tasks(r)
      tasks_failed = delta_since_last_run(tasks_failed) if config[:delta]

      # Strictly bigger: a count equal to the threshold is still acceptable.
      if tasks_failed > value
        critical "The number of FAILED tasks [#{tasks_failed}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @param timeout [Integer, nil] request timeout in seconds; when nil the
  #   rest-client default is used
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port, timeout = nil)
    options = timeout ? { timeout: timeout } : {}
    RestClient::Resource.new("http://#{server}:#{port}/redirect", options).get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of failed tasks in Mesos
  # @raise [RuntimeError] when the response is not valid JSON or does not
  #   contain the metric
  def check_tasks(data)
    begin
      tasks_failed = JSON.parse(data)[MesosFailedTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_failed.nil?
      raise "No metrics for [#{MesosFailedTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_failed.round.to_i
  end

  private

  # Stores the current metric value and returns its difference to the value
  # stored by the previous run (0 when nothing was stored yet).
  # @param current [Integer] value retrieved during this run
  # @return [Integer] current value minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosFailedTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    previous = 0
    begin
      # Read and write under the same lock so a concurrent run cannot slip
      # in between the two operations.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      db.close
    end
    current - previous
  end
end
@@ -0,0 +1,154 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-gpu-balance
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that there is less GPU imbalance than specified on a certain mesos cluster
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+ require 'json'
32
+
33
# Sensu check that compares the GPU usage percentage of the Mesos agents
# reported by the leading master and alerts when the gap between the most
# and the least used agent reaches the warning or critical threshold.
class MesosGpuBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosGpuBalanceCheck'
  @metrics_name = 'slaves'.freeze
  CHECK_TYPE = 'gpus'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point of the check: fetches the agent list from the leading master
  # and reports critical/warning when the usage gap reaches the respective
  # threshold, unknown when the master cannot be reached, and ok otherwise.
  def run
    if config[:crit] < 0 || config[:warn] < 0
      unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout]
    crit = config[:crit]
    warn = config[:warn]

    begin
      server = get_leader_url server, port, timeout
      # The second argument of Resource.new is an options hash; a bare
      # integer would be interpreted as a basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      compare = get_check_diff(get_slaves(r))
      if compare['diff'] >= crit
        critical "There is a GPU usage diff of #{compare['diff']} bigger than #{crit} #{compare['msg']}"
      end
      if compare['diff'] >= warn
        warning "There is a GPU usage diff of #{compare['diff']} bigger than #{warn} #{compare['msg']}"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @param timeout [Integer, nil] request timeout in seconds; when nil the
  #   rest-client default is used
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port, timeout = nil)
    options = timeout ? { timeout: timeout } : {}
    RestClient::Resource.new("http://#{server}:#{port}/redirect", options).get.request.url
  end

  # Parses JSON data as returned from the Mesos master and extracts the
  # entry stored under the `slaves` key.
  # @param data [String] Server response
  # @return [Object] value of the `slaves` key; get_check_diff iterates it
  #   as a list of per-agent hashes
  # @raise [RuntimeError] when the response is not valid JSON or has no
  #   `slaves` entry
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosGpuBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosGpuBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the GPU usage percentage of every agent and the gap between the
  # most and the least used one.
  # @param slavelist [Array<Hash>] agents, each with 'hostname',
  #   'resources' and 'used_resources' entries
  # @return [Hash] 'diff' holds the usage gap in percentage points and 'msg'
  #   a human readable summary naming both agents
  def get_check_diff(slavelist)
    usages = {}
    slavelist.each do |slaveinfo|
      total = slaveinfo['resources'][CHECK_TYPE]
      # An agent without GPUs has no meaningful usage percentage, and
      # dividing by its zero capacity would raise or yield NaN.
      next if total.nil? || total.zero?

      used = slaveinfo['used_resources'][CHECK_TYPE] || 0
      usages.store(slaveinfo['hostname'], used * 100 / total)
    end

    # Nothing to compare: report a zero gap instead of failing on nil.
    if usages.empty?
      return { 'diff' => 0, 'msg' => 'No agents with GPU resources were found' }
    end

    lowest = usages.min_by { |_hostname, usage| usage }
    highest = usages.max_by { |_hostname, usage| usage }
    {
      'diff' => highest[1] - lowest[1],
      'msg' => "Hostname #{lowest[0]} uses #{lowest[1]}% and Hostname #{highest[0]} uses #{highest[1]}%"
    }
  end
end
@@ -0,0 +1,75 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-leader-status
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the health url of the leader master returns 200 OK
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ #
18
+ # USAGE:
19
+ # #YELLOW
20
+ #
21
+ # NOTES:
22
+ #
23
+ # LICENSE:
24
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+
32
# Sensu check that requests the configured endpoint (by default `/redirect`)
# on a Mesos master and reports critical when the request is answered with
# 503, is refused, is not found or times out.
class MesosLeaderNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/redirect'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point of the check: performs the request and maps failures to a
  # critical status; a successful response yields ok.
  def run
    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    begin
      RestClient::Resource.new("http://#{server}:#{port}#{uri}", timeout: config[:timeout]).get
    rescue RestClient::ServiceUnavailable
      # rest-client raises on a 503 response instead of returning it, so the
      # status has to be handled here rather than by inspecting the code.
      critical "Master on #{server} is not responding"
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      critical "Mesos on #{server} is not responding"
    rescue RestClient::RequestTimeout
      critical "Mesos on #{server} connection timed out"
    end
    ok
  end
end
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-lost-tasks
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the number of lost tasks on a Mesos cluster does not exceed the provided value
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'rest-client'
32
+ require 'json'
33
+ require 'daybreak'
34
+
35
# Sensu check that turns critical when the number of lost Mesos tasks
# (the `master/tasks_lost` metric) is bigger than a threshold.
#
# The leading master is discovered through the `/redirect` endpoint of the
# configured server and the metric is read from that master's metrics
# endpoint. With `--delta` the compared value is the change since the
# previous run; the last seen value is persisted in a Daybreak database.
class MesosLostTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosLostTasks'
  @metrics_name = 'master/tasks_lost'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         default: 0,
         proc: proc(&:to_i),
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retreived value',
         boolean: true

  # Entry point of the check: fetches the metric from the leading master and
  # reports critical when it is bigger than `--value`, unknown when the
  # master cannot be reached, and ok otherwise.
  def run
    if config[:value] < 0
      unknown 'Number of lost tasks cannot be negative, please set --value to a number greater or equal to 0'
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout]
    value = config[:value]

    begin
      server = get_leader_url server, port, timeout
      # The second argument of Resource.new is an options hash; a bare
      # integer would be interpreted as a basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      tasks_lost = check_tasks(r)
      tasks_lost = delta_since_last_run(tasks_lost) if config[:delta]

      # Strictly bigger: a count equal to the threshold is still acceptable.
      if tasks_lost > value
        critical "The number of LOST tasks [#{tasks_lost}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @param timeout [Integer, nil] request timeout in seconds; when nil the
  #   rest-client default is used
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port, timeout = nil)
    options = timeout ? { timeout: timeout } : {}
    RestClient::Resource.new("http://#{server}:#{port}/redirect", options).get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of lost tasks in Mesos
  # @raise [RuntimeError] when the response is not valid JSON or does not
  #   contain the metric
  def check_tasks(data)
    begin
      tasks_lost = JSON.parse(data)[MesosLostTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_lost.nil?
      raise "No metrics for [#{MesosLostTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_lost.round.to_i
  end

  private

  # Stores the current metric value and returns its difference to the value
  # stored by the previous run (0 when nothing was stored yet).
  # @param current [Integer] value retrieved during this run
  # @return [Integer] current value minus the previously stored value
  def delta_since_last_run(current)
    key = "task_#{MesosLostTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    previous = 0
    begin
      # Read and write under the same lock so a concurrent run cannot slip
      # in between the two operations.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      db.close
    end
    current - previous
  end
end