sensu-plugins-mesos 0.1.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-failed-tasks
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the number of failed tasks on a Mesos cluster is less than or equal to the provided value
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'rest-client'
32
+ require 'json'
33
+ require 'daybreak'
34
+
35
# Sensu check that alerts when a Mesos cluster reports more failed tasks than
# the configured threshold.
#
# The leading master is discovered through the `/redirect` endpoint of the
# configured server, the metrics endpoint of that leader is fetched, and the
# `master/tasks_failed` metric is compared against `--value`. With `--delta`
# the comparison uses the difference to the value stored by the previous run
# in the Daybreak database at /tmp/mesos-metrics.db.
class MesosFailedTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosFailedTasks'
  @metrics_name = 'master/tasks_failed'.freeze

  class << self
    # Name of the metric read from the metrics endpoint response.
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point of the check.
  #
  # Reports UNKNOWN for a negative threshold or when the server cannot be
  # reached, CRITICAL when the number of failed tasks exceeds the threshold,
  # and OK otherwise.
  def run
    unknown 'Number of failed tasks cannot be negative' if config[:value].to_i < 0

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout].to_i
    value = config[:value].to_i

    begin
      server = get_leader_url server, port
      # The timeout has to be passed as an option: a non-Hash second argument
      # is interpreted by rest-client as the basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      tasks_failed = check_tasks(r)
      tasks_failed = delta_since_last_run(tasks_failed) if config[:delta]

      # Strictly greater: the check promises that a count "less or same" than
      # the provided value is fine, so reaching the threshold is still OK.
      if tasks_failed > value
        critical "The number of FAILED tasks [#{tasks_failed}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of failed tasks in Mesos
  # @raise [RuntimeError] if the response is not valid JSON or lacks the metric
  def check_tasks(data)
    begin
      tasks_failed = JSON.parse(data)[MesosFailedTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_failed.nil?
      raise "No metrics for [#{MesosFailedTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_failed.round
  end

  private

  # Stores the current metric value and returns its difference to the value
  # stored by the previous run (0 when nothing has been stored yet).
  # @param current [Integer] value retrieved in this run
  # @return [Integer] current value minus the previously stored one
  def delta_since_last_run(current)
    key = "task_#{MesosFailedTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and replace under the same lock so the pair is not interleaved
      # with another process using the same database file.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Close the database even when reading or writing failed.
      db.close
    end
    current - previous
  end
end
@@ -0,0 +1,154 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-gpu-balance
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that there is less GPU imbalance than specified on a certain mesos cluster
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+ require 'json'
32
+
33
# Sensu check that alerts when GPU usage is unevenly spread across the agents
# of a Mesos cluster.
#
# The leading master is discovered through the `/redirect` endpoint of the
# configured server and its agent list is fetched. For every agent the
# percentage of used GPUs is computed, and the gap between the most and the
# least loaded agent is compared against the warning and critical thresholds.
class MesosGpuBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosGpuBalanceCheck'
  @metrics_name = 'slaves'.freeze
  CHECK_TYPE = 'gpus'.freeze

  class << self
    # Key of the agent list in the endpoint response.
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point of the check.
  #
  # Reports UNKNOWN for negative thresholds or when the server cannot be
  # reached, CRITICAL/WARNING when the usage gap reaches the respective
  # threshold, and OK otherwise.
  def run
    if config[:crit] < 0 || config[:warn] < 0
      unknown "Thresholds cannot be negative, crit: #{config[:crit]}, warn: #{config[:warn]}"
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout]
    crit = config[:crit]
    warn_threshold = config[:warn]

    begin
      server = get_leader_url server, port
      # The timeout has to be passed as an option: a non-Hash second argument
      # is interpreted by rest-client as the basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      compare = get_check_diff(get_slaves(r))
      if compare['diff'] >= crit
        critical "There is a GPU usage diff of #{compare['diff']} bigger than #{crit} #{compare['msg']}"
      end
      if compare['diff'] >= warn_threshold
        warning "There is a GPU usage diff of #{compare['diff']} bigger than #{warn_threshold} #{compare['msg']}"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from the Mesos agents endpoint
  # @param data [String] Server response
  # @return [Array<Hash>] Agent entries found under the `slaves` key
  # @raise [RuntimeError] if the response is not valid JSON or lacks the key
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosGpuBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosGpuBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the gap between the highest and the lowest GPU usage percentage.
  #
  # Agents that expose no GPUs (missing or zero total) are left out: they have
  # no usage percentage, and dividing by their total would fail or yield NaN.
  #
  # @param slavelist [Array<Hash>] Agent entries as returned by #get_slaves
  # @return [Hash] 'diff' holds the usage gap in percentage points, 'msg' a
  #   description of the two agents at the extremes
  def get_check_diff(slavelist)
    usages = {}
    slavelist.each do |slaveinfo|
      total = (slaveinfo['resources'] || {})[CHECK_TYPE]
      next if total.nil? || total.zero?

      used = (slaveinfo['used_resources'] || {})[CHECK_TYPE] || 0
      usages.store(slaveinfo['hostname'], used * 100 / total)
    end

    # Nothing to compare: report a zero gap instead of failing on nil.
    return { 'diff' => 0, 'msg' => 'No agents with GPU resources were found' } if usages.empty?

    sorted = usages.sort_by { |_hostname, percent| percent }
    low_host, low_usage = sorted.first
    high_host, high_usage = sorted.last

    {
      'diff' => high_usage - low_usage,
      'msg' => "Hostname #{low_host} uses #{low_usage}% and Hostname #{high_host} uses #{high_usage}%"
    }
  end
end
@@ -0,0 +1,75 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-leader-status
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the health url of the leader master returns 200 OK
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ #
18
+ # USAGE:
19
+ # #YELLOW
20
+ #
21
+ # NOTES:
22
+ #
23
+ # LICENSE:
24
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+
32
# Sensu check that verifies the leading Mesos master can be reached by
# requesting the configured endpoint (by default `/redirect`) on the
# configured server.
class MesosLeaderNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/redirect'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point of the check.
  #
  # Reports CRITICAL when the endpoint answers 503, cannot be reached or times
  # out, and OK otherwise.
  def run
    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    begin
      RestClient::Resource.new("http://#{server}:#{port}#{uri}", timeout: config[:timeout]).get
    rescue RestClient::ServiceUnavailable
      # rest-client raises on a 503 response instead of returning it, so the
      # status has to be handled here rather than by inspecting the response.
      critical "Master on #{server} is not responding"
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      critical "Mesos on #{server} is not responding"
    rescue RestClient::RequestTimeout
      critical "Mesos on #{server} connection timed out"
    end
    ok
  end
end
@@ -0,0 +1,139 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-lost-tasks
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that the number of lost tasks on a Mesos cluster is less than or equal to the provided value
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'rest-client'
32
+ require 'json'
33
+ require 'daybreak'
34
+
35
# Sensu check that alerts when a Mesos cluster reports more lost tasks than
# the configured threshold.
#
# The leading master is discovered through the `/redirect` endpoint of the
# configured server, the metrics endpoint of that leader is fetched, and the
# `master/tasks_lost` metric is compared against `--value`. With `--delta`
# the comparison uses the difference to the value stored by the previous run
# in the Daybreak database at /tmp/mesos-metrics.db.
class MesosLostTasksCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosLostTasks'
  @metrics_name = 'master/tasks_lost'.freeze

  class << self
    # Name of the metric read from the metrics endpoint response.
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         default: 0,
         proc: proc(&:to_i),
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retreived value',
         boolean: true

  # Entry point of the check.
  #
  # Reports UNKNOWN for a negative threshold or when the server cannot be
  # reached, CRITICAL when the number of lost tasks exceeds the threshold,
  # and OK otherwise.
  def run
    if config[:value] < 0
      unknown 'Number of lost tasks cannot be negative, please set --value to a number greater or equal to 0'
    end

    server = config[:server]
    port = config[:port]
    uri = config[:uri]
    timeout = config[:timeout]
    value = config[:value]

    begin
      server = get_leader_url server, port
      # The timeout has to be passed as an option: a non-Hash second argument
      # is interpreted by rest-client as the basic-auth user name.
      r = RestClient::Resource.new("#{server}#{uri}", timeout: timeout).get
      tasks_lost = check_tasks(r)
      tasks_lost = delta_since_last_run(tasks_lost) if config[:delta]

      # Strictly greater: the check promises that a count "less or same" than
      # the provided value is fine, so reaching the threshold is still OK.
      if tasks_lost > value
        critical "The number of LOST tasks [#{tasks_lost}] is bigger than provided [#{value}]!"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      # Same handling as the failed-tasks check: an unreachable server is
      # reported as UNKNOWN instead of surfacing as an unhandled exception.
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [Url] Url representing the Leader
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos's metrics API
  # @param data [String] Server response
  # @return [Integer] Number of lost tasks in Mesos
  # @raise [RuntimeError] if the response is not valid JSON or lacks the metric
  def check_tasks(data)
    begin
      tasks_lost = JSON.parse(data)[MesosLostTasksCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if tasks_lost.nil?
      raise "No metrics for [#{MesosLostTasksCheck.metrics_name}] in server response: #{data}"
    end

    tasks_lost.round
  end

  private

  # Stores the current metric value and returns its difference to the value
  # stored by the previous run (0 when nothing has been stored yet).
  # @param current [Integer] value retrieved in this run
  # @return [Integer] current value minus the previously stored one
  def delta_since_last_run(current)
    key = "task_#{MesosLostTasksCheck.metrics_name}"
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    begin
      previous = nil
      # Read and replace under the same lock so the pair is not interleaved
      # with another process using the same database file.
      db.lock do
        previous = db[key]
        db[key] = current
      end
      db.flush
      db.compact
    ensure
      # Close the database even when reading or writing failed.
      db.close
    end
    current - previous
  end
end