sensu-plugins-mesos 0.1.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-mem-balance
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that there is less Memory imbalance than specified on a certain mesos cluster
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+ require 'json'
32
+
33
# Sensu check that compares memory utilisation (in percent) across the slaves
# of a Mesos cluster and alerts when the gap between the most loaded and the
# least loaded slave reaches a configured threshold.
class MesosMemBalanceCheck < Sensu::Plugin::Check::CLI
  check_name 'MesosMemBalanceCheck'
  # Key of the slave list inside the JSON document returned by the endpoint.
  @metrics_name = 'slaves'.freeze
  # Resource whose balance is evaluated; used as key in the per-slave
  # 'resources' and 'used_resources' hashes.
  CHECK_TYPE = 'mem'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/master/slaves'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :crit,
         description: 'Critical value to check against',
         short: '-c VALUE',
         long: '--critical VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :warn,
         description: 'Warning value to check against',
         short: '-w VALUE',
         long: '--warning VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  # Entry point invoked by the Sensu plugin runner.
  #
  # Resolves the leading master, fetches the slave list and reports
  # critical/warning when the usage spread reaches the matching threshold.
  # A threshold of 0 (the default) disables that level: every spread is >= 0,
  # so comparing against 0 would make the check fire unconditionally.
  def run
    crit = config[:crit]
    # Not named `warn` so that Kernel#warn is not shadowed.
    warn_threshold = config[:warn]

    if crit < 0 || warn_threshold < 0
      unknown "Thresholds cannot be negative, crit: #{crit}, warn: #{warn_threshold}"
    end

    server = config[:server]

    begin
      server = get_leader_url(server, config[:port])
      # The timeout has to be passed in the options hash; a bare positional
      # value is interpreted by RestClient::Resource as the legacy `user`
      # argument and the timeout would never be applied.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      compare = get_check_diff(get_slaves(response))
      diff = compare['diff']

      if crit > 0 && diff >= crit
        critical "There is a Memory usage diff of #{diff} bigger than #{crit} #{compare['msg']}"
      end
      if warn_threshold > 0 && diff >= warn_threshold
        warning "There is a Memory usage diff of #{diff} bigger than #{warn_threshold} #{compare['msg']}"
      end
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [String] URL of the request after following the /redirect endpoint
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses the JSON document returned by the slaves endpoint.
  # @param data [String] Server response
  # @return [Object] value stored under the 'slaves' key of the response
  # @raise [RuntimeError] when the response is not valid JSON or the key is missing
  def get_slaves(data)
    begin
      slaves = JSON.parse(data)[MesosMemBalanceCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if slaves.nil?
      raise "No metrics for [#{MesosMemBalanceCheck.metrics_name}] in server response: #{data}"
    end

    slaves
  end

  # Computes the spread between the highest and the lowest memory usage.
  # @param slavelist [Array<Hash>] slave descriptions, each with 'hostname',
  #   'resources' and 'used_resources' entries
  # @return [Hash] 'diff' => max usage minus min usage (percent),
  #   'msg' => human readable summary naming both hosts
  def get_check_diff(slavelist)
    usages = slavelist.each_with_object({}) do |slaveinfo, acc|
      total = slaveinfo['resources'][CHECK_TYPE]
      # A slave advertising no memory cannot be expressed as a percentage
      # (division by zero), so it is left out of the comparison.
      next if total.nil? || total.zero?

      acc[slaveinfo['hostname']] = slaveinfo['used_resources'][CHECK_TYPE] * 100 / total
    end

    # Without any usable slave there is nothing to compare; report no spread
    # instead of failing on a nil lookup.
    return { 'diff' => 0, 'msg' => 'No slaves with memory resources found' } if usages.empty?

    (low_host, low), (high_host, high) = usages.minmax_by { |_hostname, percent| percent }
    {
      'diff' => high - low,
      'msg' => "Hostname #{low_host} uses #{low}% and Hostname #{high_host} uses #{high}%"
    }
  end
end
@@ -0,0 +1,182 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-mesos-running-tasks
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that there are running tasks on a mesos cluster
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ # gem: json
18
+ #
19
+ # USAGE:
20
+ # #YELLOW
21
+ #
22
+ # NOTES:
23
+ #
24
+ # LICENSE:
25
+ # Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
26
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
27
+ # for details.
28
+ #
29
+
30
+ require 'sensu-plugin/check/cli'
31
+ require 'rest-client'
32
+ require 'json'
33
+ require 'daybreak'
34
+
35
# Sensu check that reads the number of running tasks from the metrics
# snapshot of the leading Mesos master and compares it (or its change since
# the previous run) against a value or a range.
class MesosRunningTaskCheck < Sensu::Plugin::Check::CLI
  check_name 'CheckMesosRunningTask'
  # Key of the metric inside the JSON document returned by the endpoint.
  @metrics_name = 'master/tasks_running'.freeze

  class << self
    attr_reader :metrics_name
  end

  option :server,
         description: 'Mesos server',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'port (default 5050)',
         short: '-p PORT',
         long: '--port PORT',
         default: 5050,
         required: false

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/metrics/snapshot'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  option :mode,
         description: 'eq ne lt gt or rg',
         short: '-m MODE',
         long: '--mode MODE',
         required: true

  # The key was previously misspelled (`derfault`), so the lower bound had no
  # default at all.
  option :min,
         description: 'min value on range',
         short: '-l VALUE',
         long: '--low VALUE',
         required: false,
         proc: proc(&:to_i),
         default: 0

  option :max,
         description: 'max value on range',
         short: '-h VALUE',
         long: '--high VALUE',
         required: false,
         proc: proc(&:to_i),
         default: 1

  option :value,
         description: 'value to check against',
         short: '-v VALUE',
         long: '--value VALUE',
         proc: proc(&:to_i),
         default: 0,
         required: false

  option :delta,
         short: '-d',
         long: '--delta',
         description: 'Use this flag to compare the metric with the previously retrieved value',
         boolean: true

  # Entry point invoked by the Sensu plugin runner.
  #
  # Resolves the leading master, fetches the metric and evaluates it with
  # #check_mesos_tasks; reports ok when no condition triggered.
  def run
    server = config[:server]

    begin
      server = get_leader_url(server, config[:port])
      # The timeout has to be passed in the options hash; a bare positional
      # value is interpreted by RestClient::Resource as the legacy `user`
      # argument and the timeout would never be applied.
      response = RestClient::Resource.new("#{server}#{config[:uri]}", timeout: config[:timeout]).get
      metric_value = check_tasks(response)
      check_mesos_tasks(metric_value, config[:mode], config[:value], config[:min], config[:max])
    rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
      unknown "Mesos #{server} is not responding"
    rescue RestClient::RequestTimeout
      unknown "Mesos #{server} connection timed out"
    end
    ok "Found #{metric_value} tasks running"
  end

  # Redirects server call to discover the Leader
  # @param server [String] Server address
  # @param port [Number] api port
  # @return [String] URL of the request after following the /redirect endpoint
  def get_leader_url(server, port)
    RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
  end

  # Parses JSON data as returned from Mesos API
  # @param data [String] Server response
  # @return [Integer] Number of running tasks, rounded
  # @raise [RuntimeError] when the response is not valid JSON or the metric is missing
  def check_tasks(data)
    begin
      running_tasks = JSON.parse(data)[MesosRunningTaskCheck.metrics_name]
    rescue JSON::ParserError
      raise "Could not parse JSON response: #{data}"
    end

    if running_tasks.nil?
      raise "No tasks in server response: #{data}"
    end

    running_tasks.round
  end

  # Reports critical when the metric satisfies the condition selected by mode.
  #
  # With the --delta flag the metric is first replaced by its change since the
  # previously stored value.
  #
  # @param metric_value [Integer] current number of running tasks
  # @param mode [String] 'eq', 'ne', 'lt', 'gt' (compare with value) or
  #   'rg' (critical when outside min..max)
  # @param value [Integer] reference for eq/ne/lt/gt
  # @param min [Integer] lower bound for rg
  # @param max [Integer] upper bound for rg
  def check_mesos_tasks(metric_value, mode, value, min, max)
    metric_value = delta_since_last_run(metric_value) if config[:delta]

    case mode
    when 'eq'
      # `==` compares numerically; `equal?` would test object identity.
      critical "The number of running tasks cluster is equal to #{value}!" if metric_value == value
    when 'ne'
      critical "The number of running tasks cluster is not equal to #{value}!" if metric_value != value
    when 'lt'
      critical "The number of running tasks cluster is lower than #{value}!" if metric_value < value
    when 'gt'
      critical "The number of running tasks cluster is greater than #{value}!" if metric_value > value
    when 'rg'
      unless (min.to_i..max.to_i).cover? metric_value
        critical "The number of running tasks in cluster is not in #{min} - #{max} value range!"
      end
    else
      # An unrecognised mode must not silently pass as a healthy result.
      unknown "Unknown mode '#{mode}', expected one of: eq ne lt gt rg"
    end
  end

  private

  # Stores the current value in the Daybreak database and returns the
  # difference to the value stored by the previous run (0 when none exists).
  # @param current [Integer] current number of running tasks
  # @return [Integer] current minus previously stored value
  def delta_since_last_run(current)
    db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
    previous = nil
    begin
      # Read and write under the same lock so that concurrent runs cannot
      # interleave between fetching the old value and storing the new one.
      db.lock do
        previous = db['task_running']
        db['task_running'] = current
      end
      db.flush
      db.compact
    ensure
      # Close the database even when one of the steps above raises.
      db.close
    end
    current - previous
  end
end
@@ -29,10 +29,6 @@
29
29
  require 'sensu-plugin/check/cli'
30
30
  require 'rest-client'
31
31
 
32
- # Mesos default ports are defined here: http://mesos.apache.org/documentation/latest/configuration
33
- MASTER_DEFAULT_PORT = '5050'.freeze
34
- SLAVE_DEFAULT_PORT = '5051'.freeze
35
-
36
32
  class MesosNodeStatus < Sensu::Plugin::Check::CLI
37
33
  option :server,
38
34
  description: 'Mesos servers, comma separated',
@@ -40,18 +36,19 @@ class MesosNodeStatus < Sensu::Plugin::Check::CLI
40
36
  long: '--server SERVER1,SERVER2,...',
41
37
  default: 'localhost'
42
38
 
43
- option :mode,
44
- description: 'master or slave',
45
- short: '-m MODE',
46
- long: '--mode MODE',
47
- required: true
48
-
49
39
  option :port,
50
- description: "port (default #{MASTER_DEFAULT_PORT} for master, #{SLAVE_DEFAULT_PORT} for slave)",
40
+ description: 'port (default 5050, use 5051 for slaves)',
51
41
  short: '-p PORT',
52
42
  long: '--port PORT',
43
+ default: 5050,
53
44
  required: false
54
45
 
46
+ option :uri,
47
+ description: 'Endpoint URI',
48
+ short: '-u URI',
49
+ long: '--uri URI',
50
+ default: '/health'
51
+
55
52
  option :timeout,
56
53
  description: 'timeout in seconds',
57
54
  short: '-t TIMEOUT',
@@ -60,31 +57,24 @@ class MesosNodeStatus < Sensu::Plugin::Check::CLI
60
57
  default: 5
61
58
 
62
59
  def run
63
- mode = config[:mode]
64
60
  servers = config[:server]
65
- case mode
66
- when 'master'
67
- port = config[:port] || MASTER_DEFAULT_PORT
68
- uri = '/master/health'
69
- when 'slave'
70
- port = config[:port] || SLAVE_DEFAULT_PORT
71
- uri = '/slave(1)/health'
72
- end
61
+ uri = config[:uri]
62
+ port = config[:port]
73
63
  failures = []
74
64
  servers.split(',').each do |server|
75
65
  begin
76
66
  r = RestClient::Resource.new("http://#{server}:#{port}#{uri}", timeout: config[:timeout]).get
77
67
  if r.code != 200
78
- failures << "#{config[:mode]} on #{server} is not responding"
68
+ failures << "Mesos on #{server} is not responding"
79
69
  end
80
70
  rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
81
- failures << "Mesos #{mode} on #{server} is not responding"
71
+ failures << "Mesos on #{server} is not responding"
82
72
  rescue RestClient::RequestTimeout
83
- failures << "Mesos #{mode} on #{server} connection timed out"
73
+ failures << "Mesos on #{server} connection timed out"
84
74
  end
85
75
  end
86
76
  if failures.empty?
87
- ok "Mesos #{mode} is running on #{servers}"
77
+ ok "Mesos is running on #{servers}"
88
78
  else
89
79
  critical failures.join("\n")
90
80
  end
@@ -0,0 +1,80 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # check-metronome
4
+ #
5
+ # DESCRIPTION:
6
+ # This plugin checks that Metronome can query the existing job graph.
7
+ #
8
+ # OUTPUT:
9
+ # plain text
10
+ #
11
+ # PLATFORMS:
12
+ # Linux
13
+ #
14
+ # DEPENDENCIES:
15
+ # gem: sensu-plugin
16
+ # gem: rest-client
17
+ #
18
+ # USAGE:
19
+ #
20
+ #
21
+ # NOTES:
22
+ #
23
+ # LICENSE:
24
+ # Copyright 2017, PTC (www.ptc.com)
25
+ # Released under the same terms as Sensu (the MIT license); see LICENSE
26
+ # for details.
27
+ #
28
+
29
+ require 'sensu-plugin/check/cli'
30
+ require 'rest-client'
31
+
32
# Sensu check that issues an HTTP GET against every configured Metronome host
# and reports critical when at least one of them fails to answer with 200.
class MetronomeNodeStatus < Sensu::Plugin::Check::CLI
  option :server,
         description: 'Metronome hosts, comma separated',
         short: '-s SERVER',
         long: '--server SERVER',
         default: 'localhost'

  option :port,
         description: 'Metronome port',
         short: '-p PORT',
         long: '--port PORT',
         default: '9942'

  option :uri,
         description: 'Endpoint URI',
         short: '-u URI',
         long: '--uri URI',
         default: '/v1/jobs'

  option :timeout,
         description: 'timeout in seconds',
         short: '-t TIMEOUT',
         long: '--timeout TIMEOUT',
         proc: proc(&:to_i),
         default: 5

  # Entry point invoked by the Sensu plugin runner.
  #
  # Queries each host of the comma separated server list in turn and collects
  # one message per host that did not respond as expected.
  def run
    hosts = config[:server]
    endpoint = config[:uri]

    problems = hosts.split(',').each_with_object([]) do |host, collected|
      begin
        target = "http://#{host}:#{config[:port]}#{endpoint}"
        response = RestClient::Resource.new(target, timeout: config[:timeout]).get
        collected << "Metronome on #{host} is not responding" unless response.code == 200
      rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
        collected << "Metronome on #{host} is not responding"
      rescue RestClient::RequestTimeout
        collected << "Metronome on #{host} connection timed out"
      end
    end

    if problems.any?
      critical problems.join("\n")
    else
      ok "Metronome is running on #{hosts}"
    end
  end
end