riemann-tools-dgvz 0.2.2.1 → 0.2.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/riemann-rabbitmq +154 -7
- data/bin/riemann-riak +32 -3
- metadata +1 -1
data/bin/riemann-rabbitmq
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
3
|
|
4
|
-
require
|
4
|
+
require 'riemann/tools'
|
5
5
|
|
6
6
|
class Riemann::Tools::Rabbitmq
|
7
7
|
include Riemann::Tools
|
@@ -16,11 +16,28 @@ class Riemann::Tools::Rabbitmq
|
|
16
16
|
|
17
17
|
opt :monitor_user, 'RabbitMQ monitoring user', type: :string
|
18
18
|
opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
|
19
|
-
opt :monitor_port, 'RabbitMQ monitoring port', default: 15672
|
20
|
-
opt :monitor_host, 'RabbitMQ monitoring host', default: "localhost"
|
19
|
+
opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
|
20
|
+
opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
|
23
|
+
opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
|
24
|
+
|
25
|
+
opt :node, "Specify a node to monitor", type: :strings
|
26
|
+
|
27
|
+
# Root URL of the RabbitMQ monitoring API, assembled from the monitor_*
# command-line options.
#
# NOTE(review): the user and password are interpolated verbatim, so values
# containing URL-reserved characters (e.g. "@", "/", ":") presumably have to
# be supplied already escaped -- confirm against how callers pass them.
def base_url
  credentials = "#{options[:monitor_user]}:#{options[:monitor_pass]}"
  endpoint = "#{options[:monitor_host]}:#{options[:monitor_port]}"
  "http://#{credentials}@#{endpoint}/api"
end
|
30
|
+
|
31
|
+
# URL of the API's "overview" resource, relative to base_url.
def overview_url
  [base_url, "overview"].join("/")
end
|
34
|
+
|
35
|
+
# URL of the API resource for a single node.
#
# n - the node name; it is appended to the path as-is.
def node_url(n)
  format("%s/nodes/%s", base_url, n)
end
|
38
|
+
|
39
|
+
# URL of the API's "queues" resource, relative to base_url.
def queues_url
  base_url + "/queues"
end
|
25
42
|
|
26
43
|
def event_host
|
@@ -50,8 +67,86 @@ class Riemann::Tools::Rabbitmq
|
|
50
67
|
response
|
51
68
|
end
|
52
69
|
|
53
|
-
def
|
54
|
-
|
70
|
+
# Fetches the queue list from queues_url and reports on it.
#
# Events emitted through `report`:
# * "rabbitmq.queue" -- "critical" when the HTTP status is not 200, "ok"
#   otherwise.
# * "rabbitmq.queue.<vhost>.<name>" -- "critical" when a queue has ready
#   messages but no consumers, or holds more ready messages than
#   options[:max_queue_size] (queues whose name matches
#   options[:ignore_max_size_queues] are exempt from the size check).
# * "rabbitmq.queue.<vhost>.<name>.<stat>" -- one metric event per statistic.
#
# Nothing is reported here when safe_get returns nil.
def check_queues
  response = safe_get(queues_url, event_host)
  return if response.nil?

  # Check the status before touching the body: an error response is not
  # guaranteed to be JSON, and parsing it first would raise instead of
  # producing the critical event below.
  if response.status != 200
    report(:host => event_host,
           :service => "rabbitmq.queue",
           :state => "critical",
           :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}")
    return
  end

  report(:host => event_host,
         :service => "rabbitmq.queue",
         :state => "ok",
         :description => "HTTP connection ok")

  ignore_pattern = options[:ignore_max_size_queues]
  max_size_check_filter = ignore_pattern ? Regexp.new(ignore_pattern) : nil

  JSON.parse(response.body).each do |queue|
    svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
    # A queue entry may lack counters; treat a missing count as zero rather
    # than raising on a nil comparison.
    ready = queue['messages_ready'] || 0
    errs = []

    if ready > 0 && queue['consumers'] == 0
      errs << "Queue has jobs but no consumers"
    end

    size_checked = max_size_check_filter.nil? || queue['name'] !~ max_size_check_filter
    if size_checked && ready > options[:max_queue_size]
      errs << "Queue has #{ready} jobs"
    end

    unless errs.empty?
      report(:host => event_host,
             :service => svc,
             :state => "critical",
             :description => errs.join("; "))
    end

    stats = (queue['message_stats'] || {}).merge(
      'messages' => queue['messages'],
      'messages_details' => queue['messages_details'],
      'messages_ready' => queue['messages_ready'],
      'messages_ready_details' => queue['messages_ready_details'],
      'messages_unacknowledged' => queue['messages_unacknowledged'],
      'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
      'consumers' => queue['consumers'],
      'memory' => queue['memory']
    )

    stats.each_pair do |k, v|
      # "*_details" entries are hashes whose 'rate' is the metric; every
      # other entry is the metric itself. An absent details hash yields a
      # nil metric, the same as an absent plain counter.
      metric = k.end_with?('details') ? (v && v['rate']) : v

      # TODO: Set state via thresholds which can be configured

      report(:host => event_host,
             :service => "#{svc}.#{k}",
             :metric => metric,
             :description => "RabbitMQ monitor")
    end
  end
end
|
147
|
+
|
148
|
+
def check_overview
|
149
|
+
uri = URI(overview_url)
|
55
150
|
response = safe_get(uri, event_host)
|
56
151
|
|
57
152
|
return if response.nil?
|
@@ -95,5 +190,57 @@ class Riemann::Tools::Rabbitmq
|
|
95
190
|
end
|
96
191
|
end
|
97
192
|
end
|
193
|
+
|
194
|
+
# Checks every node named in opts[:node] and reports a "critical" event on
# service "rabbitmq.node.<name>" when the node is missing (HTTP 404), the
# request fails with another non-200 status, or the node's JSON shows a
# memory or disk-free alarm. At most one event is emitted per node; a memory
# alarm takes precedence over a disk-free alarm. Healthy nodes emit nothing.
def check_node
  opts[:node].each do |n|
    response = safe_get(URI(node_url(n)), event_host)

    # Stop entirely when safe_get yields nothing: every node is queried
    # through the same base URL, so presumably the remaining requests would
    # fail the same way -- TODO confirm against safe_get.
    return if response.nil?

    description =
      if response.status == 404
        "Node was not found in the cluster"
      elsif response.status != 200
        "HTTP error: #{response.status} - #{response.body}"
      else
        json = JSON.parse(response.body)
        if json['mem_alarm']
          "Memory alarm has triggered; job submission throttled"
        elsif json['disk_free_alarm']
          "Disk free alarm has triggered; job submission throttled"
        end
      end

    # Move on to the next node rather than leaving the method, so that a
    # problem on one node does not hide the state of the nodes after it.
    next if description.nil?

    report(:host => event_host,
           :service => "rabbitmq.node.#{n}",
           :state => "critical",
           :description => description)
  end
end
|
239
|
+
|
240
|
+
# One monitoring pass: the overview check, then the per-node check (only
# when the :node option was supplied), then the queue check.
def tick
  check_overview
  if opts[:node]
    check_node
  end
  check_queues
end
|
98
245
|
end
|
99
246
|
Riemann::Tools::Rabbitmq.run
|
data/bin/riemann-riak
CHANGED
@@ -27,8 +27,8 @@ class Riemann::Tools::Riak
|
|
27
27
|
|
28
28
|
def initialize
|
29
29
|
detect_features
|
30
|
-
|
31
|
-
@httpstatus = true
|
30
|
+
|
31
|
+
@httpstatus = true
|
32
32
|
# What's going on here? --aphyr
|
33
33
|
if
|
34
34
|
begin
|
@@ -59,7 +59,7 @@ class Riemann::Tools::Riak
|
|
59
59
|
def detect_features
|
60
60
|
@escript = true # Whether escript is present on this machine
|
61
61
|
@riakadmin = true # Whether riak-admin is present
|
62
|
-
|
62
|
+
|
63
63
|
if `which escript` =~ /^\s*$/
|
64
64
|
@escript = false
|
65
65
|
end
|
@@ -113,6 +113,34 @@ class Riemann::Tools::Riak
|
|
113
113
|
end
|
114
114
|
end
|
115
115
|
|
116
|
+
# Runs `riak-admin transfers` and reports the "riak transfers" service:
# "critical" with the partition count as the metric when the output says
# this node (opts[:node_name]) is waiting to hand off partitions, "ok" with
# metric 0 otherwise. Does nothing when riak-admin is flagged as unavailable.
def check_transfers
  return unless @riakadmin

  str = `riak-admin transfers`

  # Escape the node name so that regexp metacharacters in it -- notably the
  # dots in names such as "riak@10.0.0.1" -- only match literally.
  node = Regexp.escape(opts[:node_name].to_s)
  pending = str.match(/'#{node}' waiting to handoff (\d+) partitions/)

  if pending
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'critical',
      :metric => pending[1].to_i,
      :description => "waiting to handoff #{pending[1]} partitions"
    )
  else
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'ok',
      :metric => 0,
      :description => "No pending transfers"
    )
  end
end
|
143
|
+
|
116
144
|
def check_disk
|
117
145
|
gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
|
118
146
|
report(
|
@@ -294,6 +322,7 @@ class Riemann::Tools::Riak
|
|
294
322
|
check_stats
|
295
323
|
check_ring
|
296
324
|
check_disk
|
325
|
+
check_transfers
|
297
326
|
end
|
298
327
|
end
|
299
328
|
|