riemann-tools 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +11 -0
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/codeql-analysis.yml +72 -0
- data/.rubocop.yml +32 -0
- data/CHANGELOG.md +31 -2
- data/README.markdown +8 -24
- data/Rakefile +4 -2
- data/SECURITY.md +42 -0
- data/bin/riemann-apache-status +92 -78
- data/bin/riemann-bench +54 -49
- data/bin/riemann-cloudant +44 -40
- data/bin/riemann-consul +82 -76
- data/bin/riemann-dir-files-count +53 -47
- data/bin/riemann-dir-space +53 -47
- data/bin/riemann-diskstats +78 -75
- data/bin/riemann-fd +68 -48
- data/bin/riemann-freeswitch +108 -103
- data/bin/riemann-haproxy +46 -40
- data/bin/riemann-health +4 -343
- data/bin/riemann-kvminstance +18 -13
- data/bin/riemann-memcached +35 -29
- data/bin/riemann-net +4 -104
- data/bin/riemann-nginx-status +74 -67
- data/bin/riemann-ntp +4 -33
- data/bin/riemann-portcheck +40 -31
- data/bin/riemann-proc +96 -90
- data/bin/riemann-varnish +51 -45
- data/bin/riemann-zookeeper +38 -34
- data/lib/riemann/tools/health.rb +347 -0
- data/lib/riemann/tools/net.rb +104 -0
- data/lib/riemann/tools/ntp.rb +41 -0
- data/lib/riemann/tools/version.rb +1 -1
- data/lib/riemann/tools.rb +37 -40
- data/riemann-tools.gemspec +4 -1
- data/tools/riemann-aws/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-aws/bin/riemann-aws-billing +72 -66
- data/tools/riemann-aws/bin/riemann-aws-rds-status +55 -41
- data/tools/riemann-aws/bin/riemann-aws-sqs-status +37 -31
- data/tools/riemann-aws/bin/riemann-aws-status +63 -51
- data/tools/riemann-aws/bin/riemann-elb-metrics +149 -148
- data/tools/riemann-aws/bin/riemann-s3-list +70 -65
- data/tools/riemann-aws/bin/riemann-s3-status +85 -82
- data/tools/riemann-chronos/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-chronos/bin/riemann-chronos +136 -119
- data/tools/riemann-docker/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-docker/bin/riemann-docker +163 -174
- data/tools/riemann-elasticsearch/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-elasticsearch/bin/riemann-elasticsearch +155 -147
- data/tools/riemann-marathon/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-marathon/bin/riemann-marathon +138 -122
- data/tools/riemann-mesos/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-mesos/bin/riemann-mesos +125 -110
- data/tools/riemann-munin/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-munin/bin/riemann-munin +28 -22
- data/tools/riemann-rabbitmq/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-rabbitmq/bin/riemann-rabbitmq +226 -222
- data/tools/riemann-riak/{Rakefile.rb → Rakefile} +2 -0
- data/tools/riemann-riak/bin/riemann-riak +281 -289
- data/tools/riemann-riak/riak_status/riak_status.rb +39 -39
- metadata +65 -16
@@ -1,269 +1,273 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
# frozen_string_literal: true
|
3
3
|
|
4
|
-
|
4
|
+
Process.setproctitle($PROGRAM_NAME)
|
5
5
|
|
6
|
-
|
7
|
-
include Riemann::Tools
|
6
|
+
require 'riemann/tools'
|
8
7
|
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
module Riemann
|
9
|
+
module Tools
|
10
|
+
class Rabbitmq
|
11
|
+
include Riemann::Tools
|
12
12
|
|
13
|
+
require 'faraday'
|
14
|
+
require 'json'
|
15
|
+
require 'uri'
|
13
16
|
|
14
|
-
|
15
|
-
|
17
|
+
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
|
18
|
+
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
|
16
19
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
20
|
+
opt :monitor_user, 'RabbitMQ monitoring user', type: :string
|
21
|
+
opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
|
22
|
+
opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15_672
|
23
|
+
opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: 'localhost'
|
24
|
+
opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false
|
22
25
|
|
23
|
-
|
24
|
-
|
26
|
+
opt :max_queue_size, 'max number of items in a queue that is acceptable', type: :int, default: 1_000_000
|
27
|
+
opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
|
25
28
|
|
26
|
-
|
29
|
+
opt :node, 'Specify a node to monitor', type: :strings
|
27
30
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
"#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
|
34
|
-
end
|
31
|
+
def base_url
|
32
|
+
protocol = 'http'
|
33
|
+
protocol = 'https' if options[:monitor_use_tls] && (options[:monitor_use_tls] == true)
|
34
|
+
"#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
|
35
|
+
end
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
37
|
+
def overview_url
|
38
|
+
"#{base_url}/overview"
|
39
|
+
end
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
def node_url(node)
|
42
|
+
"#{base_url}/nodes/#{node}"
|
43
|
+
end
|
43
44
|
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
def queues_url
|
46
|
+
"#{base_url}/queues"
|
47
|
+
end
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
else
|
52
|
-
return options[:monitor_host]
|
53
|
-
end
|
54
|
-
end
|
49
|
+
def event_host
|
50
|
+
options[:event_host] || :monitor_host
|
51
|
+
end
|
55
52
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
53
|
+
def safe_get(uri, event_host)
|
54
|
+
# Handle connection timeouts
|
55
|
+
response = nil
|
56
|
+
begin
|
57
|
+
connection = Faraday.new(uri)
|
58
|
+
response = connection.get do |req|
|
59
|
+
req.options[:timeout] = options[:read_timeout]
|
60
|
+
req.options[:open_timeout] = options[:open_timeout]
|
61
|
+
end
|
62
|
+
report(
|
63
|
+
host: event_host,
|
64
|
+
service: 'rabbitmq monitoring',
|
65
|
+
state: 'ok',
|
66
|
+
description: 'Monitoring operational',
|
67
|
+
)
|
68
|
+
rescue StandardError => e
|
69
|
+
report(
|
70
|
+
host: event_host,
|
71
|
+
service: 'rabbitmq monitoring',
|
72
|
+
state: 'critical',
|
73
|
+
description: "HTTP connection error: #{e.class} - #{e.message}",
|
74
|
+
)
|
64
75
|
end
|
65
|
-
|
66
|
-
:service => "rabbitmq monitoring",
|
67
|
-
:state => 'ok',
|
68
|
-
:description => "Monitoring operational"
|
69
|
-
)
|
70
|
-
rescue => e
|
71
|
-
report(:host => event_host,
|
72
|
-
:service => "rabbitmq monitoring",
|
73
|
-
:state => "critical",
|
74
|
-
:description => "HTTP connection error: #{e.class} - #{e.message}"
|
75
|
-
)
|
76
|
+
response
|
76
77
|
end
|
77
|
-
response
|
78
|
-
end
|
79
|
-
|
80
|
-
def check_queues
|
81
|
-
response = safe_get(queues_url, event_host)
|
82
|
-
max_size_check_filter = if options[:ignore_max_size_queues]
|
83
|
-
Regexp.new(options[:ignore_max_size_queues])
|
84
|
-
else
|
85
|
-
nil
|
86
|
-
end
|
87
78
|
|
88
|
-
|
79
|
+
def check_queues
|
80
|
+
response = safe_get(queues_url, event_host)
|
81
|
+
max_size_check_filter = (Regexp.new(options[:ignore_max_size_queues]) if options[:ignore_max_size_queues])
|
89
82
|
|
90
|
-
|
83
|
+
return if response.nil?
|
91
84
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
85
|
+
if response.status != 200
|
86
|
+
report(
|
87
|
+
host: event_host,
|
88
|
+
service: 'rabbitmq.queue',
|
89
|
+
state: 'critical',
|
90
|
+
description: "HTTP connection error to /api/queues: #{response.status} - #{response.body}",
|
91
|
+
)
|
92
|
+
else
|
93
|
+
report(
|
94
|
+
host: event_host,
|
95
|
+
service: 'rabbitmq.queue',
|
96
|
+
state: 'ok',
|
97
|
+
description: 'HTTP connection ok',
|
98
|
+
)
|
104
99
|
|
105
|
-
|
100
|
+
json = JSON.parse(response.body)
|
101
|
+
|
102
|
+
json.each do |queue|
|
103
|
+
svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
|
104
|
+
errs = []
|
105
|
+
|
106
|
+
errs << 'Queue has jobs but no consumers' if !queue['messages_ready'].nil? && (queue['messages_ready']).positive? && (queue['consumers']).zero?
|
107
|
+
|
108
|
+
errs << "Queue has #{queue['messages_ready']} jobs" if (max_size_check_filter.nil? || queue['name'] !~ (max_size_check_filter)) && !queue['messages_ready'].nil? && (queue['messages_ready'] > options[:max_queue_size])
|
109
|
+
|
110
|
+
if errs.empty?
|
111
|
+
report(
|
112
|
+
host: event_host,
|
113
|
+
service: svc,
|
114
|
+
state: 'ok',
|
115
|
+
description: 'Queue is looking good',
|
116
|
+
)
|
117
|
+
else
|
118
|
+
report(
|
119
|
+
host: event_host,
|
120
|
+
service: svc,
|
121
|
+
state: 'critical',
|
122
|
+
description: errs.join('; '),
|
123
|
+
)
|
124
|
+
end
|
125
|
+
|
126
|
+
stats = (queue['message_stats'] || {}).merge(
|
127
|
+
'messages' => queue['messages'],
|
128
|
+
'messages_details' => queue['messages_details'],
|
129
|
+
'messages_ready' => queue['messages_ready'],
|
130
|
+
'messages_ready_details' => queue['messages_ready_details'],
|
131
|
+
'messages_unacknowledged' => queue['messages_unacknowledged'],
|
132
|
+
'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
|
133
|
+
'consumers' => queue['consumers'],
|
134
|
+
'memory' => queue['memory'],
|
135
|
+
)
|
136
|
+
|
137
|
+
stats.each_pair do |k, v|
|
138
|
+
service = "#{svc}.#{k}"
|
139
|
+
metric = if k =~ (/details$/) && !v.nil?
|
140
|
+
v['rate']
|
141
|
+
else
|
142
|
+
v
|
143
|
+
end
|
144
|
+
|
145
|
+
# TODO: Set state via thresholds which can be configured
|
146
|
+
|
147
|
+
report(
|
148
|
+
host: event_host,
|
149
|
+
service: service,
|
150
|
+
metric: metric,
|
151
|
+
description: 'RabbitMQ monitor',
|
152
|
+
)
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
106
157
|
|
107
|
-
|
108
|
-
|
109
|
-
|
158
|
+
def check_overview
|
159
|
+
uri = URI(overview_url)
|
160
|
+
response = safe_get(uri, event_host)
|
110
161
|
|
111
|
-
if
|
112
|
-
errs << "Queue has jobs but no consumers"
|
113
|
-
end
|
162
|
+
return if response.nil?
|
114
163
|
|
115
|
-
|
116
|
-
errs << "Queue has #{queue['messages_ready']} jobs"
|
117
|
-
end
|
164
|
+
json = JSON.parse(response.body)
|
118
165
|
|
119
|
-
if
|
120
|
-
report(
|
121
|
-
|
122
|
-
|
123
|
-
|
166
|
+
if response.status != 200
|
167
|
+
report(
|
168
|
+
host: event_host,
|
169
|
+
service: 'rabbitmq',
|
170
|
+
state: 'critical',
|
171
|
+
description: "HTTP connection error: #{response.status} - #{response.body}",
|
124
172
|
)
|
125
173
|
else
|
126
|
-
report(
|
127
|
-
|
128
|
-
|
129
|
-
|
174
|
+
report(
|
175
|
+
host: event_host,
|
176
|
+
service: 'rabbitmq monitoring',
|
177
|
+
state: 'ok',
|
178
|
+
description: 'HTTP connection ok',
|
130
179
|
)
|
131
|
-
end
|
132
180
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
181
|
+
%w[message_stats queue_totals object_totals].each do |stat|
|
182
|
+
# NOTE: / BUG ?
|
183
|
+
# Brand new servers can have blank message stats. Is this ok?
|
184
|
+
# I can't decide.
|
185
|
+
next if json[stat].empty?
|
186
|
+
|
187
|
+
json[stat].each_pair do |k, v|
|
188
|
+
service = "rabbitmq.#{stat}.#{k}"
|
189
|
+
metric = if k =~ /details$/
|
190
|
+
v['rate']
|
191
|
+
else
|
192
|
+
v
|
193
|
+
end
|
194
|
+
|
195
|
+
# TODO: Set state via thresholds which can be configured
|
196
|
+
|
197
|
+
report(
|
198
|
+
host: event_host,
|
199
|
+
service: service,
|
200
|
+
metric: metric,
|
201
|
+
description: 'RabbitMQ monitor',
|
202
|
+
)
|
203
|
+
end
|
150
204
|
end
|
151
|
-
|
152
|
-
# TODO: Set state via thresholds which can be configured
|
153
|
-
|
154
|
-
report(:host => event_host,
|
155
|
-
:service => service,
|
156
|
-
:metric => metric,
|
157
|
-
:description => "RabbitMQ monitor"
|
158
|
-
)
|
159
205
|
end
|
160
206
|
end
|
161
|
-
end
|
162
|
-
end
|
163
207
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
# I can't decide.
|
189
|
-
next if json[stat].empty?
|
190
|
-
json[stat].each_pair do |k,v|
|
191
|
-
service = "rabbitmq.#{stat}.#{k}"
|
192
|
-
if k =~ /details$/
|
193
|
-
metric = v['rate']
|
194
|
-
else
|
195
|
-
metric = v
|
208
|
+
def check_node
|
209
|
+
opts[:node].each do |n|
|
210
|
+
uri = URI(node_url(n))
|
211
|
+
response = safe_get(uri, event_host)
|
212
|
+
|
213
|
+
break if response.nil?
|
214
|
+
|
215
|
+
if response.status != 200
|
216
|
+
if response.status == 404
|
217
|
+
report(
|
218
|
+
host: event_host,
|
219
|
+
service: "rabbitmq.node.#{n}",
|
220
|
+
state: 'critical',
|
221
|
+
description: 'Node was not found in the cluster',
|
222
|
+
)
|
223
|
+
else
|
224
|
+
report(
|
225
|
+
host: event_host,
|
226
|
+
service: "rabbitmq.node.#{n}",
|
227
|
+
state: 'critical',
|
228
|
+
description: "HTTP error: #{response.status} - #{response.body}",
|
229
|
+
)
|
230
|
+
end
|
231
|
+
break
|
196
232
|
end
|
197
233
|
|
198
|
-
|
234
|
+
json = JSON.parse(response.body)
|
199
235
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
def check_node
|
211
|
-
opts[:node].each do |n|
|
212
|
-
uri = URI(node_url(n))
|
213
|
-
response = safe_get(uri, event_host)
|
236
|
+
if json['mem_alarm']
|
237
|
+
report(
|
238
|
+
host: event_host,
|
239
|
+
service: "rabbitmq.node.#{n}",
|
240
|
+
state: 'critical',
|
241
|
+
description: 'Memory alarm has triggered; job submission throttled',
|
242
|
+
)
|
243
|
+
break
|
244
|
+
end
|
214
245
|
|
215
|
-
|
246
|
+
if json['disk_free_alarm']
|
247
|
+
report(
|
248
|
+
host: event_host,
|
249
|
+
service: "rabbitmq.node.#{n}",
|
250
|
+
state: 'critical',
|
251
|
+
description: 'Disk free alarm has triggered; job submission throttled',
|
252
|
+
)
|
253
|
+
break
|
254
|
+
end
|
216
255
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
:
|
221
|
-
:
|
222
|
-
:description => "Node was not found in the cluster"
|
223
|
-
)
|
224
|
-
else
|
225
|
-
report(:host => event_host,
|
226
|
-
:service => "rabbitmq.node.#{n}",
|
227
|
-
:state => "critical",
|
228
|
-
:description => "HTTP error: #{response.status} - #{response.body}"
|
256
|
+
report(
|
257
|
+
host: event_host,
|
258
|
+
service: "rabbitmq.node.#{n}",
|
259
|
+
state: 'ok',
|
260
|
+
description: 'Node looks OK to me',
|
229
261
|
)
|
230
262
|
end
|
231
|
-
return
|
232
|
-
end
|
233
|
-
|
234
|
-
json = JSON.parse(response.body)
|
235
|
-
|
236
|
-
if json['mem_alarm']
|
237
|
-
report(:host => event_host,
|
238
|
-
:service => "rabbitmq.node.#{n}",
|
239
|
-
:state => "critical",
|
240
|
-
:description => "Memory alarm has triggered; job submission throttled"
|
241
|
-
)
|
242
|
-
return
|
243
263
|
end
|
244
264
|
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
:description => "Disk free alarm has triggered; job submission throttled"
|
250
|
-
)
|
251
|
-
return
|
265
|
+
def tick
|
266
|
+
check_overview
|
267
|
+
check_node if opts[:node]
|
268
|
+
check_queues
|
252
269
|
end
|
253
|
-
|
254
|
-
report(:host => event_host,
|
255
|
-
:service => "rabbitmq.node.#{n}",
|
256
|
-
:state => "ok",
|
257
|
-
:description => "Node looks OK to me"
|
258
|
-
)
|
259
270
|
end
|
260
271
|
end
|
261
|
-
|
262
|
-
def tick
|
263
|
-
check_overview
|
264
|
-
check_node if opts[:node]
|
265
|
-
check_queues
|
266
|
-
end
|
267
272
|
end
|
268
273
|
Riemann::Tools::Rabbitmq.run
|
269
|
-
|