riemann-tools 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
data/bin/riemann-rabbitmq DELETED
@@ -1,267 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.expand_path('../../lib/riemann/tools', __FILE__)
4
-
5
# Polls the RabbitMQ management HTTP API and forwards the results to Riemann.
#
# On every tick the tool queries /api/overview, /api/nodes/<name> for each
# node given with --node, and /api/queues, and reports one event per health
# check and per metric.
class Riemann::Tools::Rabbitmq
  include Riemann::Tools

  require 'faraday'
  require 'json'
  require 'uri'

  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1

  opt :monitor_user, 'RabbitMQ monitoring user', type: :string
  opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
  opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
  opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
  opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false

  opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
  opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string

  opt :node, "Specify a node to monitor", type: :strings

  # Root URL of the management API, with the monitoring credentials embedded.
  #
  # NOTE(review): user and password are interpolated verbatim, so they must
  # already be safe to place in the userinfo part of a URL.
  def base_url
    protocol = options[:monitor_use_tls] == true ? "https" : "http"
    "#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
  end

  # URL of the cluster-wide overview endpoint.
  def overview_url
    "#{base_url}/overview"
  end

  # URL of the endpoint describing the node named +n+.
  def node_url(n)
    "#{base_url}/nodes/#{n}"
  end

  # URL of the endpoint listing every queue.
  def queues_url
    "#{base_url}/queues"
  end

  # Host name attached to every event: the :event_host option when it is set,
  # otherwise the monitored RabbitMQ host.
  def event_host
    options[:event_host] || options[:monitor_host]
  end

  # Performs a GET on +uri+ using the configured read/open timeouts.
  #
  # Reports "rabbitmq monitoring" as ok when the request completes and as
  # critical when it raises.
  #
  # Returns the Faraday response, or nil when the request raised.
  def safe_get(uri, event_host)
    response = nil
    begin
      connection = Faraday.new(uri)
      response = connection.get do |req|
        req.options[:timeout] = options[:read_timeout]
        req.options[:open_timeout] = options[:open_timeout]
      end
      report(:host => event_host,
             :service => "rabbitmq monitoring",
             :state => 'ok',
             :description => "Monitoring operational")
    rescue => e
      report(:host => event_host,
             :service => "rabbitmq monitoring",
             :state => "critical",
             :description => "HTTP connection error: #{e.class} - #{e.message}")
    end
    response
  end

  # Fetches /api/queues and reports, for every queue, a health event plus one
  # event per statistic.
  def check_queues
    response = safe_get(queues_url, event_host)
    return if response.nil?

    # The body is parsed only after the status check: an error response is not
    # guaranteed to be JSON, and a parse failure here would prevent the
    # critical event below from being sent.
    if response.status != 200
      report(:host => event_host,
             :service => "rabbitmq.queue",
             :state => "critical",
             :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}")
      return
    end

    report(:host => event_host,
           :service => "rabbitmq.queue",
           :state => "ok",
           :description => "HTTP connection ok")

    pattern = options[:ignore_max_size_queues]
    size_check_exempt = pattern ? Regexp.new(pattern) : nil

    JSON.parse(response.body).each do |queue|
      report_queue(queue, size_check_exempt)
    end
  end

  # Fetches /api/overview and reports every entry of the message_stats,
  # queue_totals and object_totals sections as a metric.
  def check_overview
    response = safe_get(URI(overview_url), event_host)
    return if response.nil?

    # NOTE(review): the failure is reported under "rabbitmq" while the success
    # is reported under "rabbitmq monitoring"; kept as-is because consumers
    # may depend on these service names.
    if response.status != 200
      report(:host => event_host,
             :service => "rabbitmq",
             :state => "critical",
             :description => "HTTP connection error: #{response.status} - #{response.body}")
      return
    end

    report(:host => event_host,
           :service => "rabbitmq monitoring",
           :state => "ok",
           :description => "HTTP connection ok")

    json = JSON.parse(response.body)

    %w( message_stats queue_totals object_totals ).each do |stat|
      section = json[stat]
      # NOTE / BUG ?
      # Brand new servers can have blank message stats. Is this ok?
      # I can't decide.
      # A section that is missing altogether is skipped the same way.
      next if section.nil? || section.empty?

      section.each_pair do |k, v|
        # TODO: Set state via thresholds which can be configured
        report(:host => event_host,
               :service => "rabbitmq.#{stat}.#{k}",
               :metric => metric_value(k, v),
               :description => "RabbitMQ monitor")
      end
    end
  end

  # Reports the health of every node listed in the :node option.
  #
  # A node that cannot be reached, is missing or has an alarm raised only ends
  # the check for that node; the remaining nodes are still checked.
  def check_node
    options[:node].each do |n|
      response = safe_get(URI(node_url(n)), event_host)
      next if response.nil?

      if response.status == 404
        report_node(n, "critical", "Node was not found in the cluster")
        next
      elsif response.status != 200
        report_node(n, "critical", "HTTP error: #{response.status} - #{response.body}")
        next
      end

      json = JSON.parse(response.body)

      if json['mem_alarm']
        report_node(n, "critical", "Memory alarm has triggered; job submission throttled")
      elsif json['disk_free_alarm']
        report_node(n, "critical", "Disk free alarm has triggered; job submission throttled")
      else
        report_node(n, "ok", "Node looks OK to me")
      end
    end
  end

  # Runs every check once.
  def tick
    check_overview
    check_node if options[:node]
    check_queues
  end

  private

  # Value to submit as the metric for statistic +name+: the 'rate' entry for
  # "*_details" statistics that carry a value, the raw value otherwise.
  def metric_value(name, value)
    if name =~ /details$/ && !value.nil?
      value['rate']
    else
      value
    end
  end

  # Sends one "rabbitmq.node.<name>" event.
  def report_node(name, state, description)
    report(:host => event_host,
           :service => "rabbitmq.node.#{name}",
           :state => state,
           :description => description)
  end

  # Reports the health event and the per-statistic events for one entry of
  # the /api/queues listing. +size_check_exempt+ is a Regexp matching queue
  # names that skip the max-size check, or nil to check every queue.
  def report_queue(queue, size_check_exempt)
    svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
    ready = queue['messages_ready']
    errs = []

    unless ready.nil?
      if ready > 0 && queue['consumers'] == 0
        errs << "Queue has jobs but no consumers"
      end

      size_checked = size_check_exempt.nil? || queue['name'] !~ size_check_exempt
      if size_checked && ready > options[:max_queue_size]
        errs << "Queue has #{ready} jobs"
      end
    end

    if errs.empty?
      report(:host => event_host,
             :service => svc,
             :state => "ok",
             :description => "Queue is looking good")
    else
      report(:host => event_host,
             :service => svc,
             :state => "critical",
             :description => errs.join("; "))
    end

    stats = (queue['message_stats'] || {}).merge(
      'messages' => queue['messages'],
      'messages_details' => queue['messages_details'],
      'messages_ready' => queue['messages_ready'],
      'messages_ready_details' => queue['messages_ready_details'],
      'messages_unacknowledged' => queue['messages_unacknowledged'],
      'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
      'consumers' => queue['consumers'],
      'memory' => queue['memory']
    )

    stats.each_pair do |k, v|
      # TODO: Set state via thresholds which can be configured
      report(:host => event_host,
             :service => "#{svc}.#{k}",
             :metric => metric_value(k, v),
             :description => "RabbitMQ monitor")
    end
  end
end

Riemann::Tools::Rabbitmq.run
data/bin/riemann-resmon DELETED
@@ -1,103 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.expand_path('../../lib/riemann/tools', __FILE__)
4
-
5
# Polls a list of Resmon endpoints over HTTP and forwards every metric found
# in their XML output to Riemann.
class Riemann::Tools::Resmon
  include Riemann::Tools
  require 'nokogiri'
  require 'faraday'
  require 'uri'

  opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
  opt :fqdn, 'Use FQDN for event host'

  # Loads the list of Resmon URIs from the file named by :resmon_hostfile.
  #
  # Surrounding whitespace (including a trailing "\r" from CRLF files) is
  # stripped and blank lines are dropped, so they are never handed to URI().
  def initialize
    @hosts = File.read(options[:resmon_hostfile]).split("\n").map(&:strip).reject(&:empty?)
    super
  end

  # Work out the hostname to submit with the event: the full name when the
  # :fqdn option is set, otherwise only the part before the first dot.
  def get_event_host(host)
    return host if options[:fqdn]
    host.split('.')[0]
  end

  # Handles HTTP connections and GET requests safely.
  #
  # Returns the Faraday response, or nil after reporting a critical "resmon"
  # event when the request raised.
  def safe_get(uri, event_host)
    response = nil
    begin
      connection = Faraday.new(uri)
      response = connection.get do |req|
        req.options[:timeout] = options[:read_timeout]
        req.options[:open_timeout] = options[:open_timeout]
      end
    rescue => e
      report(:host => event_host,
             :service => "resmon",
             :state => "critical",
             :description => "HTTP connection error: #{e.class} - #{e.message}")
    end
    response
  end

  # Queries every configured host once and reports its metrics.
  def tick
    @hosts.each do |host|
      uri = URI(host)
      event_host = get_event_host(uri.host)

      response = safe_get(uri, event_host)
      next if response.nil?

      # Handle non-200 responses
      if response.status != 200
        report(:host => event_host,
               :service => "resmon",
               :state => "critical",
               :description => "HTTP connection error: #{response.status} - #{response.body}")
        next
      end

      report(:host => event_host,
             :service => "resmon",
             :state => "ok",
             :description => "Resmon connection ok")

      doc = Nokogiri::XML(response.body)
      doc.xpath('//ResmonResults/ResmonResult').each do |result|
        report_result(result, event_host)
      end
    end
  end

  private

  # Reports one event per <metric> child of a <ResmonResult> element, stamped
  # with the result's <last_update> value.
  def report_result(result, event_host)
    timestamp = result.xpath('last_update').first.text

    result.xpath('metric').each do |metric|
      event = {
        host: event_host,
        service: "#{result.attributes['module'].value}`#{result.attributes['service'].value}`#{metric.attributes['name'].value}",
        time: timestamp.to_i
      }

      # The metric's type attribute decides how its text is submitted:
      # as an integer metric, a float metric, or a description string.
      case metric.attributes['type'].value
      when /[iIlL]/
        event[:metric] = metric.text.to_i
      when 'n'
        event[:metric] = metric.text.to_f
      when 's'
        event[:description] = metric.text
      when '0'
        raise 'dunno what 0 is yet'
      end

      report(event)
    end
  end
end

Riemann::Tools::Resmon.run
data/bin/riemann-riak DELETED
@@ -1,329 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # Forwards information on a Riak node to Riemann.
4
-
5
- require File.expand_path('../../lib/riemann/tools', __FILE__)
6
-
7
- require 'net/http'
8
- require 'net/https'
9
- require 'yajl/json_gem'
10
-
11
# Forwards information on a Riak node to Riemann: request statistics, FSM
# latency percentiles, ring readiness, data-directory size and pending
# transfers.
class Riemann::Tools::Riak
  include Riemann::Tools

  opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
  opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
  opt :cookie, "Riak cookie to use", :default => "riak"

  opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
  opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
  opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
  opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
  opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
  opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000

  # Detects which stats mechanisms are usable and exports the Erlang cookie.
  def initialize
    detect_features

    # Probe the HTTP stats endpoint once. Only a raised error (connection
    # refused, timeout, ...) marks HTTP as unavailable; the response status is
    # not inspected here.
    @httpstatus = true
    begin
      fetch_stats_response
    rescue
      @httpstatus = false
    end

    # we're going to override the emulator setting to allow users to
    # dynamically input the cookie
    # this is done only once - hopefully it doesn't get overridden.
    ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
  end

  # Identifies whether escript and riak-admin are installed
  def detect_features
    # `which` prints nothing when the program is not found.
    @escript = (`which escript` !~ /^\s*$/)        # Whether escript is present on this machine
    @riakadmin = (`which riak-admin` !~ /^\s*$/)   # Whether riak-admin is present
  end

  # Reports ring readiness as "riak ring": ok when the check output starts
  # with TRUE, warning otherwise (including when no tool is available, in
  # which case the description is nil).
  def check_ring
    if @escript
      str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      str = `riak-admin ringready`
    end

    report(
      :host => opts[:riak_host],
      :service => 'riak ring',
      :state => (str =~ /^TRUE/ ? 'ok' : 'warning'),
      :description => str
    )
  end

  # Reports the output of the riemann-riak-keys helper as "riak keys".
  # Not called from #tick (see the comment there).
  def check_keys
    keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp
    if keys =~ /^\d+$/
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'ok',
        :metric => keys.to_i,
        :description => keys
      )
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'unknown',
        :description => keys
      )
    end
  end

  # Reports pending partition handoffs for this node as "riak transfers".
  # Does nothing when riak-admin is not available.
  def check_transfers
    return unless @riakadmin

    str = `riak-admin transfers`

    # The node name is escaped so that its dots only match literal dots.
    pending = str.match(/'#{Regexp.escape(opts[:node_name])}' waiting to handoff (\d+) partitions/)
    if pending
      report(
        :host => opts[:riak_host],
        :service => 'riak transfers',
        :state => 'critical',
        :metric => pending[1].to_i,
        :description => "waiting to handoff #{pending[1]} partitions"
      )
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak transfers',
        :state => 'ok',
        :metric => 0,
        :description => "No pending transfers"
      )
    end
  end

  # Reports the size of the data directory as "riak disk". The first field of
  # the `du -Ls` output is divided by 1024**2 and labelled GB.
  def check_disk
    gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'ok',
      :metric => gb,
      :description => "#{gb} GB in #{opts[:data_dir]}"
    )
  end

  # Returns the riak stat for the given fsm type and percentile.
  def fsm_stat(type, property, percentile)
    "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
  end

  # Returns the alerts state for the given fsm: ok up to the configured
  # warning threshold, warning up to twice the threshold, critical beyond.
  def fsm_state(type, percentile, val)
    limit = opts["#{type}_#{percentile}_warning".to_sym]
    case val
    when 0 .. limit
      'ok'
    when limit .. limit * 2
      'warning'
    else
      'critical'
    end
  end

  # Get current stats via HTTP.
  #
  # Returns the parsed JSON body. Reports a critical "riak" event and raises
  # when the request fails or the response status is not 200.
  def stats_http
    begin
      res = fetch_stats_response
    rescue => e
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
      )
      raise
    end

    return JSON.parse(res.body) if res.code.to_i == 200

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
    )
    raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
  end

  # Get current stats via riak-admin.
  #
  # Returns a Hash built from the "name : value" lines of `riak-admin status`.
  # Lines without a " : " separator are ignored. Raises when the command
  # fails.
  def stats_riak_admin
    str = `riak-admin status`
    raise "riak-admin failed" unless $?.success?

    pairs = str.split(/\n/).map { |line| line.split(/ : /, 2) }
    Hash[pairs.select { |pair| pair.size == 2 }]
  end

  # Get current stats as a hash, preferring HTTP over riak-admin. Reports a
  # critical "riak" event and raises when neither mechanism is available.
  def stats
    if @httpstatus
      stats_http
    elsif @riakadmin
      stats_riak_admin
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
      )
      raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
    end
  end

  # Names of the counters reported by #check_stats as "riak <name>".
  def core_services
    ['vnode_gets',
     'vnode_puts',
     'node_gets',
     'node_puts',
     'node_gets_set',
     'node_puts_set',
     'read_repairs']
  end

  # FSM type => property pairs reported by #check_stats.
  def fsm_types
    [{'get' => 'time'}, {'put' => 'time'},
     {'get' => 'set_objsize'}]
  end

  # Percentiles reported for every FSM type.
  def fsm_percentiles
    [50, 95, 99]
  end

  # Reports current stats to Riemann
  def check_stats
    begin
      stats = self.stats
    rescue => e
      event = {:state => 'critical',
               :description => e.message,
               :host => opts[:riak_host]}
      # Report errors
      report(event.merge(:service => 'riak'))
      core_services.each do |s|
        report(event.merge(:service => "riak #{s}"))
      end
      fsm_types.each do |typespec|
        typespec.each do |type, prop|
          fsm_percentiles.each do |percentile|
            report(event.merge(:service => "riak #{type} #{prop} #{percentile}"))
          end
        end
      end
      return
    end

    # Riak itself
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'ok'
    )

    # Gets/puts/rr
    core_services.each do |s|
      per_second = stats[s].to_i / 60.0
      report(
        :host => opts[:riak_host],
        :service => "riak #{s}",
        :state => 'ok',
        :metric => per_second,
        :description => "#{per_second}/sec"
      )
    end

    # FSMs
    fsm_types.each do |typespec|
      typespec.each do |type, prop|
        fsm_percentiles.each do |percentile|
          # to_i maps a missing stat (nil) and the string "undefined" to 0.
          val = stats[fsm_stat(type, prop, percentile)].to_i
          timed = prop == 'time'
          val /= 1000.0 if timed # Convert us to ms
          state = timed ? fsm_state(type, percentile, val) : "ok"
          report(
            :host => opts[:riak_host],
            :service => "riak #{type} #{prop} #{percentile}",
            :state => state,
            :metric => val,
            :description => "#{val} ms"
          )
        end
      end
    end
  end

  # Runs every check once.
  def tick
    # This can utterly destroy a cluster, so we disable
    # check_keys
    check_stats
    check_ring
    check_disk
    check_transfers
  end

  private

  # Performs the GET against the stats endpoint and returns the raw
  # Net::HTTP response. Errors raised by the request propagate to the caller.
  def fetch_stats_response
    uri = URI.parse(opts[:riak_host])
    # A bare host name or IP parses without a host component; use it as-is.
    uri.host = opts[:riak_host] if uri.host.nil?

    http = Net::HTTP.new(uri.host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for https hosts.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

    http.start do |conn|
      conn.get opts[:stats_path]
    end
  end
end

Riemann::Tools::Riak.run