riemann-tools 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5b58978e25c07331c031e4575ffb46b7a04c3db5
4
- data.tar.gz: 50649c44cd74a42dd1b7d9b82e27cbdfdb16341d
3
+ metadata.gz: db9b835831c44292e608818f39aded205bc4f0f9
4
+ data.tar.gz: 0049cb2ffc88932a2f7b9528b1760a1e571fc99c
5
5
  SHA512:
6
- metadata.gz: 2d2a3f364ed87b7d79252eafc11fb5d05c5ef000cc029460c08b1b1bd4457800329e3cf69dda990458e2fb91900b02413ebcb28c677ab696bb88a46790f14ef1
7
- data.tar.gz: 491e357bbf23baac6fbee6172a6f8ecb81fb004280c4135bfe67d00945785bbc190531b97a93ff06fbcd248bb5e178243a2effc92ddb5f31ad198c342fc4d39a
6
+ metadata.gz: 67c75f0cdba4c515c26e0d097b8b6675c2572341dbc5cb21256f512abcbe89ec4ecf28a3e9bd43d97702923e5a119016fd02dc0706e8f95a381ba95ef4b9fa43
7
+ data.tar.gz: 471dbf8c04d0f35fb396e900ae4d64dc6de2e4c6cc16f92a4d3b5f76870d34577cc58b6b4ac306b85cb9a576b55db2a861d76cdc0bacd4502285a9036a00a820
@@ -5,26 +5,107 @@ require File.expand_path('../../lib/riemann/tools', __FILE__)
5
5
  class Riemann::Tools::FreeSWITCH
6
6
  include Riemann::Tools
7
7
 
8
+ opt :calls_warning, "Calls warning threshold", :default => 100
9
+ opt :calls_critical, "Calls critical threshold", :default => 300
10
+ opt :pid_file, "FreeSWITCH daemon pidfile", :type => String, :default => "/var/run/freeswitch/freeswitch.pid"
11
+
12
+ def initialize
13
+ @limits = {
14
+ :calls => {:critical => opts[:calls_critical], :warning => opts[:calls_warning]}
15
+ }
16
+ end
17
+
18
+ def dead_proc?(pid)
19
+ begin
20
+ Process.getpgid(pid)
21
+ false
22
+ rescue Errno::ESRCH
23
+ true
24
+ end
25
+ end
26
+
27
+ def alert(service, state, metric, description)
28
+ report(
29
+ :service => service.to_s,
30
+ :state => state.to_s,
31
+ :metric => metric.to_f,
32
+ :description => description
33
+ )
34
+ end
35
+
36
+ def exec_with_timeout(cmd, timeout)
37
+ pid = Process.spawn(cmd, {[:err,:out] => :close, :pgroup => true})
38
+ begin
39
+ Timeout.timeout(timeout) do
40
+ Process.waitpid(pid, 0)
41
+ $?.exitstatus == 0
42
+ end
43
+ rescue Timeout::Error
44
+ Process.kill(15, -Process.getpgid(pid))
45
+ puts "Killed pid: #{pid}"
46
+ false
47
+ end
48
+ end
49
+
8
50
  def tick
51
+ # Determine how many current calls I have according to FreeSWITCH
52
+ fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+'].to_i
53
+
54
+ # Determine how many current channels I have according to FreeSWITCH
55
+ fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+'].to_i
56
+
57
+ # Determine how many conferences I have according to FreeSWITCH
58
+ fs_conferences = %x[fs_cli -x "conference list"| grep -Pco '^Conference'].to_i
59
+
60
+ # Try to read pidfile. If it fails use Devil's dummy PID
61
+ begin
62
+ fs_pid = File.read(opts[:pid_file]).to_i
63
+ rescue
64
+ puts "Couldn't read pidfile: #{opts[:pid_file]}"
65
+ fs_pid = -666
66
+ end
67
+
68
+ # Submit calls to riemann
69
+ if fs_calls > @limits[:calls][:critical]
70
+ alert "FreeSWITCH current calls", :critical, fs_calls, "Number of calls are #{fs_calls}"
71
+ elsif fs_calls > @limits[:calls][:warning]
72
+ alert "FreeSWITCH current calls", :warning, fs_calls, "Number of calls are #{fs_calls}"
73
+ else
74
+ alert "FreeSWITCH current calls", :ok, fs_calls, "Number of calls are #{fs_calls}"
75
+ end
76
+
77
+ # Submit channels to riemann
78
+ if fs_channels > @limits[:calls][:critical]
79
+ alert "FreeSWITCH current channels", :critical, fs_channels, "Number of channels are #{fs_channels}"
80
+ elsif fs_channels > @limits[:calls][:warning]
81
+ alert "FreeSWITCH current channels", :warning, fs_channels, "Number of channels are #{fs_channels}"
82
+ else
83
+ alert "FreeSWITCH current channels", :ok, fs_channels, "Number of channels are #{fs_channels}"
84
+ end
85
+
86
+ # Submit conferences to riemann
87
+ if fs_conferences > @limits[:calls][:critical]
88
+ alert "FreeSWITCH current conferences", :critical, fs_conferences, "Number of conferences are #{fs_conferences}"
89
+ elsif fs_conferences > @limits[:calls][:warning]
90
+ alert "FreeSWITCH current conferences", :warning, fs_conferences, "Number of conferences are #{fs_conferences}"
91
+ else
92
+ alert "FreeSWITCH current conferences", :ok, fs_conferences, "Number of conferences are #{fs_conferences}"
93
+ end
94
+
95
+ # Submit status to riemann
96
+ if dead_proc?(fs_pid)
97
+ alert "FreeSWITCH status", :critical, -1, "FreeSWITCH service status: not running"
98
+ else
99
+ alert "FreeSWITCH status", :ok, nil, "FreeSWITCH service status: running"
100
+ end
101
+
102
+ # Submit CLI status to riemann using timeout in case it's unresponsive
103
+ if exec_with_timeout("fs_cli -x status", 2)
104
+ alert "FreeSWITCH CLI status", :ok, nil, "FreeSWITCH CLI status: responsive"
105
+ else
106
+ alert "FreeSWITCH CLI status", :critical, -1, "FreeSWITCH CLI status: not responding"
107
+ end
9
108
 
10
- #determine how many current calls I have according to FreeSWITCH
11
- fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+']
12
-
13
- #determine how many current channels I have according to FreeSWITCH
14
- fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+']
15
-
16
- #submit them to riemann
17
- report(
18
- :service => "FreeSWITCH current calls",
19
- :metric => fs_calls.to_i,
20
- :state => "info"
21
- )
22
-
23
- report(
24
- :service => "FreeSWITCH current channels",
25
- :metric => fs_channels.to_i,
26
- :state => "info"
27
- )
28
109
  end
29
110
  end
30
111
 
data/bin/riemann-health CHANGED
@@ -114,7 +114,7 @@ class Riemann::Tools::Health
114
114
  total = used + i2-i1
115
115
  fraction = used.to_f / total
116
116
 
117
- report_pct :cpu, fraction, "user+nice+sytem\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
117
+ report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
118
118
  end
119
119
 
120
120
  @old_cpu = [u2, n2, s2, i2]
@@ -196,6 +196,14 @@ class Riemann::Tools::Health
196
196
  used = mdat[7].to_i * (1024 ** "BKMGT".index(mdat[8]))
197
197
  free = mdat[9].to_i * (1024 ** "BKMGT".index(mdat[10]))
198
198
  @topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
199
+ # This is for OSX Mavericks which
200
+ # uses a different format for top
201
+ # Example: PhysMem: 4662M used (1328M wired), 2782M unused.
202
+ elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \(([0-9]+)([BKMGT]) wired\), ([0-9]+)([BKMGT]) unused/i)
203
+ used = mdat[1].to_i * (1024 ** "BKMGT".index(mdat[2]))
204
+ wired = mdat[3].to_i * (1024 ** "BKMGT".index(mdat[4]))
205
+ unused = mdat[5].to_i * (1024 ** "BKMGT".index(mdat[6]))
206
+ @topdata[:memory] = (used).to_f / (used + unused)
199
207
  end
200
208
  end
201
209
  end
data/bin/riemann-net CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # Gathers munin statistics and submits them to Riemann.
3
+ # Gathers network interface statistics and submits them to Riemann.
4
4
 
5
5
  require File.expand_path('../../lib/riemann/tools', __FILE__)
6
6
 
data/bin/riemann-rabbitmq CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- #
3
2
 
4
3
  require File.expand_path('../../lib/riemann/tools', __FILE__)
5
4
 
@@ -16,11 +15,28 @@ class Riemann::Tools::Rabbitmq
16
15
 
17
16
  opt :monitor_user, 'RabbitMQ monitoring user', type: :string
18
17
  opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
19
- opt :monitor_port, 'RabbitMQ monitoring port', default: 15672
20
- opt :monitor_host, 'RabbitMQ monitoring host', default: "localhost"
18
+ opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
19
+ opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
21
20
 
22
- def monitor_url
23
- "http://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api/overview"
21
+ opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
22
+ opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
23
+
24
+ opt :node, "Specify a node to monitor", type: :strings
25
+
26
+ def base_url
27
+ "http://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
28
+ end
29
+
30
+ def overview_url
31
+ "#{base_url}/overview"
32
+ end
33
+
34
+ def node_url(n)
35
+ "#{base_url}/nodes/#{n}"
36
+ end
37
+
38
+ def queues_url
39
+ "#{base_url}/queues"
24
40
  end
25
41
 
26
42
  def event_host
@@ -40,6 +56,11 @@ class Riemann::Tools::Rabbitmq
40
56
  req.options[:timeout] = options[:read_timeout]
41
57
  req.options[:open_timeout] = options[:open_timeout]
42
58
  end
59
+ report(:host => event_host,
60
+ :service => "rabbitmq monitoring",
61
+ :state => 'ok',
62
+ :description => "Monitoring operational"
63
+ )
43
64
  rescue => e
44
65
  report(:host => event_host,
45
66
  :service => "rabbitmq monitoring",
@@ -50,8 +71,92 @@ class Riemann::Tools::Rabbitmq
50
71
  response
51
72
  end
52
73
 
53
- def tick
54
- uri = URI(monitor_url)
74
+ def check_queues
75
+ response = safe_get(queues_url, event_host)
76
+ max_size_check_filter = if options[:ignore_max_size_queues]
77
+ Regexp.new(options[:ignore_max_size_queues])
78
+ else
79
+ nil
80
+ end
81
+
82
+ return if response.nil?
83
+
84
+ json = JSON.parse(response.body)
85
+
86
+ if response.status != 200
87
+ report(:host => event_host,
88
+ :service => "rabbitmq.queue",
89
+ :state => "critical",
90
+ :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}"
91
+ )
92
+ else
93
+ report(:host => event_host,
94
+ :service => "rabbitmq.queue",
95
+ :state => "ok",
96
+ :description => "HTTP connection ok"
97
+ )
98
+
99
+ json = JSON.parse(response.body)
100
+
101
+ json.each do |queue|
102
+ svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
103
+ errs = []
104
+
105
+ if queue['messages_ready'] > 0 and queue['consumers'] == 0
106
+ errs << "Queue has jobs but no consumers"
107
+ end
108
+
109
+ if (max_size_check_filter.nil? or queue['name'] !~ max_size_check_filter) and queue['messages_ready'] > options[:max_queue_size]
110
+ errs << "Queue has #{queue['messages_ready']} jobs"
111
+ end
112
+
113
+ if errs.empty?
114
+ report(:host => event_host,
115
+ :service => svc,
116
+ :state => "ok",
117
+ :description => "Queue is looking good"
118
+ )
119
+ else
120
+ report(:host => event_host,
121
+ :service => svc,
122
+ :state => "critical",
123
+ :description => errs.join("; ")
124
+ )
125
+ end
126
+
127
+ stats = (queue['message_stats'] || {}).merge(
128
+ 'messages' => queue['messages'],
129
+ 'messages_details' => queue['messages_details'],
130
+ 'messages_ready' => queue['messages_ready'],
131
+ 'messages_ready_details' => queue['messages_ready_details'],
132
+ 'messages_unacknowledged' => queue['messages_unacknowledged'],
133
+ 'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
134
+ 'consumers' => queue['consumers'],
135
+ 'memory' => queue['memory'],
136
+ )
137
+
138
+ stats.each_pair do |k,v|
139
+ service = "#{svc}.#{k}"
140
+ if k =~ /details$/
141
+ metric = v['rate']
142
+ else
143
+ metric = v
144
+ end
145
+
146
+ # TODO: Set state via thresholds which can be configured
147
+
148
+ report(:host => event_host,
149
+ :service => service,
150
+ :metric => metric,
151
+ :description => "RabbitMQ monitor"
152
+ )
153
+ end
154
+ end
155
+ end
156
+ end
157
+
158
+ def check_overview
159
+ uri = URI(overview_url)
55
160
  response = safe_get(uri, event_host)
56
161
 
57
162
  return if response.nil?
@@ -95,5 +200,63 @@ class Riemann::Tools::Rabbitmq
95
200
  end
96
201
  end
97
202
  end
203
+
204
+ def check_node
205
+ opts[:node].each do |n|
206
+ uri = URI(node_url(n))
207
+ response = safe_get(uri, event_host)
208
+
209
+ return if response.nil?
210
+
211
+ if response.status != 200
212
+ if response.status == 404
213
+ report(:host => event_host,
214
+ :service => "rabbitmq.node.#{n}",
215
+ :state => "critical",
216
+ :description => "Node was not found in the cluster"
217
+ )
218
+ else
219
+ report(:host => event_host,
220
+ :service => "rabbitmq.node.#{n}",
221
+ :state => "critical",
222
+ :description => "HTTP error: #{response.status} - #{response.body}"
223
+ )
224
+ end
225
+ return
226
+ end
227
+
228
+ json = JSON.parse(response.body)
229
+
230
+ if json['mem_alarm']
231
+ report(:host => event_host,
232
+ :service => "rabbitmq.node.#{n}",
233
+ :state => "critical",
234
+ :description => "Memory alarm has triggered; job submission throttled"
235
+ )
236
+ return
237
+ end
238
+
239
+ if json['disk_free_alarm']
240
+ report(:host => event_host,
241
+ :service => "rabbitmq.node.#{n}",
242
+ :state => "critical",
243
+ :description => "Disk free alarm has triggered; job submission throttled"
244
+ )
245
+ return
246
+ end
247
+
248
+ report(:host => event_host,
249
+ :service => "rabbitmq.node.#{n}",
250
+ :state => "ok",
251
+ :description => "Node looks OK to me"
252
+ )
253
+ end
254
+ end
255
+
256
+ def tick
257
+ check_overview
258
+ check_node if opts[:node]
259
+ check_queues
260
+ end
98
261
  end
99
262
  Riemann::Tools::Rabbitmq.run
data/bin/riemann-riak CHANGED
@@ -27,8 +27,8 @@ class Riemann::Tools::Riak
27
27
 
28
28
  def initialize
29
29
  detect_features
30
-
31
- @httpstatus = true
30
+
31
+ @httpstatus = true
32
32
  # What's going on here? --aphyr
33
33
  if
34
34
  begin
@@ -59,7 +59,7 @@ class Riemann::Tools::Riak
59
59
  def detect_features
60
60
  @escript = true # Whether escript is present on this machine
61
61
  @riakadmin = true # Whether riak-admin is present
62
-
62
+
63
63
  if `which escript` =~ /^\s*$/
64
64
  @escript = false
65
65
  end
@@ -113,6 +113,34 @@ class Riemann::Tools::Riak
113
113
  end
114
114
  end
115
115
 
116
+ def check_transfers
117
+ str = if @riakadmin
118
+ `riak-admin transfers`
119
+ else
120
+ nil
121
+ end
122
+
123
+ return if str.nil?
124
+
125
+ if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/
126
+ report(
127
+ :host => opts[:riak_host],
128
+ :service => 'riak transfers',
129
+ :state => 'critical',
130
+ :metric => $1.to_i,
131
+ :description => "waiting to handoff #{$1} partitions"
132
+ )
133
+ else
134
+ report(
135
+ :host => opts[:riak_host],
136
+ :service => 'riak transfers',
137
+ :state => 'ok',
138
+ :metric => 0,
139
+ :description => "No pending transfers"
140
+ )
141
+ end
142
+ end
143
+
116
144
  def check_disk
117
145
  gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
118
146
  report(
@@ -125,8 +153,8 @@ class Riemann::Tools::Riak
125
153
  end
126
154
 
127
155
  # Returns the riak stat for the given fsm type and percentile.
128
- def fsm_stat(type, percentile)
129
- "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}"
156
+ def fsm_stat(type, property, percentile)
157
+ "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
130
158
  end
131
159
 
132
160
  # Returns the alerts state for the given fsm.
@@ -209,11 +237,14 @@ class Riemann::Tools::Riak
209
237
  'vnode_puts',
210
238
  'node_gets',
211
239
  'node_puts',
240
+ 'node_gets_set',
241
+ 'node_puts_set',
212
242
  'read_repairs']
213
243
  end
214
244
 
215
245
  def fsm_types
216
- ['get', 'put']
246
+ [{'get' => 'time'}, {'put' => 'time'},
247
+ {'get' => 'set_objsize'}]
217
248
  end
218
249
 
219
250
  def fsm_percentiles
@@ -233,9 +264,11 @@ class Riemann::Tools::Riak
233
264
  core_services.each do |s|
234
265
  report(event.merge(:service => "riak #{s}"))
235
266
  end
236
- fsm_types.each do |type|
237
- fsm_percentiles.each do |percentile|
238
- report(event.merge(:service => "riak #{type} #{percentile}"))
267
+ fsm_types.each do |typespec|
268
+ typespec.each do |type, prop|
269
+ fsm_percentiles.each do |percentile|
270
+ report(event.merge(:service => "riak #{type} #{prop} #{percentile}"))
271
+ end
239
272
  end
240
273
  end
241
274
  return
@@ -260,19 +293,25 @@ class Riemann::Tools::Riak
260
293
  end
261
294
 
262
295
  # FSMs
263
- fsm_types.each do |type|
264
- fsm_percentiles.each do |percentile|
265
- val = stats[fsm_stat(type, percentile)].to_i || 0
266
- val = 0 if val == 'undefined'
267
- val /= 1000.0 # Convert us to ms
268
- state = fsm_state(type, percentile, val)
269
- report(
270
- :host => opts[:riak_host],
271
- :service => "riak #{type} #{percentile}",
272
- :state => state,
273
- :metric => val,
274
- :description => "#{val} ms"
275
- )
296
+ fsm_types.each do |typespec|
297
+ typespec.each do |type, prop|
298
+ fsm_percentiles.each do |percentile|
299
+ val = stats[fsm_stat(type, prop, percentile)].to_i || 0
300
+ val = 0 if val == 'undefined'
301
+ val /= 1000.0 if prop == 'time' # Convert us to ms
302
+ if prop == 'time'
303
+ state = fsm_state(type, percentile, val)
304
+ else
305
+ state = "ok"
306
+ end
307
+ report(
308
+ :host => opts[:riak_host],
309
+ :service => "riak #{type} #{prop} #{percentile}",
310
+ :state => state,
311
+ :metric => val,
312
+ :description => "#{val} ms"
313
+ )
314
+ end
276
315
  end
277
316
  end
278
317
  end
@@ -283,6 +322,7 @@ class Riemann::Tools::Riak
283
322
  check_stats
284
323
  check_ring
285
324
  check_disk
325
+ check_transfers
286
326
  end
287
327
  end
288
328
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: riemann-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Kingsbury
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-30 00:00:00.000000000 Z
11
+ date: 2015-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: riemann-client