riemann-tools 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5b58978e25c07331c031e4575ffb46b7a04c3db5
4
- data.tar.gz: 50649c44cd74a42dd1b7d9b82e27cbdfdb16341d
3
+ metadata.gz: db9b835831c44292e608818f39aded205bc4f0f9
4
+ data.tar.gz: 0049cb2ffc88932a2f7b9528b1760a1e571fc99c
5
5
  SHA512:
6
- metadata.gz: 2d2a3f364ed87b7d79252eafc11fb5d05c5ef000cc029460c08b1b1bd4457800329e3cf69dda990458e2fb91900b02413ebcb28c677ab696bb88a46790f14ef1
7
- data.tar.gz: 491e357bbf23baac6fbee6172a6f8ecb81fb004280c4135bfe67d00945785bbc190531b97a93ff06fbcd248bb5e178243a2effc92ddb5f31ad198c342fc4d39a
6
+ metadata.gz: 67c75f0cdba4c515c26e0d097b8b6675c2572341dbc5cb21256f512abcbe89ec4ecf28a3e9bd43d97702923e5a119016fd02dc0706e8f95a381ba95ef4b9fa43
7
+ data.tar.gz: 471dbf8c04d0f35fb396e900ae4d64dc6de2e4c6cc16f92a4d3b5f76870d34577cc58b6b4ac306b85cb9a576b55db2a861d76cdc0bacd4502285a9036a00a820
@@ -5,26 +5,107 @@ require File.expand_path('../../lib/riemann/tools', __FILE__)
5
5
  class Riemann::Tools::FreeSWITCH
6
6
  include Riemann::Tools
7
7
 
8
+ opt :calls_warning, "Calls warning threshold", :default => 100
9
+ opt :calls_critical, "Calls critical threshold", :default => 300
10
+ opt :pid_file, "FreeSWITCH daemon pidfile", :type => String, :default => "/var/run/freeswitch/freeswitch.pid"
11
+
12
+ def initialize
13
+ @limits = {
14
+ :calls => {:critical => opts[:calls_critical], :warning => opts[:calls_warning]}
15
+ }
16
+ end
17
+
18
+ def dead_proc?(pid)
19
+ begin
20
+ Process.getpgid(pid)
21
+ false
22
+ rescue Errno::ESRCH
23
+ true
24
+ end
25
+ end
26
+
27
+ def alert(service, state, metric, description)
28
+ report(
29
+ :service => service.to_s,
30
+ :state => state.to_s,
31
+ :metric => metric.to_f,
32
+ :description => description
33
+ )
34
+ end
35
+
36
+ def exec_with_timeout(cmd, timeout)
37
+ pid = Process.spawn(cmd, {[:err,:out] => :close, :pgroup => true})
38
+ begin
39
+ Timeout.timeout(timeout) do
40
+ Process.waitpid(pid, 0)
41
+ $?.exitstatus == 0
42
+ end
43
+ rescue Timeout::Error
44
+ Process.kill(15, -Process.getpgid(pid))
45
+ puts "Killed pid: #{pid}"
46
+ false
47
+ end
48
+ end
49
+
8
50
  def tick
51
+ # Determine how many current calls I have according to FreeSWITCH
52
+ fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+'].to_i
53
+
54
+ # Determine how many current channels I have according to FreeSWITCH
55
+ fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+'].to_i
56
+
57
+ # Determine how many conferences I have according to FreeSWITCH
58
+ fs_conferences = %x[fs_cli -x "conference list"| grep -Pco '^Conference'].to_i
59
+
60
+ # Try to read pidfile. If it fails use Devil's dummy PID
61
+ begin
62
+ fs_pid = File.read(opts[:pid_file]).to_i
63
+ rescue
64
+ puts "Couldn't read pidfile: #{opts[:pid_file]}"
65
+ fs_pid = -666
66
+ end
67
+
68
+ # Submit calls to riemann
69
+ if fs_calls > @limits[:calls][:critical]
70
+ alert "FreeSWITCH current calls", :critical, fs_calls, "Number of calls are #{fs_calls}"
71
+ elsif fs_calls > @limits[:calls][:warning]
72
+ alert "FreeSWITCH current calls", :warning, fs_calls, "Number of calls are #{fs_calls}"
73
+ else
74
+ alert "FreeSWITCH current calls", :ok, fs_calls, "Number of calls are #{fs_calls}"
75
+ end
76
+
77
+ # Submit channels to riemann
78
+ if fs_channels > @limits[:calls][:critical]
79
+ alert "FreeSWITCH current channels", :critical, fs_channels, "Number of channels are #{fs_channels}"
80
+ elsif fs_channels > @limits[:calls][:warning]
81
+ alert "FreeSWITCH current channels", :warning, fs_channels, "Number of channels are #{fs_channels}"
82
+ else
83
+ alert "FreeSWITCH current channels", :ok, fs_channels, "Number of channels are #{fs_channels}"
84
+ end
85
+
86
+ # Submit conferences to riemann
87
+ if fs_conferences > @limits[:calls][:critical]
88
+ alert "FreeSWITCH current conferences", :critical, fs_conferences, "Number of conferences are #{fs_conferences}"
89
+ elsif fs_conferences > @limits[:calls][:warning]
90
+ alert "FreeSWITCH current conferences", :warning, fs_conferences, "Number of conferences are #{fs_conferences}"
91
+ else
92
+ alert "FreeSWITCH current conferences", :ok, fs_conferences, "Number of conferences are #{fs_conferences}"
93
+ end
94
+
95
+ # Submit status to riemann
96
+ if dead_proc?(fs_pid)
97
+ alert "FreeSWITCH status", :critical, -1, "FreeSWITCH service status: not running"
98
+ else
99
+ alert "FreeSWITCH status", :ok, nil, "FreeSWITCH service status: running"
100
+ end
101
+
102
+ # Submit CLI status to riemann using timeout in case it's unresponsive
103
+ if exec_with_timeout("fs_cli -x status", 2)
104
+ alert "FreeSWITCH CLI status", :ok, nil, "FreeSWITCH CLI status: responsive"
105
+ else
106
+ alert "FreeSWITCH CLI status", :critical, -1, "FreeSWITCH CLI status: not responding"
107
+ end
9
108
 
10
- #determine how many current calls I have according to FreeSWITCH
11
- fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+']
12
-
13
- #determine how many current channels I have according to FreeSWITCH
14
- fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+']
15
-
16
- #submit them to riemann
17
- report(
18
- :service => "FreeSWITCH current calls",
19
- :metric => fs_calls.to_i,
20
- :state => "info"
21
- )
22
-
23
- report(
24
- :service => "FreeSWITCH current channels",
25
- :metric => fs_channels.to_i,
26
- :state => "info"
27
- )
28
109
  end
29
110
  end
30
111
 
data/bin/riemann-health CHANGED
@@ -114,7 +114,7 @@ class Riemann::Tools::Health
114
114
  total = used + i2-i1
115
115
  fraction = used.to_f / total
116
116
 
117
- report_pct :cpu, fraction, "user+nice+sytem\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
117
+ report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
118
118
  end
119
119
 
120
120
  @old_cpu = [u2, n2, s2, i2]
@@ -196,6 +196,14 @@ class Riemann::Tools::Health
196
196
  used = mdat[7].to_i * (1024 ** "BKMGT".index(mdat[8]))
197
197
  free = mdat[9].to_i * (1024 ** "BKMGT".index(mdat[10]))
198
198
  @topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
199
+ # This is for OSX Mavericks which
200
+ # uses a different format for top
201
+ # Example: PhysMem: 4662M used (1328M wired), 2782M unused.
202
+ elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \(([0-9]+)([BKMGT]) wired\), ([0-9]+)([BKMGT]) unused/i)
203
+ used = mdat[1].to_i * (1024 ** "BKMGT".index(mdat[2]))
204
+ wired = mdat[3].to_i * (1024 ** "BKMGT".index(mdat[4]))
205
+ unused = mdat[5].to_i * (1024 ** "BKMGT".index(mdat[6]))
206
+ @topdata[:memory] = (used).to_f / (used + unused)
199
207
  end
200
208
  end
201
209
  end
data/bin/riemann-net CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # Gathers munin statistics and submits them to Riemann.
3
+ # Gathers network interface statistics and submits them to Riemann.
4
4
 
5
5
  require File.expand_path('../../lib/riemann/tools', __FILE__)
6
6
 
data/bin/riemann-rabbitmq CHANGED
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- #
3
2
 
4
3
  require File.expand_path('../../lib/riemann/tools', __FILE__)
5
4
 
@@ -16,11 +15,28 @@ class Riemann::Tools::Rabbitmq
16
15
 
17
16
  opt :monitor_user, 'RabbitMQ monitoring user', type: :string
18
17
  opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
19
- opt :monitor_port, 'RabbitMQ monitoring port', default: 15672
20
- opt :monitor_host, 'RabbitMQ monitoring host', default: "localhost"
18
+ opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
19
+ opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
21
20
 
22
- def monitor_url
23
- "http://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api/overview"
21
+ opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
22
+ opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
23
+
24
+ opt :node, "Specify a node to monitor", type: :strings
25
+
26
+ def base_url
27
+ "http://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
28
+ end
29
+
30
+ def overview_url
31
+ "#{base_url}/overview"
32
+ end
33
+
34
+ def node_url(n)
35
+ "#{base_url}/nodes/#{n}"
36
+ end
37
+
38
+ def queues_url
39
+ "#{base_url}/queues"
24
40
  end
25
41
 
26
42
  def event_host
@@ -40,6 +56,11 @@ class Riemann::Tools::Rabbitmq
40
56
  req.options[:timeout] = options[:read_timeout]
41
57
  req.options[:open_timeout] = options[:open_timeout]
42
58
  end
59
+ report(:host => event_host,
60
+ :service => "rabbitmq monitoring",
61
+ :state => 'ok',
62
+ :description => "Monitoring operational"
63
+ )
43
64
  rescue => e
44
65
  report(:host => event_host,
45
66
  :service => "rabbitmq monitoring",
@@ -50,8 +71,92 @@ class Riemann::Tools::Rabbitmq
50
71
  response
51
72
  end
52
73
 
53
- def tick
54
- uri = URI(monitor_url)
74
+ def check_queues
75
+ response = safe_get(queues_url, event_host)
76
+ max_size_check_filter = if options[:ignore_max_size_queues]
77
+ Regexp.new(options[:ignore_max_size_queues])
78
+ else
79
+ nil
80
+ end
81
+
82
+ return if response.nil?
83
+
84
+ json = JSON.parse(response.body)
85
+
86
+ if response.status != 200
87
+ report(:host => event_host,
88
+ :service => "rabbitmq.queue",
89
+ :state => "critical",
90
+ :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}"
91
+ )
92
+ else
93
+ report(:host => event_host,
94
+ :service => "rabbitmq.queue",
95
+ :state => "ok",
96
+ :description => "HTTP connection ok"
97
+ )
98
+
99
+ json = JSON.parse(response.body)
100
+
101
+ json.each do |queue|
102
+ svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
103
+ errs = []
104
+
105
+ if queue['messages_ready'] > 0 and queue['consumers'] == 0
106
+ errs << "Queue has jobs but no consumers"
107
+ end
108
+
109
+ if (max_size_check_filter.nil? or queue['name'] !~ max_size_check_filter) and queue['messages_ready'] > options[:max_queue_size]
110
+ errs << "Queue has #{queue['messages_ready']} jobs"
111
+ end
112
+
113
+ if errs.empty?
114
+ report(:host => event_host,
115
+ :service => svc,
116
+ :state => "ok",
117
+ :description => "Queue is looking good"
118
+ )
119
+ else
120
+ report(:host => event_host,
121
+ :service => svc,
122
+ :state => "critical",
123
+ :description => errs.join("; ")
124
+ )
125
+ end
126
+
127
+ stats = (queue['message_stats'] || {}).merge(
128
+ 'messages' => queue['messages'],
129
+ 'messages_details' => queue['messages_details'],
130
+ 'messages_ready' => queue['messages_ready'],
131
+ 'messages_ready_details' => queue['messages_ready_details'],
132
+ 'messages_unacknowledged' => queue['messages_unacknowledged'],
133
+ 'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
134
+ 'consumers' => queue['consumers'],
135
+ 'memory' => queue['memory'],
136
+ )
137
+
138
+ stats.each_pair do |k,v|
139
+ service = "#{svc}.#{k}"
140
+ if k =~ /details$/
141
+ metric = v['rate']
142
+ else
143
+ metric = v
144
+ end
145
+
146
+ # TODO: Set state via thresholds which can be configured
147
+
148
+ report(:host => event_host,
149
+ :service => service,
150
+ :metric => metric,
151
+ :description => "RabbitMQ monitor"
152
+ )
153
+ end
154
+ end
155
+ end
156
+ end
157
+
158
+ def check_overview
159
+ uri = URI(overview_url)
55
160
  response = safe_get(uri, event_host)
56
161
 
57
162
  return if response.nil?
@@ -95,5 +200,63 @@ class Riemann::Tools::Rabbitmq
95
200
  end
96
201
  end
97
202
  end
203
+
204
+ def check_node
205
+ opts[:node].each do |n|
206
+ uri = URI(node_url(n))
207
+ response = safe_get(uri, event_host)
208
+
209
+ return if response.nil?
210
+
211
+ if response.status != 200
212
+ if response.status == 404
213
+ report(:host => event_host,
214
+ :service => "rabbitmq.node.#{n}",
215
+ :state => "critical",
216
+ :description => "Node was not found in the cluster"
217
+ )
218
+ else
219
+ report(:host => event_host,
220
+ :service => "rabbitmq.node.#{n}",
221
+ :state => "critical",
222
+ :description => "HTTP error: #{response.status} - #{response.body}"
223
+ )
224
+ end
225
+ return
226
+ end
227
+
228
+ json = JSON.parse(response.body)
229
+
230
+ if json['mem_alarm']
231
+ report(:host => event_host,
232
+ :service => "rabbitmq.node.#{n}",
233
+ :state => "critical",
234
+ :description => "Memory alarm has triggered; job submission throttled"
235
+ )
236
+ return
237
+ end
238
+
239
+ if json['disk_free_alarm']
240
+ report(:host => event_host,
241
+ :service => "rabbitmq.node.#{n}",
242
+ :state => "critical",
243
+ :description => "Disk free alarm has triggered; job submission throttled"
244
+ )
245
+ return
246
+ end
247
+
248
+ report(:host => event_host,
249
+ :service => "rabbitmq.node.#{n}",
250
+ :state => "ok",
251
+ :description => "Node looks OK to me"
252
+ )
253
+ end
254
+ end
255
+
256
+ def tick
257
+ check_overview
258
+ check_node if opts[:node]
259
+ check_queues
260
+ end
98
261
  end
99
262
  Riemann::Tools::Rabbitmq.run
data/bin/riemann-riak CHANGED
@@ -27,8 +27,8 @@ class Riemann::Tools::Riak
27
27
 
28
28
  def initialize
29
29
  detect_features
30
-
31
- @httpstatus = true
30
+
31
+ @httpstatus = true
32
32
  # What's going on here? --aphyr
33
33
  if
34
34
  begin
@@ -59,7 +59,7 @@ class Riemann::Tools::Riak
59
59
  def detect_features
60
60
  @escript = true # Whether escript is present on this machine
61
61
  @riakadmin = true # Whether riak-admin is present
62
-
62
+
63
63
  if `which escript` =~ /^\s*$/
64
64
  @escript = false
65
65
  end
@@ -113,6 +113,34 @@ class Riemann::Tools::Riak
113
113
  end
114
114
  end
115
115
 
116
+ def check_transfers
117
+ str = if @riakadmin
118
+ `riak-admin transfers`
119
+ else
120
+ nil
121
+ end
122
+
123
+ return if str.nil?
124
+
125
+ if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/
126
+ report(
127
+ :host => opts[:riak_host],
128
+ :service => 'riak transfers',
129
+ :state => 'critical',
130
+ :metric => $1.to_i,
131
+ :description => "waiting to handoff #{$1} partitions"
132
+ )
133
+ else
134
+ report(
135
+ :host => opts[:riak_host],
136
+ :service => 'riak transfers',
137
+ :state => 'ok',
138
+ :metric => 0,
139
+ :description => "No pending transfers"
140
+ )
141
+ end
142
+ end
143
+
116
144
  def check_disk
117
145
  gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
118
146
  report(
@@ -125,8 +153,8 @@ class Riemann::Tools::Riak
125
153
  end
126
154
 
127
155
  # Returns the riak stat for the given fsm type and percentile.
128
- def fsm_stat(type, percentile)
129
- "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}"
156
+ def fsm_stat(type, property, percentile)
157
+ "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
130
158
  end
131
159
 
132
160
  # Returns the alerts state for the given fsm.
@@ -209,11 +237,14 @@ class Riemann::Tools::Riak
209
237
  'vnode_puts',
210
238
  'node_gets',
211
239
  'node_puts',
240
+ 'node_gets_set',
241
+ 'node_puts_set',
212
242
  'read_repairs']
213
243
  end
214
244
 
215
245
  def fsm_types
216
- ['get', 'put']
246
+ [{'get' => 'time'}, {'put' => 'time'},
247
+ {'get' => 'set_objsize'}]
217
248
  end
218
249
 
219
250
  def fsm_percentiles
@@ -233,9 +264,11 @@ class Riemann::Tools::Riak
233
264
  core_services.each do |s|
234
265
  report(event.merge(:service => "riak #{s}"))
235
266
  end
236
- fsm_types.each do |type|
237
- fsm_percentiles.each do |percentile|
238
- report(event.merge(:service => "riak #{type} #{percentile}"))
267
+ fsm_types.each do |typespec|
268
+ typespec.each do |type, prop|
269
+ fsm_percentiles.each do |percentile|
270
+ report(event.merge(:service => "riak #{type} #{prop} #{percentile}"))
271
+ end
239
272
  end
240
273
  end
241
274
  return
@@ -260,19 +293,25 @@ class Riemann::Tools::Riak
260
293
  end
261
294
 
262
295
  # FSMs
263
- fsm_types.each do |type|
264
- fsm_percentiles.each do |percentile|
265
- val = stats[fsm_stat(type, percentile)].to_i || 0
266
- val = 0 if val == 'undefined'
267
- val /= 1000.0 # Convert us to ms
268
- state = fsm_state(type, percentile, val)
269
- report(
270
- :host => opts[:riak_host],
271
- :service => "riak #{type} #{percentile}",
272
- :state => state,
273
- :metric => val,
274
- :description => "#{val} ms"
275
- )
296
+ fsm_types.each do |typespec|
297
+ typespec.each do |type, prop|
298
+ fsm_percentiles.each do |percentile|
299
+ val = stats[fsm_stat(type, prop, percentile)].to_i || 0
300
+ val = 0 if val == 'undefined'
301
+ val /= 1000.0 if prop == 'time' # Convert us to ms
302
+ if prop == 'time'
303
+ state = fsm_state(type, percentile, val)
304
+ else
305
+ state = "ok"
306
+ end
307
+ report(
308
+ :host => opts[:riak_host],
309
+ :service => "riak #{type} #{prop} #{percentile}",
310
+ :state => state,
311
+ :metric => val,
312
+ :description => "#{val} ms"
313
+ )
314
+ end
276
315
  end
277
316
  end
278
317
  end
@@ -283,6 +322,7 @@ class Riemann::Tools::Riak
283
322
  check_stats
284
323
  check_ring
285
324
  check_disk
325
+ check_transfers
286
326
  end
287
327
  end
288
328
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: riemann-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Kingsbury
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-30 00:00:00.000000000 Z
11
+ date: 2015-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: riemann-client