riemann-tools 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/riemann-rabbitmq DELETED
@@ -1,267 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.expand_path('../../lib/riemann/tools', __FILE__)
4
-
5
- class Riemann::Tools::Rabbitmq
6
- include Riemann::Tools
7
-
8
- require 'faraday'
9
- require 'json'
10
- require 'uri'
11
-
12
-
13
- opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
14
- opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
15
-
16
- opt :monitor_user, 'RabbitMQ monitoring user', type: :string
17
- opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
18
- opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
19
- opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
20
- opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false
21
-
22
- opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
23
- opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
24
-
25
- opt :node, "Specify a node to monitor", type: :strings
26
-
27
# Builds the root URL of the RabbitMQ management API from the monitor_*
# options.
#
# The user name and password are percent-encoded so that reserved characters
# in them (for example "@", ":" or "/") cannot corrupt the authority part of
# the URL. Credentials made only of unreserved characters are emitted
# unchanged.
#
# @return [String] "<scheme>://<user>:<pass>@<host>:<port>/api"
def base_url
  scheme = options[:monitor_use_tls] == true ? 'https' : 'http'
  user = URI.encode_www_form_component(options[:monitor_user].to_s)
  pass = URI.encode_www_form_component(options[:monitor_pass].to_s)
  "#{scheme}://#{user}:#{pass}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
end
34
-
35
# URL of the management API's cluster-wide overview resource.
#
# @return [String]
def overview_url
  [base_url, 'overview'].join('/')
end
38
-
39
# URL of the management API resource describing a single node.
#
# @param n [String] node name, appended to the path as given
# @return [String]
def node_url(n)
  format('%s/nodes/%s', base_url, n)
end
42
-
43
# URL of the management API's queue listing.
#
# @return [String]
def queues_url
  [base_url, 'queues'].join('/')
end
46
-
47
# Host name attached to every event this tool emits: the :event_host option
# when it is set, otherwise the monitored RabbitMQ host.
def event_host
  options[:event_host] || options[:monitor_host]
end
54
-
55
# Performs a GET against +uri+ using the configured read/open timeouts.
#
# Emits a "rabbitmq monitoring" event: ok once the request has completed
# (whatever its HTTP status), critical when anything raised along the way.
#
# @param uri [String, URI] address handed to Faraday
# @param event_host [String] host name to put on the emitted event
# @return the Faraday response, or nil when the request raised
def safe_get(uri, event_host)
  reply = nil
  monitoring = { host: event_host, service: 'rabbitmq monitoring' }
  reply = Faraday.new(uri).get do |request|
    request.options[:timeout] = options[:read_timeout]
    request.options[:open_timeout] = options[:open_timeout]
  end
  report(monitoring.merge(state: 'ok', description: 'Monitoring operational'))
  reply
rescue => e
  report(monitoring.merge(state: 'critical',
                          description: "HTTP connection error: #{e.class} - #{e.message}"))
  reply
end
78
-
79
# Polls the management API's queue listing and reports per-queue health.
#
# For every queue this emits one state event, critical when
#   * messages are ready but the queue has no consumers, or
#   * the ready count exceeds options[:max_queue_size] and the queue name is
#     not matched by options[:ignore_max_size_queues],
# followed by one metric event per statistic. For keys ending in "details"
# the metric is the nested 'rate' value.
#
# Nothing beyond the connection event is reported when the request itself
# failed (safe_get returned nil) or when the API answered with a non-200.
def check_queues
  host = event_host
  response = safe_get(queues_url, host)
  return if response.nil?

  # Look at the status before touching the body: the body is only parsed as
  # JSON for a 200, so an error page no longer raises a parse error.
  if response.status != 200
    report(host: host,
           service: 'rabbitmq.queue',
           state: 'critical',
           description: "HTTP connection error to /api/queues: #{response.status} - #{response.body}")
    return
  end

  report(host: host,
         service: 'rabbitmq.queue',
         state: 'ok',
         description: 'HTTP connection ok')

  ignore_pattern = options[:ignore_max_size_queues]
  ignore_filter = ignore_pattern ? Regexp.new(ignore_pattern) : nil

  JSON.parse(response.body).each do |queue|
    svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
    ready = queue['messages_ready']
    size_checked = ignore_filter.nil? || queue['name'] !~ ignore_filter

    errs = []
    errs << 'Queue has jobs but no consumers' if ready && ready > 0 && queue['consumers'] == 0
    errs << "Queue has #{ready} jobs" if size_checked && ready && ready > options[:max_queue_size]

    if errs.empty?
      report(host: host, service: svc, state: 'ok', description: 'Queue is looking good')
    else
      report(host: host, service: svc, state: 'critical', description: errs.join('; '))
    end

    stats = (queue['message_stats'] || {}).merge(
      'messages' => queue['messages'],
      'messages_details' => queue['messages_details'],
      'messages_ready' => ready,
      'messages_ready_details' => queue['messages_ready_details'],
      'messages_unacknowledged' => queue['messages_unacknowledged'],
      'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
      'consumers' => queue['consumers'],
      'memory' => queue['memory']
    )

    stats.each_pair do |key, value|
      # "*_details" entries are hashes; their 'rate' member is the metric.
      metric = key =~ /details$/ && !value.nil? ? value['rate'] : value

      # TODO: Set state via thresholds which can be configured
      report(host: host,
             service: "#{svc}.#{key}",
             metric: metric,
             description: 'RabbitMQ monitor')
    end
  end
end
162
-
163
# Polls the management API's overview resource and reports one metric event
# for each entry of its message_stats, queue_totals and object_totals
# sections. For keys ending in "details" the metric is the nested 'rate'.
#
# Sections that are absent or empty are skipped. Nothing beyond the
# connection event is reported when the request failed (safe_get returned
# nil) or when the API answered with a non-200.
def check_overview
  host = event_host
  response = safe_get(URI(overview_url), host)
  return if response.nil?

  # Look at the status before touching the body: the body is only parsed as
  # JSON for a 200, so an error page no longer raises a parse error.
  if response.status != 200
    report(host: host,
           service: 'rabbitmq',
           state: 'critical',
           description: "HTTP connection error: #{response.status} - #{response.body}")
    return
  end

  report(host: host,
         service: 'rabbitmq monitoring',
         state: 'ok',
         description: 'HTTP connection ok')

  overview = JSON.parse(response.body)

  %w[message_stats queue_totals object_totals].each do |stat|
    section = overview[stat]
    # NOTE: brand new servers can have blank message stats; a missing
    # section is treated the same way as an empty one.
    next if section.nil? || section.empty?

    section.each_pair do |key, value|
      metric = key =~ /details$/ && !value.nil? ? value['rate'] : value

      # TODO: Set state via thresholds which can be configured
      report(host: host,
             service: "rabbitmq.#{stat}.#{key}",
             metric: metric,
             description: 'RabbitMQ monitor')
    end
  end
end
208
-
209
# Reports the health of every node listed in options[:node].
#
# Each node gets exactly one "rabbitmq.node.<name>" event:
#   * critical when the API does not know the node (404) or answers with any
#     other non-200 status,
#   * critical when the node reports a memory or disk-free alarm,
#   * ok otherwise.
#
# A problem with one node no longer prevents the remaining nodes from being
# checked. Only a failed request (safe_get returned nil) ends the loop early:
# every node is queried through the same base_url, so the remaining requests
# are skipped as before.
def check_node
  options[:node].each do |n|
    response = safe_get(URI(node_url(n)), event_host)
    break if response.nil?

    state, description =
      if response.status == 404
        ['critical', 'Node was not found in the cluster']
      elsif response.status != 200
        ['critical', "HTTP error: #{response.status} - #{response.body}"]
      else
        node = JSON.parse(response.body)
        if node['mem_alarm']
          ['critical', 'Memory alarm has triggered; job submission throttled']
        elsif node['disk_free_alarm']
          ['critical', 'Disk free alarm has triggered; job submission throttled']
        else
          ['ok', 'Node looks OK to me']
        end
      end

    report(host: event_host,
           service: "rabbitmq.node.#{n}",
           state: state,
           description: description)
  end
end
260
-
261
# One polling cycle: cluster overview first, then the per-node checks (only
# when nodes were given via the :node option), then the queues.
def tick
  check_overview
  opts[:node] && check_node
  check_queues
end
266
- end
267
- Riemann::Tools::Rabbitmq.run
data/bin/riemann-resmon DELETED
@@ -1,103 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.expand_path('../../lib/riemann/tools', __FILE__)
4
-
5
- class Riemann::Tools::Resmon
6
- include Riemann::Tools
7
- require 'nokogiri'
8
- require 'faraday'
9
-
10
- opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
11
- opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
12
- opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
13
- opt :fqdn, 'Use FQDN for event host'
14
-
15
-
16
# Loads the list of Resmon endpoints from the file named by the
# :resmon_hostfile option (one URI per line).
#
# Surrounding whitespace is stripped and blank lines are dropped, so a
# trailing newline or an empty line in the file no longer produces a bogus
# host entry.
#
# @raise [ArgumentError] when the :resmon_hostfile option was not given
def initialize
  hostfile = options[:resmon_hostfile]
  raise ArgumentError, 'resmon_hostfile option is required' if hostfile.nil?

  @hosts = File.readlines(hostfile).map(&:strip).reject(&:empty?)
  super
end
20
-
21
-
22
# Chooses the host name to submit with an event: the name as given when the
# :fqdn option is set, otherwise only the part before the first dot.
#
# @param host [String]
# @return [String]
def get_event_host(host)
  options[:fqdn] ? host : host.split('.').first
end
29
-
30
# Performs a GET against +uri+ using the configured read/open timeouts.
# Any exception raised by the request is turned into a critical "resmon"
# event instead of propagating.
#
# @param uri [String, URI] address handed to Faraday
# @param event_host [String] host name to put on the failure event
# @return the Faraday response, or nil when the request raised
def safe_get(uri, event_host)
  Faraday.new(uri).get do |request|
    request.options[:timeout] = options[:read_timeout]
    request.options[:open_timeout] = options[:open_timeout]
  end
rescue => e
  report(host: event_host,
         service: 'resmon',
         state: 'critical',
         description: "HTTP connection error: #{e.class} - #{e.message}")
  nil
end
49
-
50
# One polling cycle: fetches the Resmon XML document of every configured
# host and forwards each <metric> of each <ResmonResult> as an event.
#
# The event service is "<module>`<service>`<metric name>" and the event time
# is the result's last_update value. The metric's type attribute decides the
# payload: a type containing i, I, l or L becomes an integer metric, 'n' a
# float metric and 's' a description. Type '0' is not understood yet and
# raises.
def tick
  @hosts.each do |host|
    uri = URI(host)
    event_host = get_event_host(uri.host)

    response = safe_get(uri, event_host)
    next if response.nil?

    # Handle non-200 responses
    unless response.status == 200
      report(host: event_host,
             service: 'resmon',
             state: 'critical',
             description: "HTTP connection error: #{response.status} - #{response.body}")
      next
    end

    report(host: event_host,
           service: 'resmon',
           state: 'ok',
           description: 'Resmon connection ok')

    Nokogiri::XML(response.body).xpath('//ResmonResults/ResmonResult').each do |result|
      updated_at = result.xpath('last_update').first.text

      result.xpath('metric').each do |metric|
        event = {
          host: event_host,
          service: "#{result.attributes['module'].value}`#{result.attributes['service'].value}`#{metric.attributes['name'].value}",
          time: updated_at.to_i
        }

        case metric.attributes['type'].value
        when /[iIlL]/ then event[:metric] = metric.text.to_i
        when 'n' then event[:metric] = metric.text.to_f
        when 's' then event[:description] = metric.text
        when '0' then raise 'dunno what 0 is yet'
        end

        report(event)
      end
    end
  end
end
101
- end
102
-
103
- Riemann::Tools::Resmon.run
data/bin/riemann-riak DELETED
@@ -1,329 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # Forwards information on a Riak node to Riemann.
4
-
5
- require File.expand_path('../../lib/riemann/tools', __FILE__)
6
-
7
- require 'net/http'
8
- require 'net/https'
9
- require 'yajl/json_gem'
10
-
11
- class Riemann::Tools::Riak
12
- include Riemann::Tools
13
-
14
- opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
15
- opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
16
- opt :stats_port, "Riak HTTP port for stats", :default => 8098
17
- opt :stats_path, "Riak HTTP stats path", :default => '/stats'
18
- opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
19
- opt :cookie, "Riak cookie to use", :default => "riak"
20
-
21
- opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
22
- opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
23
- opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
24
- opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
25
- opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
26
- opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
27
-
28
# Prepares the checker: detects the locally available tools, probes the
# HTTP stats endpoint once, and exports the Erlang cookie.
#
# @httpstatus ends up false when the probe request raises for any reason;
# any response that comes back, whatever its status, leaves it true.
def initialize
  detect_features

  @httpstatus = true
  begin
    uri = URI.parse(opts[:riak_host])
    # A bare host name or IP has no host component after parsing; use it as is.
    uri.host = opts[:riak_host] if uri.host.nil?
    client = Net::HTTP.new(uri.host, opts[:stats_port])
    client.use_ssl = uri.scheme == 'https'
    client.verify_mode = OpenSSL::SSL::VERIFY_NONE if client.use_ssl?
    client.start { |conn| conn.get opts[:stats_path] }
  rescue
    @httpstatus = false
  end

  # we're going to override the emulator setting to allow users to
  # dynamically input the cookie
  # this is done only once - hopefully it doesn't get overridden.
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
end
57
-
58
# Records whether the escript and riak-admin executables can be found.
# A tool counts as present when `which` prints anything but whitespace.
def detect_features
  @escript = `which escript` !~ /^\s*$/     # Whether escript is present on this machine
  @riakadmin = `which riak-admin` !~ /^\s*$/ # Whether riak-admin is present
end
71
-
72
# Reports whether the ring is ready as the 'riak ring' service.
#
# The riemann-riak-ring helper located next to this script is used when
# escript is available, `riak-admin ringready` otherwise. The state is ok
# when a line of the output starts with TRUE and warning in every other
# case, including when neither tool is available (description is then nil).
def check_ring
  output =
    if @escript
      `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      `riak-admin ringready`
    end

  report(
    host: opts[:riak_host],
    service: 'riak ring',
    state: output =~ /^TRUE/ ? 'ok' : 'warning',
    description: output
  )
end
95
-
96
# Reports the key count printed by the riemann-riak-keys helper located next
# to this script as the 'riak keys' service: ok with the count as metric when
# the output is purely numeric, unknown with the raw output otherwise.
#
# NOTE: tick deliberately does not call this check.
def check_keys
  count = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp
  event = { host: opts[:riak_host], service: 'riak keys' }

  if count =~ /^\d+$/
    report(event.merge(state: 'ok', metric: count.to_i, description: count))
  else
    report(event.merge(state: 'unknown', description: count))
  end
end
115
-
116
# Reports pending partition handoffs for this node as 'riak transfers',
# based on the output of `riak-admin transfers`. Does nothing when
# riak-admin is not available.
#
# The node name is matched literally: it is escaped before being placed in
# the pattern, so the dots of a host name cannot match arbitrary characters
# of another node's name.
def check_transfers
  return unless @riakadmin

  output = `riak-admin transfers`
  waiting = /'#{Regexp.escape(opts[:node_name])}' waiting to handoff (\d+) partitions/.match(output)

  if waiting
    pending = waiting[1]
    report(
      host: opts[:riak_host],
      service: 'riak transfers',
      state: 'critical',
      metric: pending.to_i,
      description: "waiting to handoff #{pending} partitions"
    )
  else
    report(
      host: opts[:riak_host],
      service: 'riak transfers',
      state: 'ok',
      metric: 0,
      description: 'No pending transfers'
    )
  end
end
143
-
144
# Reports the size of the Riak data directory as the 'riak disk' metric.
#
# `du` is started without a shell and receives the directory as a single
# argument, so paths containing spaces or shell metacharacters are measured
# correctly. The first field of du's output is divided by 1024**2 to obtain
# the reported GB figure (assumes du prints 1 KiB blocks - TODO confirm for
# the target platform).
def check_disk
  du_output = IO.popen(['du', '-Ls', '--', opts[:data_dir].to_s], &:read)
  gb = du_output.split(/\s+/).first.to_i / (1024.0**2)
  report(
    host: opts[:riak_host],
    service: 'riak disk',
    state: 'ok',
    metric: gb,
    description: "#{gb} GB in #{opts[:data_dir]}"
  )
end
154
-
155
# Name of the Riak stat for the given FSM type, property and percentile.
# The 50th percentile is spelled "median" in the stat name.
#
# @return [String] e.g. "node_get_fsm_time_95"
def fsm_stat(type, property, percentile)
  suffix = percentile == 50 ? 'median' : percentile
  ['node', type, 'fsm', property, suffix].join('_')
end
159
-
160
# Alert state for an FSM timing, judged against the
# "<type>_<percentile>_warning" option: ok from 0 up to the limit, warning
# up to twice the limit, critical for anything else.
#
# @param val [Numeric] measured value, in the unit of the threshold option
# @return [String] 'ok', 'warning' or 'critical'
def fsm_state(type, percentile, val)
  limit = opts[:"#{type}_#{percentile}_warning"]
  return 'ok' if (0..limit).cover?(val)
  return 'warning' if (limit..limit * 2).cover?(val)

  'critical'
end
172
-
173
# Fetches the current stats over HTTP(S) from the configured stats endpoint.
#
# A critical 'riak' event is reported before any failure is raised.
#
# @return [Hash] the parsed JSON body of a 200 response
# @raise the original exception when the request itself fails
# @raise [RuntimeError] when the endpoint answers with a non-200 status
def stats_http
  begin
    uri = URI.parse(opts[:riak_host])
    # A bare host name or IP has no host component after parsing; use it as is.
    uri.host = opts[:riak_host] if uri.host.nil?
    client = Net::HTTP.new(uri.host, opts[:stats_port])
    client.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is switched off for HTTPS, so
    # the peer's identity is not checked. Kept as in the original.
    client.verify_mode = OpenSSL::SSL::VERIFY_NONE if client.use_ssl?
    res = client.start { |conn| conn.get opts[:stats_path] }
  rescue => e
    report(
      host: opts[:riak_host],
      service: 'riak',
      state: 'critical',
      description: "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
    )
    raise
  end

  return JSON.parse(res.body) if res.code.to_i == 200

  report(
    host: opts[:riak_host],
    service: 'riak',
    state: 'critical',
    description: "stats returned HTTP #{res.code}:\n\n#{res.body}"
  )
  raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
end
210
-
211
# Fetches the current stats by parsing the output of `riak-admin status`.
#
# Only lines of the form "name : value" contribute; other lines (headers,
# separators, blanks) are ignored instead of breaking the conversion to a
# Hash. A line is split at the first " : " only, so a value that itself
# contains " : " is kept whole.
#
# @return [Hash{String => String}] stat name to its raw textual value
# @raise [RuntimeError] when riak-admin exits unsuccessfully
def stats_riak_admin
  output = `riak-admin status`
  raise 'riak-admin failed' unless $?.success?

  output.each_line.with_object({}) do |line, parsed|
    name, value = line.chomp.split(/ : /, 2)
    parsed[name] = value unless value.nil?
  end
end
217
-
218
# Current stats as a hash, fetched over HTTP when the initial probe
# succeeded, else through riak-admin when it is installed.
#
# @raise [RuntimeError] (after reporting a critical 'riak' event) when
#   neither mechanism is available
def stats
  return stats_http if @httpstatus
  return stats_riak_admin if @riakadmin

  report(
    host: opts[:riak_host],
    service: 'riak',
    state: 'critical',
    description: 'No mechanism for fetching Riak stats: neither HTTP nor riak-admin available.'
  )
  raise 'No mechanism for fetching Riak stats: neither HTTP nor riak-admin available.'
end
234
-
235
# Names of the counter stats that check_stats reports as "riak <name>".
#
# @return [Array<String>]
def core_services
  %w[
    vnode_gets
    vnode_puts
    node_gets
    node_puts
    node_gets_set
    node_puts_set
    read_repairs
  ]
end
244
-
245
# FSM type/property combinations that check_stats reports, each as a
# single-pair hash of type => property.
#
# @return [Array<Hash{String => String}>]
def fsm_types
  [%w[get time], %w[put time], %w[get set_objsize]].map do |type, property|
    { type => property }
  end
end
249
-
250
# Percentiles reported for every FSM stat; 50 is the one fsm_stat spells
# "median".
#
# @return [Array<Integer>]
def fsm_percentiles
  [50, 95, 99]
end
253
-
254
# Reports current stats to Riemann.
#
# When the stats cannot be fetched, a critical event carrying the error
# message is emitted for 'riak' and for every service this check would
# normally report. Otherwise it emits:
#   * 'riak' ok,
#   * one "riak <counter>" event per core service, with the counter divided
#     by 60 as a per-second rate,
#   * one "riak <type> <prop> <percentile>" event per FSM stat. 'time' stats
#     are converted from microseconds to milliseconds and graded with
#     fsm_state; other stats are always ok and are described without a unit
#     suffix.
#
# Missing or non-numeric stat values (such as "undefined") count as 0.
def check_stats
  begin
    current = stats
  rescue => e
    failure = { state: 'critical', description: e.message, host: opts[:riak_host] }
    # Report errors
    report(failure.merge(service: 'riak'))
    core_services.each do |s|
      report(failure.merge(service: "riak #{s}"))
    end
    fsm_types.each do |typespec|
      typespec.each do |type, prop|
        fsm_percentiles.each do |percentile|
          report(failure.merge(service: "riak #{type} #{prop} #{percentile}"))
        end
      end
    end
    return
  end

  # Riak itself
  report(host: opts[:riak_host], service: 'riak', state: 'ok')

  # Gets/puts/rr
  core_services.each do |s|
    per_second = current[s].to_i / 60.0
    report(
      host: opts[:riak_host],
      service: "riak #{s}",
      state: 'ok',
      metric: per_second,
      description: "#{per_second}/sec"
    )
  end

  # FSMs
  fsm_types.each do |typespec|
    typespec.each do |type, prop|
      fsm_percentiles.each do |percentile|
        timed = prop == 'time'
        # to_i already maps nil and non-numeric strings to 0.
        val = current[fsm_stat(type, prop, percentile)].to_i
        val /= 1000.0 if timed # Convert us to ms
        report(
          host: opts[:riak_host],
          service: "riak #{type} #{prop} #{percentile}",
          state: timed ? fsm_state(type, percentile, val) : 'ok',
          metric: val,
          description: timed ? "#{val} ms" : val.to_s
        )
      end
    end
  end
end
318
-
319
# One polling cycle: stats, ring, disk, then transfers.
#
# This can utterly destroy a cluster, so we disable
# check_keys
def tick
  %i[check_stats check_ring check_disk check_transfers].each { |check| send(check) }
end
327
- end
328
-
329
- Riemann::Tools::Riak.run