riemann-tools 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.markdown +63 -5
- data/bin/riemann-consul +106 -0
- data/bin/riemann-haproxy +7 -1
- data/bin/riemann-health +4 -1
- data/bin/riemann-net +1 -1
- data/lib/riemann/tools.rb +2 -2
- metadata +33 -125
- data/bin/riemann-aws-billing +0 -79
- data/bin/riemann-aws-rds-status +0 -48
- data/bin/riemann-aws-status +0 -64
- data/bin/riemann-elasticsearch +0 -91
- data/bin/riemann-elb-metrics +0 -154
- data/bin/riemann-munin +0 -36
- data/bin/riemann-rabbitmq +0 -267
- data/bin/riemann-resmon +0 -103
- data/bin/riemann-riak +0 -329
- data/bin/riemann-riak-keys +0 -12
- data/bin/riemann-riak-ring +0 -8
data/bin/riemann-rabbitmq
DELETED
@@ -1,267 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
4
|
-
|
5
|
-
class Riemann::Tools::Rabbitmq
|
6
|
-
include Riemann::Tools
|
7
|
-
|
8
|
-
require 'faraday'
|
9
|
-
require 'json'
|
10
|
-
require 'uri'
|
11
|
-
|
12
|
-
|
13
|
-
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
|
14
|
-
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
|
15
|
-
|
16
|
-
opt :monitor_user, 'RabbitMQ monitoring user', type: :string
|
17
|
-
opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
|
18
|
-
opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
|
19
|
-
opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
|
20
|
-
opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false
|
21
|
-
|
22
|
-
opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
|
23
|
-
opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
|
24
|
-
|
25
|
-
opt :node, "Specify a node to monitor", type: :strings
|
26
|
-
|
27
|
-
def base_url
|
28
|
-
protocol = "http"
|
29
|
-
if (options[:monitor_use_tls]) && (options[:monitor_use_tls]==true)
|
30
|
-
protocol = "https"
|
31
|
-
end
|
32
|
-
"#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
|
33
|
-
end
|
34
|
-
|
35
|
-
def overview_url
|
36
|
-
"#{base_url}/overview"
|
37
|
-
end
|
38
|
-
|
39
|
-
def node_url(n)
|
40
|
-
"#{base_url}/nodes/#{n}"
|
41
|
-
end
|
42
|
-
|
43
|
-
def queues_url
|
44
|
-
"#{base_url}/queues"
|
45
|
-
end
|
46
|
-
|
47
|
-
def event_host
|
48
|
-
if options[:event_host]
|
49
|
-
return options[:event_host]
|
50
|
-
else
|
51
|
-
return options[:monitor_host]
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def safe_get(uri, event_host)
|
56
|
-
# Handle connection timeouts
|
57
|
-
response = nil
|
58
|
-
begin
|
59
|
-
connection = Faraday.new(uri)
|
60
|
-
response = connection.get do |req|
|
61
|
-
req.options[:timeout] = options[:read_timeout]
|
62
|
-
req.options[:open_timeout] = options[:open_timeout]
|
63
|
-
end
|
64
|
-
report(:host => event_host,
|
65
|
-
:service => "rabbitmq monitoring",
|
66
|
-
:state => 'ok',
|
67
|
-
:description => "Monitoring operational"
|
68
|
-
)
|
69
|
-
rescue => e
|
70
|
-
report(:host => event_host,
|
71
|
-
:service => "rabbitmq monitoring",
|
72
|
-
:state => "critical",
|
73
|
-
:description => "HTTP connection error: #{e.class} - #{e.message}"
|
74
|
-
)
|
75
|
-
end
|
76
|
-
response
|
77
|
-
end
|
78
|
-
|
79
|
-
def check_queues
|
80
|
-
response = safe_get(queues_url, event_host)
|
81
|
-
max_size_check_filter = if options[:ignore_max_size_queues]
|
82
|
-
Regexp.new(options[:ignore_max_size_queues])
|
83
|
-
else
|
84
|
-
nil
|
85
|
-
end
|
86
|
-
|
87
|
-
return if response.nil?
|
88
|
-
|
89
|
-
json = JSON.parse(response.body)
|
90
|
-
|
91
|
-
if response.status != 200
|
92
|
-
report(:host => event_host,
|
93
|
-
:service => "rabbitmq.queue",
|
94
|
-
:state => "critical",
|
95
|
-
:description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}"
|
96
|
-
)
|
97
|
-
else
|
98
|
-
report(:host => event_host,
|
99
|
-
:service => "rabbitmq.queue",
|
100
|
-
:state => "ok",
|
101
|
-
:description => "HTTP connection ok"
|
102
|
-
)
|
103
|
-
|
104
|
-
json = JSON.parse(response.body)
|
105
|
-
|
106
|
-
json.each do |queue|
|
107
|
-
svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
|
108
|
-
errs = []
|
109
|
-
|
110
|
-
if queue['messages_ready']!=nil and queue['messages_ready'] > 0 and queue['consumers'] == 0
|
111
|
-
errs << "Queue has jobs but no consumers"
|
112
|
-
end
|
113
|
-
|
114
|
-
if (max_size_check_filter.nil? or queue['name'] !~ max_size_check_filter) and queue['messages_ready']!=nil and queue['messages_ready'] > options[:max_queue_size]
|
115
|
-
errs << "Queue has #{queue['messages_ready']} jobs"
|
116
|
-
end
|
117
|
-
|
118
|
-
if errs.empty?
|
119
|
-
report(:host => event_host,
|
120
|
-
:service => svc,
|
121
|
-
:state => "ok",
|
122
|
-
:description => "Queue is looking good"
|
123
|
-
)
|
124
|
-
else
|
125
|
-
report(:host => event_host,
|
126
|
-
:service => svc,
|
127
|
-
:state => "critical",
|
128
|
-
:description => errs.join("; ")
|
129
|
-
)
|
130
|
-
end
|
131
|
-
|
132
|
-
stats = (queue['message_stats'] || {}).merge(
|
133
|
-
'messages' => queue['messages'],
|
134
|
-
'messages_details' => queue['messages_details'],
|
135
|
-
'messages_ready' => queue['messages_ready'],
|
136
|
-
'messages_ready_details' => queue['messages_ready_details'],
|
137
|
-
'messages_unacknowledged' => queue['messages_unacknowledged'],
|
138
|
-
'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
|
139
|
-
'consumers' => queue['consumers'],
|
140
|
-
'memory' => queue['memory'],
|
141
|
-
)
|
142
|
-
|
143
|
-
stats.each_pair do |k,v|
|
144
|
-
service = "#{svc}.#{k}"
|
145
|
-
if k =~ /details$/ and v!=nil
|
146
|
-
metric = v['rate']
|
147
|
-
else
|
148
|
-
metric = v
|
149
|
-
end
|
150
|
-
|
151
|
-
# TODO: Set state via thresholds which can be configured
|
152
|
-
|
153
|
-
report(:host => event_host,
|
154
|
-
:service => service,
|
155
|
-
:metric => metric,
|
156
|
-
:description => "RabbitMQ monitor"
|
157
|
-
)
|
158
|
-
end
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
def check_overview
|
164
|
-
uri = URI(overview_url)
|
165
|
-
response = safe_get(uri, event_host)
|
166
|
-
|
167
|
-
return if response.nil?
|
168
|
-
|
169
|
-
json = JSON.parse(response.body)
|
170
|
-
|
171
|
-
if response.status != 200
|
172
|
-
report(:host => event_host,
|
173
|
-
:service => "rabbitmq",
|
174
|
-
:state => "critical",
|
175
|
-
:description => "HTTP connection error: #{response.status} - #{response.body}"
|
176
|
-
)
|
177
|
-
else
|
178
|
-
report(:host => event_host,
|
179
|
-
:service => "rabbitmq monitoring",
|
180
|
-
:state => "ok",
|
181
|
-
:description => "HTTP connection ok"
|
182
|
-
)
|
183
|
-
|
184
|
-
%w( message_stats queue_totals object_totals ).each do |stat|
|
185
|
-
# NOTE / BUG ?
|
186
|
-
# Brand new servers can have blank message stats. Is this ok?
|
187
|
-
# I can't decide.
|
188
|
-
next if json[stat].empty?
|
189
|
-
json[stat].each_pair do |k,v|
|
190
|
-
service = "rabbitmq.#{stat}.#{k}"
|
191
|
-
if k =~ /details$/
|
192
|
-
metric = v['rate']
|
193
|
-
else
|
194
|
-
metric = v
|
195
|
-
end
|
196
|
-
|
197
|
-
# TODO: Set state via thresholds which can be configured
|
198
|
-
|
199
|
-
report(:host => event_host,
|
200
|
-
:service => service,
|
201
|
-
:metric => metric,
|
202
|
-
:description => "RabbitMQ monitor"
|
203
|
-
)
|
204
|
-
end
|
205
|
-
end
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
def check_node
|
210
|
-
opts[:node].each do |n|
|
211
|
-
uri = URI(node_url(n))
|
212
|
-
response = safe_get(uri, event_host)
|
213
|
-
|
214
|
-
return if response.nil?
|
215
|
-
|
216
|
-
if response.status != 200
|
217
|
-
if response.status == 404
|
218
|
-
report(:host => event_host,
|
219
|
-
:service => "rabbitmq.node.#{n}",
|
220
|
-
:state => "critical",
|
221
|
-
:description => "Node was not found in the cluster"
|
222
|
-
)
|
223
|
-
else
|
224
|
-
report(:host => event_host,
|
225
|
-
:service => "rabbitmq.node.#{n}",
|
226
|
-
:state => "critical",
|
227
|
-
:description => "HTTP error: #{response.status} - #{response.body}"
|
228
|
-
)
|
229
|
-
end
|
230
|
-
return
|
231
|
-
end
|
232
|
-
|
233
|
-
json = JSON.parse(response.body)
|
234
|
-
|
235
|
-
if json['mem_alarm']
|
236
|
-
report(:host => event_host,
|
237
|
-
:service => "rabbitmq.node.#{n}",
|
238
|
-
:state => "critical",
|
239
|
-
:description => "Memory alarm has triggered; job submission throttled"
|
240
|
-
)
|
241
|
-
return
|
242
|
-
end
|
243
|
-
|
244
|
-
if json['disk_free_alarm']
|
245
|
-
report(:host => event_host,
|
246
|
-
:service => "rabbitmq.node.#{n}",
|
247
|
-
:state => "critical",
|
248
|
-
:description => "Disk free alarm has triggered; job submission throttled"
|
249
|
-
)
|
250
|
-
return
|
251
|
-
end
|
252
|
-
|
253
|
-
report(:host => event_host,
|
254
|
-
:service => "rabbitmq.node.#{n}",
|
255
|
-
:state => "ok",
|
256
|
-
:description => "Node looks OK to me"
|
257
|
-
)
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
def tick
|
262
|
-
check_overview
|
263
|
-
check_node if opts[:node]
|
264
|
-
check_queues
|
265
|
-
end
|
266
|
-
end
|
267
|
-
Riemann::Tools::Rabbitmq.run
|
data/bin/riemann-resmon
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
4
|
-
|
5
|
-
class Riemann::Tools::Resmon
|
6
|
-
include Riemann::Tools
|
7
|
-
require 'nokogiri'
|
8
|
-
require 'faraday'
|
9
|
-
|
10
|
-
opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
|
11
|
-
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
|
12
|
-
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
|
13
|
-
opt :fqdn, 'Use FQDN for event host'
|
14
|
-
|
15
|
-
|
16
|
-
def initialize
|
17
|
-
@hosts = File.read(options[:resmon_hostfile]).split("\n")
|
18
|
-
super
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
# Work out the hostname to submit with the event
|
23
|
-
def get_event_host(host)
|
24
|
-
unless options[:fqdn]
|
25
|
-
return host.split('.')[0]
|
26
|
-
end
|
27
|
-
return host
|
28
|
-
end
|
29
|
-
|
30
|
-
# Handles HTTP connections and GET requests safely
|
31
|
-
def safe_get(uri, event_host)
|
32
|
-
# Handle connection timeouts
|
33
|
-
response = nil
|
34
|
-
begin
|
35
|
-
connection = Faraday.new(uri)
|
36
|
-
response = connection.get do |req|
|
37
|
-
req.options[:timeout] = options[:read_timeout]
|
38
|
-
req.options[:open_timeout] = options[:open_timeout]
|
39
|
-
end
|
40
|
-
rescue => e
|
41
|
-
report(:host => event_host,
|
42
|
-
:service => "resmon",
|
43
|
-
:state => "critical",
|
44
|
-
:description => "HTTP connection error: #{e.class} - #{e.message}"
|
45
|
-
)
|
46
|
-
end
|
47
|
-
response
|
48
|
-
end
|
49
|
-
|
50
|
-
def tick
|
51
|
-
@hosts.each do |host|
|
52
|
-
|
53
|
-
uri = URI(host)
|
54
|
-
event_host = get_event_host(uri.host)
|
55
|
-
|
56
|
-
response = safe_get(uri, event_host)
|
57
|
-
next if response.nil?
|
58
|
-
|
59
|
-
# Handle non-200 responses
|
60
|
-
if response.status != 200
|
61
|
-
report(:host => event_host,
|
62
|
-
:service => "resmon",
|
63
|
-
:state => "critical",
|
64
|
-
:description => "HTTP connection error: #{response.status} - #{response.body}"
|
65
|
-
)
|
66
|
-
next
|
67
|
-
else
|
68
|
-
report(:host => event_host,
|
69
|
-
:service => "resmon",
|
70
|
-
:state => "ok",
|
71
|
-
:description => "Resmon connection ok"
|
72
|
-
)
|
73
|
-
doc = Nokogiri::XML(response.body)
|
74
|
-
end
|
75
|
-
|
76
|
-
doc.xpath('//ResmonResults/ResmonResult').each do |result|
|
77
|
-
timestamp = result.xpath('last_update').first.text
|
78
|
-
result.xpath('metric').each do |metric|
|
79
|
-
hash = {
|
80
|
-
host: event_host,
|
81
|
-
service: "#{result.attributes['module'].value}`#{result.attributes['service'].value}`#{metric.attributes['name'].value}",
|
82
|
-
time: timestamp.to_i
|
83
|
-
}
|
84
|
-
|
85
|
-
case metric.attributes['type'].value
|
86
|
-
when /[iIlL]/
|
87
|
-
hash[:metric] = metric.text.to_i
|
88
|
-
when 'n'
|
89
|
-
hash[:metric] = metric.text.to_f
|
90
|
-
when 's'
|
91
|
-
hash[:description] = metric.text
|
92
|
-
when '0'
|
93
|
-
raise 'dunno what 0 is yet'
|
94
|
-
end
|
95
|
-
|
96
|
-
report(hash)
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
Riemann::Tools::Resmon.run
|
data/bin/riemann-riak
DELETED
@@ -1,329 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Forwards information on a Riak node to Riemann.
|
4
|
-
|
5
|
-
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
6
|
-
|
7
|
-
require 'net/http'
|
8
|
-
require 'net/https'
|
9
|
-
require 'yajl/json_gem'
|
10
|
-
|
11
|
-
class Riemann::Tools::Riak
|
12
|
-
include Riemann::Tools
|
13
|
-
|
14
|
-
opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
|
15
|
-
opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
|
16
|
-
opt :stats_port, "Riak HTTP port for stats", :default => 8098
|
17
|
-
opt :stats_path, "Riak HTTP stats path", :default => '/stats'
|
18
|
-
opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
|
19
|
-
opt :cookie, "Riak cookie to use", :default => "riak"
|
20
|
-
|
21
|
-
opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
|
22
|
-
opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
|
23
|
-
opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
|
24
|
-
opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
|
25
|
-
opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
|
26
|
-
opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
|
27
|
-
|
28
|
-
def initialize
|
29
|
-
detect_features
|
30
|
-
|
31
|
-
@httpstatus = true
|
32
|
-
# What's going on here? --aphyr
|
33
|
-
if
|
34
|
-
begin
|
35
|
-
uri = URI.parse(opts[:riak_host])
|
36
|
-
if uri.host == nil
|
37
|
-
uri.host = opts[:riak_host]
|
38
|
-
end
|
39
|
-
http = Net::HTTP.new(uri.host, opts[:stats_port])
|
40
|
-
http.use_ssl = uri.scheme == 'https'
|
41
|
-
if http.use_ssl?
|
42
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
43
|
-
end
|
44
|
-
http.start do |http|
|
45
|
-
http.get opts[:stats_path]
|
46
|
-
end
|
47
|
-
rescue => e
|
48
|
-
@httpstatus = false
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# we're going to override the emulator setting to allow users to
|
53
|
-
# dynamically input the cookie
|
54
|
-
# this is done only once - hopefully it doesn't get overridden.
|
55
|
-
ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
|
56
|
-
end
|
57
|
-
|
58
|
-
# Identifies whether escript and riak-admin are installed
|
59
|
-
def detect_features
|
60
|
-
@escript = true # Whether escript is present on this machine
|
61
|
-
@riakadmin = true # Whether riak-admin is present
|
62
|
-
|
63
|
-
if `which escript` =~ /^\s*$/
|
64
|
-
@escript = false
|
65
|
-
end
|
66
|
-
|
67
|
-
if `which riak-admin` =~ /^\s*$/
|
68
|
-
@riakadmin = false
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def check_ring
|
73
|
-
if @escript
|
74
|
-
str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
|
75
|
-
elsif @riakadmin
|
76
|
-
str = `riak-admin ringready`
|
77
|
-
end
|
78
|
-
|
79
|
-
if str =~ /^TRUE/
|
80
|
-
report(
|
81
|
-
:host => opts[:riak_host],
|
82
|
-
:service => 'riak ring',
|
83
|
-
:state => 'ok',
|
84
|
-
:description => str
|
85
|
-
)
|
86
|
-
else
|
87
|
-
report(
|
88
|
-
:host => opts[:riak_host],
|
89
|
-
:service => 'riak ring',
|
90
|
-
:state => 'warning',
|
91
|
-
:description => str
|
92
|
-
)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
def check_keys
|
97
|
-
keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp
|
98
|
-
if keys =~ /^\d+$/
|
99
|
-
report(
|
100
|
-
:host => opts[:riak_host],
|
101
|
-
:service => 'riak keys',
|
102
|
-
:state => 'ok',
|
103
|
-
:metric => keys.to_i,
|
104
|
-
:description => keys
|
105
|
-
)
|
106
|
-
else
|
107
|
-
report(
|
108
|
-
:host => opts[:riak_host],
|
109
|
-
:service => 'riak keys',
|
110
|
-
:state => 'unknown',
|
111
|
-
:description => keys
|
112
|
-
)
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def check_transfers
|
117
|
-
str = if @riakadmin
|
118
|
-
`riak-admin transfers`
|
119
|
-
else
|
120
|
-
nil
|
121
|
-
end
|
122
|
-
|
123
|
-
return if str.nil?
|
124
|
-
|
125
|
-
if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/
|
126
|
-
report(
|
127
|
-
:host => opts[:riak_host],
|
128
|
-
:service => 'riak transfers',
|
129
|
-
:state => 'critical',
|
130
|
-
:metric => $1.to_i,
|
131
|
-
:description => "waiting to handoff #{$1} partitions"
|
132
|
-
)
|
133
|
-
else
|
134
|
-
report(
|
135
|
-
:host => opts[:riak_host],
|
136
|
-
:service => 'riak transfers',
|
137
|
-
:state => 'ok',
|
138
|
-
:metric => 0,
|
139
|
-
:description => "No pending transfers"
|
140
|
-
)
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
def check_disk
|
145
|
-
gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2)
|
146
|
-
report(
|
147
|
-
:host => opts[:riak_host],
|
148
|
-
:service => 'riak disk',
|
149
|
-
:state => 'ok',
|
150
|
-
:metric => gb,
|
151
|
-
:description => "#{gb} GB in #{opts[:data_dir]}"
|
152
|
-
)
|
153
|
-
end
|
154
|
-
|
155
|
-
# Returns the riak stat for the given fsm type and percentile.
|
156
|
-
def fsm_stat(type, property, percentile)
|
157
|
-
"node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}"
|
158
|
-
end
|
159
|
-
|
160
|
-
# Returns the alerts state for the given fsm.
|
161
|
-
def fsm_state(type, percentile, val)
|
162
|
-
limit = opts["#{type}_#{percentile}_warning".to_sym]
|
163
|
-
case val
|
164
|
-
when 0 .. limit
|
165
|
-
'ok'
|
166
|
-
when limit .. limit * 2
|
167
|
-
'warning'
|
168
|
-
else
|
169
|
-
'critical'
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
# Get current stats via HTTP
|
174
|
-
def stats_http
|
175
|
-
begin
|
176
|
-
uri = URI.parse(opts[:riak_host])
|
177
|
-
if uri.host == nil
|
178
|
-
uri.host = opts[:riak_host]
|
179
|
-
end
|
180
|
-
http = Net::HTTP.new(uri.host, opts[:stats_port])
|
181
|
-
http.use_ssl = uri.scheme == 'https'
|
182
|
-
if http.use_ssl?
|
183
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
184
|
-
end
|
185
|
-
res = http.start do |http|
|
186
|
-
http.get opts[:stats_path]
|
187
|
-
end
|
188
|
-
rescue => e
|
189
|
-
report(
|
190
|
-
:host => opts[:riak_host],
|
191
|
-
:service => 'riak',
|
192
|
-
:state => 'critical',
|
193
|
-
:description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
|
194
|
-
)
|
195
|
-
raise
|
196
|
-
end
|
197
|
-
|
198
|
-
if res.code.to_i == 200
|
199
|
-
return JSON.parse(res.body)
|
200
|
-
else
|
201
|
-
report(
|
202
|
-
:host => opts[:riak_host],
|
203
|
-
:service => 'riak',
|
204
|
-
:state => 'critical',
|
205
|
-
:description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
|
206
|
-
)
|
207
|
-
raise "Can't fetch stats via HTTP: #{res.core}:\n\n#{res.body}"
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
# Get current stats via riak-admin
|
212
|
-
def stats_riak_admin
|
213
|
-
str = `riak-admin status`
|
214
|
-
raise "riak-admin failed" unless $? == 0
|
215
|
-
Hash[str.split(/\n/).map{|i| i.split(/ : /)}]
|
216
|
-
end
|
217
|
-
|
218
|
-
# Get current stats as a hash
|
219
|
-
def stats
|
220
|
-
if @httpstatus
|
221
|
-
stats_http
|
222
|
-
elsif @riakadmin
|
223
|
-
stats_riak_admin
|
224
|
-
else
|
225
|
-
report(
|
226
|
-
:host => opts[:riak_host],
|
227
|
-
:service => 'riak',
|
228
|
-
:state => 'critical',
|
229
|
-
:description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
|
230
|
-
)
|
231
|
-
raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
235
|
-
def core_services
|
236
|
-
['vnode_gets',
|
237
|
-
'vnode_puts',
|
238
|
-
'node_gets',
|
239
|
-
'node_puts',
|
240
|
-
'node_gets_set',
|
241
|
-
'node_puts_set',
|
242
|
-
'read_repairs']
|
243
|
-
end
|
244
|
-
|
245
|
-
def fsm_types
|
246
|
-
[{'get' => 'time'}, {'put' => 'time'},
|
247
|
-
{'get' => 'set_objsize'}]
|
248
|
-
end
|
249
|
-
|
250
|
-
def fsm_percentiles
|
251
|
-
[50, 95, 99]
|
252
|
-
end
|
253
|
-
|
254
|
-
# Reports current stats to Riemann
|
255
|
-
def check_stats
|
256
|
-
begin
|
257
|
-
stats = self.stats
|
258
|
-
rescue => e
|
259
|
-
event = {:state => 'critical',
|
260
|
-
:description => e.message,
|
261
|
-
:host => opts[:riak_host]}
|
262
|
-
# Report errors
|
263
|
-
report(event.merge(:service => 'riak'))
|
264
|
-
core_services.each do |s|
|
265
|
-
report(event.merge(:service => "riak #{s}"))
|
266
|
-
end
|
267
|
-
fsm_types.each do |typespec|
|
268
|
-
typespec.each do |type, prop|
|
269
|
-
fsm_percentiles.each do |percentile|
|
270
|
-
report(event.merge(:service => "riak #{type} #{prop} #{percentile}"))
|
271
|
-
end
|
272
|
-
end
|
273
|
-
end
|
274
|
-
return
|
275
|
-
end
|
276
|
-
|
277
|
-
# Riak itself
|
278
|
-
report(
|
279
|
-
:host => opts[:riak_host],
|
280
|
-
:service => 'riak',
|
281
|
-
:state => 'ok'
|
282
|
-
)
|
283
|
-
|
284
|
-
# Gets/puts/rr
|
285
|
-
core_services.each do |s|
|
286
|
-
report(
|
287
|
-
:host => opts[:riak_host],
|
288
|
-
:service => "riak #{s}",
|
289
|
-
:state => 'ok',
|
290
|
-
:metric => stats[s].to_i/60.0,
|
291
|
-
:description => "#{stats[s].to_i/60.0}/sec"
|
292
|
-
)
|
293
|
-
end
|
294
|
-
|
295
|
-
# FSMs
|
296
|
-
fsm_types.each do |typespec|
|
297
|
-
typespec.each do |type, prop|
|
298
|
-
fsm_percentiles.each do |percentile|
|
299
|
-
val = stats[fsm_stat(type, prop, percentile)].to_i || 0
|
300
|
-
val = 0 if val == 'undefined'
|
301
|
-
val /= 1000.0 if prop == 'time' # Convert us to ms
|
302
|
-
if prop == 'time'
|
303
|
-
state = fsm_state(type, percentile, val)
|
304
|
-
else
|
305
|
-
state = "ok"
|
306
|
-
end
|
307
|
-
report(
|
308
|
-
:host => opts[:riak_host],
|
309
|
-
:service => "riak #{type} #{prop} #{percentile}",
|
310
|
-
:state => state,
|
311
|
-
:metric => val,
|
312
|
-
:description => "#{val} ms"
|
313
|
-
)
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
|
-
def tick
|
320
|
-
# This can utterly destroy a cluster, so we disable
|
321
|
-
# check_keys
|
322
|
-
check_stats
|
323
|
-
check_ring
|
324
|
-
check_disk
|
325
|
-
check_transfers
|
326
|
-
end
|
327
|
-
end
|
328
|
-
|
329
|
-
Riemann::Tools::Riak.run
|