riemann-tools 0.2.14 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.docker/Dockerfile +7 -0
- data/.docker/publish.sh +35 -0
- data/.github/workflows/ci.yml +29 -0
- data/.gitignore +6 -0
- data/.rspec +2 -0
- data/.travis.yml +31 -0
- data/CHANGELOG.md +393 -0
- data/Gemfile +6 -0
- data/ISSUE_TEMPLATE.md +15 -0
- data/README.markdown +14 -1
- data/Rakefile +21 -0
- data/bin/riemann-apache-status +1 -0
- data/bin/riemann-bench +1 -0
- data/bin/riemann-cloudant +1 -0
- data/bin/riemann-consul +1 -0
- data/bin/riemann-dir-files-count +1 -0
- data/bin/riemann-dir-space +1 -0
- data/bin/riemann-diskstats +1 -0
- data/bin/riemann-fd +1 -0
- data/bin/riemann-freeswitch +1 -0
- data/bin/riemann-haproxy +1 -0
- data/bin/riemann-health +19 -11
- data/bin/riemann-kvminstance +1 -0
- data/bin/riemann-memcached +1 -0
- data/bin/riemann-net +1 -0
- data/bin/riemann-nginx-status +1 -0
- data/bin/riemann-ntp +1 -0
- data/bin/riemann-portcheck +1 -0
- data/bin/riemann-proc +1 -0
- data/bin/riemann-varnish +1 -0
- data/bin/riemann-zookeeper +1 -0
- data/lib/riemann/tools/utils.rb +17 -0
- data/lib/riemann/tools/version.rb +7 -0
- data/lib/riemann/tools.rb +12 -2
- data/riemann-tools.gemspec +39 -0
- data/tools/riemann-aws/LICENSE +21 -0
- data/tools/riemann-aws/README.md +54 -0
- data/tools/riemann-aws/Rakefile.rb +35 -0
- data/tools/riemann-aws/bin/riemann-aws-billing +87 -0
- data/tools/riemann-aws/bin/riemann-aws-rds-status +54 -0
- data/tools/riemann-aws/bin/riemann-aws-sqs-status +44 -0
- data/tools/riemann-aws/bin/riemann-aws-status +71 -0
- data/tools/riemann-aws/bin/riemann-elb-metrics +167 -0
- data/tools/riemann-aws/bin/riemann-s3-list +82 -0
- data/tools/riemann-aws/bin/riemann-s3-status +99 -0
- data/tools/riemann-chronos/LICENSE +21 -0
- data/tools/riemann-chronos/README.md +10 -0
- data/tools/riemann-chronos/Rakefile.rb +35 -0
- data/tools/riemann-chronos/bin/riemann-chronos +144 -0
- data/tools/riemann-docker/LICENSE +21 -0
- data/tools/riemann-docker/README.md +10 -0
- data/tools/riemann-docker/Rakefile.rb +34 -0
- data/tools/riemann-docker/bin/riemann-docker +217 -0
- data/tools/riemann-elasticsearch/LICENSE +21 -0
- data/tools/riemann-elasticsearch/README.md +10 -0
- data/tools/riemann-elasticsearch/Rakefile.rb +35 -0
- data/tools/riemann-elasticsearch/bin/riemann-elasticsearch +166 -0
- data/tools/riemann-marathon/LICENSE +21 -0
- data/tools/riemann-marathon/README.md +10 -0
- data/tools/riemann-marathon/Rakefile.rb +35 -0
- data/tools/riemann-marathon/bin/riemann-marathon +147 -0
- data/tools/riemann-mesos/LICENSE +21 -0
- data/tools/riemann-mesos/README.md +10 -0
- data/tools/riemann-mesos/Rakefile.rb +35 -0
- data/tools/riemann-mesos/bin/riemann-mesos +131 -0
- data/tools/riemann-munin/LICENSE +21 -0
- data/tools/riemann-munin/README.md +10 -0
- data/tools/riemann-munin/Rakefile.rb +34 -0
- data/tools/riemann-munin/bin/riemann-munin +37 -0
- data/tools/riemann-rabbitmq/LICENSE +21 -0
- data/tools/riemann-rabbitmq/README.md +10 -0
- data/tools/riemann-rabbitmq/Rakefile.rb +35 -0
- data/tools/riemann-rabbitmq/bin/riemann-rabbitmq +269 -0
- data/tools/riemann-riak/LICENSE +21 -0
- data/tools/riemann-riak/README.md +10 -0
- data/tools/riemann-riak/Rakefile.rb +34 -0
- data/tools/riemann-riak/bin/riemann-riak +331 -0
- data/tools/riemann-riak/bin/riemann-riak-keys +13 -0
- data/tools/riemann-riak/bin/riemann-riak-ring +9 -0
- data/tools/riemann-riak/riak_status/key_count.erl +13 -0
- data/tools/riemann-riak/riak_status/riak_status.rb +152 -0
- data/tools/riemann-riak/riak_status/ringready.erl +9 -0
- metadata +134 -34
@@ -0,0 +1,269 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Process.setproctitle($0)
|
3
|
+
|
4
|
+
require 'riemann/tools'
|
5
|
+
|
6
|
+
class Riemann::Tools::Rabbitmq
|
7
|
+
include Riemann::Tools
|
8
|
+
|
9
|
+
require 'faraday'
|
10
|
+
require 'json'
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
|
14
|
+
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
|
15
|
+
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
|
16
|
+
|
17
|
+
opt :monitor_user, 'RabbitMQ monitoring user', type: :string
|
18
|
+
opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
|
19
|
+
opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672
|
20
|
+
opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost"
|
21
|
+
opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false
|
22
|
+
|
23
|
+
opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000
|
24
|
+
opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string
|
25
|
+
|
26
|
+
opt :node, "Specify a node to monitor", type: :strings
|
27
|
+
|
28
|
+
# Builds the root URL of the RabbitMQ management API, with the monitoring
# credentials embedded as URL userinfo. HTTPS is chosen only when the
# :monitor_use_tls option is exactly true; otherwise plain HTTP is used.
def base_url
  scheme = options[:monitor_use_tls] == true ? "https" : "http"
  credentials = "#{options[:monitor_user]}:#{options[:monitor_pass]}"
  endpoint = "#{options[:monitor_host]}:#{options[:monitor_port]}"
  "#{scheme}://#{credentials}@#{endpoint}/api"
end
|
35
|
+
|
36
|
+
# URL of the cluster-wide overview endpoint.
def overview_url
  base_url + "/overview"
end
|
39
|
+
|
40
|
+
# URL of the endpoint describing the single node +n+.
def node_url(n)
  format("%s/nodes/%s", base_url, n)
end
|
43
|
+
|
44
|
+
# URL of the endpoint listing every queue.
def queues_url
  base_url + "/queues"
end
|
47
|
+
|
48
|
+
# Host name attached to every reported event: the :event_host option when
# it is set, otherwise the monitored RabbitMQ host.
def event_host
  options[:event_host] || options[:monitor_host]
end
|
55
|
+
|
56
|
+
# Performs an HTTP GET against +uri+ using the configured Faraday timeouts
# and records the outcome on the "rabbitmq monitoring" service.
#
# uri        - URL (String or URI) to fetch.
# event_host - host name to attach to the reported event.
#
# Returns the Faraday response, or nil when the request raised (timeouts,
# refused connections, ...); in that case a critical event carrying the
# exception class and message has been reported.
def safe_get(uri, event_host)
  monitoring = { :host => event_host, :service => "rabbitmq monitoring" }
  response = nil

  begin
    response = Faraday.new(uri).get do |req|
      req.options[:timeout] = options[:read_timeout]
      req.options[:open_timeout] = options[:open_timeout]
    end
    report(monitoring.merge(:state => 'ok',
                            :description => "Monitoring operational"))
  rescue StandardError => e
    report(monitoring.merge(:state => "critical",
                            :description => "HTTP connection error: #{e.class} - #{e.message}"))
  end

  response
end
|
79
|
+
|
80
|
+
# Reports connectivity to /api/queues and, for every queue it lists, one
# health event plus one metric event per statistic.
#
# Events emitted:
# * "rabbitmq.queue" - critical when the endpoint answers with a non-200
#   status, ok otherwise.
# * "rabbitmq.queue.<vhost>.<name>" - critical when the queue has ready
#   messages but no consumers, or holds more ready messages than
#   :max_queue_size (queues whose name matches :ignore_max_size_queues are
#   exempt from the size check); ok otherwise.
# * "rabbitmq.queue.<vhost>.<name>.<stat>" - one metric per statistic; for
#   "*_details" entries the metric is their 'rate' value.
#
# Returns nothing useful. Does nothing when safe_get returned nil (it has
# already reported the connection failure).
def check_queues
  response = safe_get(queues_url, event_host)
  return if response.nil?

  if response.status != 200
    # Error bodies are not guaranteed to be JSON, so the body is parsed only
    # on the success path below.
    report(:host => event_host,
           :service => "rabbitmq.queue",
           :state => "critical",
           :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}")
    return
  end

  report(:host => event_host,
         :service => "rabbitmq.queue",
         :state => "ok",
         :description => "HTTP connection ok")

  ignore_pattern = options[:ignore_max_size_queues]
  size_check_exempt = ignore_pattern ? Regexp.new(ignore_pattern) : nil

  JSON.parse(response.body).each do |queue|
    svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
    ready = queue['messages_ready']
    errs = []

    if ready && ready > 0 && queue['consumers'] == 0
      errs << "Queue has jobs but no consumers"
    end

    size_checked = size_check_exempt.nil? || queue['name'] !~ size_check_exempt
    if size_checked && ready && ready > options[:max_queue_size]
      errs << "Queue has #{ready} jobs"
    end

    report(:host => event_host,
           :service => svc,
           :state => errs.empty? ? "ok" : "critical",
           :description => errs.empty? ? "Queue is looking good" : errs.join("; "))

    stats = (queue['message_stats'] || {}).merge(
      'messages' => queue['messages'],
      'messages_details' => queue['messages_details'],
      'messages_ready' => ready,
      'messages_ready_details' => queue['messages_ready_details'],
      'messages_unacknowledged' => queue['messages_unacknowledged'],
      'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
      'consumers' => queue['consumers'],
      'memory' => queue['memory']
    )

    stats.each_pair do |k, v|
      # "*_details" entries are hashes; their 'rate' is the metric.
      metric = k =~ /details$/ && !v.nil? ? v['rate'] : v

      # TODO: Set state via thresholds which can be configured

      report(:host => event_host,
             :service => "#{svc}.#{k}",
             :metric => metric,
             :description => "RabbitMQ monitor")
    end
  end
end
|
163
|
+
|
164
|
+
# Reports connectivity to /api/overview and one metric event per entry of
# its message_stats, queue_totals and object_totals sections.
#
# A non-200 answer is reported as critical on the "rabbitmq" service; a 200
# answer is reported as ok on "rabbitmq monitoring", followed by metrics on
# "rabbitmq.<section>.<key>" (for "*_details" keys the metric is 'rate').
#
# NOTE(review): the failure and success events use different service names
# ("rabbitmq" vs "rabbitmq monitoring"), so the ok event does not replace a
# previous critical one. Kept as-is; confirm whether that is intended.
#
# Does nothing when safe_get returned nil (it has already reported the
# connection failure).
def check_overview
  response = safe_get(URI(overview_url), event_host)
  return if response.nil?

  if response.status != 200
    # Error bodies are not guaranteed to be JSON, so the body is parsed only
    # on the success path below.
    report(:host => event_host,
           :service => "rabbitmq",
           :state => "critical",
           :description => "HTTP connection error: #{response.status} - #{response.body}")
    return
  end

  report(:host => event_host,
         :service => "rabbitmq monitoring",
         :state => "ok",
         :description => "HTTP connection ok")

  json = JSON.parse(response.body)

  %w( message_stats queue_totals object_totals ).each do |stat|
    section = json[stat]
    # Brand new servers can have blank (or entirely missing) stats sections.
    next if section.nil? || section.empty?

    section.each_pair do |k, v|
      # "*_details" entries are hashes; their 'rate' is the metric.
      metric = k =~ /details$/ && !v.nil? ? v['rate'] : v

      # TODO: Set state via thresholds which can be configured

      report(:host => event_host,
             :service => "rabbitmq.#{stat}.#{k}",
             :metric => metric,
             :description => "RabbitMQ monitor")
    end
  end
end
|
209
|
+
|
210
|
+
# Reports the state of every RabbitMQ node named by the :node option.
#
# For each node exactly one event is sent on "rabbitmq.node.<name>":
# critical when the API answers 404 (node unknown to the cluster), answers
# with any other non-200 status, or the node's JSON has 'mem_alarm' or
# 'disk_free_alarm' set; ok otherwise.
#
# A problem with one node only ends the check of that node; the remaining
# nodes are still checked.
def check_node
  opts[:node].each do |n|
    response = safe_get(URI(node_url(n)), event_host)

    # safe_get has already reported the connection failure for this request.
    next if response.nil?

    problem =
      if response.status == 404
        "Node was not found in the cluster"
      elsif response.status != 200
        "HTTP error: #{response.status} - #{response.body}"
      else
        json = JSON.parse(response.body)
        if json['mem_alarm']
          "Memory alarm has triggered; job submission throttled"
        elsif json['disk_free_alarm']
          "Disk free alarm has triggered; job submission throttled"
        end
      end

    report(:host => event_host,
           :service => "rabbitmq.node.#{n}",
           :state => problem ? "critical" : "ok",
           :description => problem || "Node looks OK to me")
  end
end
|
261
|
+
|
262
|
+
# One monitoring pass: cluster overview first, then the individual nodes
# (only when the :node option was given), then the queues.
def tick
  check_overview
  if opts[:node]
    check_node
  end
  check_queues
end
|
267
|
+
end
|
268
|
+
Riemann::Tools::Rabbitmq.run
|
269
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2011 Kyle Kingsbury
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rubygems'
require 'rubygems/package_task'
require 'rdoc/task'
require 'find'

# Don't include resource forks in tarballs on Mac OS X.
ENV['COPY_EXTENDED_ATTRIBUTES_DISABLE'] = 'true'
ENV['COPYFILE_DISABLE'] = 'true'

# Gemspec for the riemann-riak gem; handed to Gem::PackageTask below so that
# the packaging rake tasks are defined.
gemspec = Gem::Specification.new do |s|
  s.rubyforge_project = 'riemann-riak'

  s.name = 'riemann-riak'
  s.version = '0.1.2'
  s.author = 'Kyle Kingsbury'
  s.email = 'aphyr@aphyr.com'
  s.homepage = 'https://github.com/riemann/riemann-tools'
  s.platform = Gem::Platform::RUBY
  s.summary = 'Submits riak stats to riemann.'
  s.license = 'MIT'

  s.add_dependency 'riemann-tools', '>= 0.2.13'
  s.add_dependency 'yajl-ruby', '>= 1.1.0'

  s.files = FileList['bin/*', 'LICENSE', 'README.md'].to_a
  # Dir.entries also yields "." and "..", which are directories rather than
  # executables; keep only the regular files found in bin/.
  s.executables |= Dir.entries('bin/').select { |entry| File.file?(File.join('bin', entry)) }
  s.has_rdoc = false

  s.required_ruby_version = '>= 1.8.7'
end

Gem::PackageTask.new gemspec do |p|
end
|
@@ -0,0 +1,331 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
Process.setproctitle($0)
|
3
|
+
|
4
|
+
# Forwards information on a Riak node to Riemann.
|
5
|
+
|
6
|
+
require 'riemann/tools'
|
7
|
+
|
8
|
+
class Riemann::Tools::Riak
|
9
|
+
include Riemann::Tools
|
10
|
+
require 'net/http'
|
11
|
+
require 'net/https'
|
12
|
+
require 'yajl/json_gem'
|
13
|
+
|
14
|
+
opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
|
15
|
+
opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
|
16
|
+
opt :stats_port, "Riak HTTP port for stats", :default => 8098
|
17
|
+
opt :stats_path, "Riak HTTP stats path", :default => '/stats'
|
18
|
+
opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
|
19
|
+
opt :cookie, "Riak cookie to use", :default => "riak"
|
20
|
+
|
21
|
+
opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
|
22
|
+
opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
|
23
|
+
opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
|
24
|
+
opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
|
25
|
+
opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
|
26
|
+
opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
|
27
|
+
|
28
|
+
# Probes the environment once at start-up:
# * detects which command line helpers are installed (see detect_features);
# * tries a single GET of the stats path over HTTP(S) and remembers in
#   @httpstatus whether the request completed without raising;
# * exports the configured cookie through ERL_AFLAGS.
def initialize
  detect_features

  @httpstatus =
    begin
      uri = URI.parse(opts[:riak_host])
      # A bare host name parses without a host component; use it verbatim.
      uri.host = opts[:riak_host] if uri.host.nil?

      http = Net::HTTP.new(uri.host, opts[:stats_port])
      http.use_ssl = uri.scheme == 'https'
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
      http.start { |h| h.get opts[:stats_path] }
      true
    rescue StandardError
      false
    end

  # we're going to override the emulator setting to allow users to
  # dynamically input the cookie
  # this is done only once - hopefully it doesn't get overridden.
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
end
|
55
|
+
|
56
|
+
# Records whether escript (@escript) and riak-admin (@riakadmin) can be
# located with `which`; blank output from `which` means "not installed".
def detect_features
  blank = /^\s*$/
  @escript = (`which escript` !~ blank)
  @riakadmin = (`which riak-admin` !~ blank)
end
|
69
|
+
|
70
|
+
# Reports ring readiness on the 'riak ring' service. The status text comes
# from the bundled riemann-riak-ring script when escript is available, else
# from `riak-admin ringready`; when neither tool exists nothing is reported.
# Output starting with "TRUE" is reported as ok, anything else as warning.
def check_ring
  output =
    if @escript
      `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      `riak-admin ringready`
    end

  return if output.nil?

  report(
    :host => opts[:riak_host],
    :service => 'riak ring',
    :state => output =~ /^TRUE/ ? 'ok' : 'warning',
    :description => output
  )
end
|
97
|
+
|
98
|
+
# Runs the bundled riemann-riak-keys script and reports its output on the
# 'riak keys' service: a purely numeric output becomes an ok event with that
# number as metric, anything else is reported with state 'unknown'.
def check_keys
  script = "#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys"
  keys = `#{script} #{opts[:node_name]}`.chomp
  event = { :host => opts[:riak_host], :service => 'riak keys' }

  if keys =~ /^\d+$/
    report(event.merge(:state => 'ok', :metric => keys.to_i, :description => keys))
  else
    report(event.merge(:state => 'unknown', :description => keys))
  end
end
|
117
|
+
|
118
|
+
# Reports pending partition handoffs on the 'riak transfers' service, based
# on the output of `riak-admin transfers`. Nothing is reported when
# riak-admin is not installed.
#
# When the output says that this node (opts[:node_name]) is waiting to hand
# off N partitions, a critical event with metric N is sent; otherwise an ok
# event with metric 0.
def check_transfers
  return unless @riakadmin

  output = `riak-admin transfers`

  # The node name is matched literally: it typically contains dots, which
  # would otherwise act as regexp wildcards.
  node = Regexp.escape(opts[:node_name].to_s)
  pending = output.match(/'#{node}' waiting to handoff (\d+) partitions/)

  if pending
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'critical',
      :metric => pending[1].to_i,
      :description => "waiting to handoff #{pending[1]} partitions"
    )
  else
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'ok',
      :metric => 0,
      :description => "No pending transfers"
    )
  end
end
|
145
|
+
|
146
|
+
# Reports the size of the Riak data directory on the 'riak disk' service.
# The first field printed by `du -Ls` is divided by 1024**2 and reported as
# a GB figure; the event state is always ok.
def check_disk
  du_total = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i
  gb = du_total / (1024.0**2)

  report(
    :host => opts[:riak_host],
    :service => 'riak disk',
    :state => 'ok',
    :metric => gb,
    :description => "#{gb} GB in #{opts[:data_dir]}"
  )
end
|
156
|
+
|
157
|
+
# Name of the riak stat for the given fsm type, property and percentile.
# The 50th percentile is spelled 'median' in the stat name.
def fsm_stat(type, property, percentile)
  suffix = percentile == 50 ? 'median' : percentile
  format('node_%s_fsm_%s_%s', type, property, suffix)
end
|
161
|
+
|
162
|
+
# Alert state for an fsm timing: 'ok' up to the configured
# "<type>_<percentile>_warning" threshold, 'warning' up to twice that
# threshold, 'critical' for anything else.
def fsm_state(type, percentile, val)
  limit = opts[:"#{type}_#{percentile}_warning"]

  return 'ok' if (0..limit).cover?(val)
  return 'warning' if (limit..limit * 2).cover?(val)

  'critical'
end
|
174
|
+
|
175
|
+
# Get current stats via HTTP
#
# Fetches opts[:stats_path] from the Riak host (HTTPS when :riak_host is an
# https:// URL, without certificate verification) and returns the parsed
# JSON body.
#
# Raises whatever the HTTP request raised, or a RuntimeError when the answer
# is not a 200. In both cases a critical event has been reported on the
# 'riak' service before the exception is raised.
def stats_http
  begin
    uri = URI.parse(opts[:riak_host])
    # A bare host name parses without a host component; use it verbatim.
    uri.host = opts[:riak_host] if uri.host.nil?

    http = Net::HTTP.new(uri.host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

    res = http.start do |h|
      h.get opts[:stats_path]
    end
  rescue => e
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
    )
    raise
  end

  return JSON.parse(res.body) if res.code.to_i == 200

  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'critical',
    :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
  )
  raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
end
|
212
|
+
|
213
|
+
# Get current stats via riak-admin
#
# Runs `riak-admin status` and returns its "name : value" lines as a Hash of
# String => String. Lines that are not of that form (banner, separators)
# are skipped, and only the first " : " of a line separates name from value.
#
# Raises RuntimeError when riak-admin exits unsuccessfully.
def stats_riak_admin
  str = `riak-admin status`
  raise "riak-admin failed" unless $?.success?

  pairs = str.split(/\n/).map { |line| line.split(/ : /, 2) }
  Hash[pairs.select { |pair| pair.size == 2 }]
end
|
219
|
+
|
220
|
+
# Current stats as a hash, fetched over HTTP when the start-up probe
# succeeded, else through riak-admin when it is installed. With neither
# available a critical 'riak' event is reported and a RuntimeError raised.
def stats
  return stats_http if @httpstatus
  return stats_riak_admin if @riakadmin

  message = "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'critical',
    :description => message
  )
  raise message
end
|
236
|
+
|
237
|
+
# Names of the riak counters that check_stats reports as per-second rates.
def core_services
  %w[
    vnode_gets
    vnode_puts
    node_gets
    node_puts
    node_gets_set
    node_puts_set
    read_repairs
  ]
end
|
246
|
+
|
247
|
+
# The fsm type => property pairs reported by check_stats, one single-entry
# hash per pair.
def fsm_types
  [%w[get time], %w[put time], %w[get set_objsize]].map do |type, property|
    { type => property }
  end
end
|
251
|
+
|
252
|
+
# Percentiles for which fsm stats are reported (50 is the median).
def fsm_percentiles
  median = 50
  [median, 95, 99]
end
|
255
|
+
|
256
|
+
# Reports current stats to Riemann
#
# When the stats cannot be fetched, a critical event carrying the error
# message is sent for 'riak' and for every core and fsm service, so that
# none of them is left at a stale state.
#
# Otherwise sends:
# * 'riak' - ok;
# * 'riak <counter>' for each of core_services - the counter divided by 60,
#   described as a per-second rate;
# * 'riak <type> <prop> <percentile>' for each fsm stat - 'time' values are
#   converted from microseconds to milliseconds and graded by fsm_state;
#   other properties are reported as-is with state ok.
def check_stats
  begin
    current = stats
  rescue => e
    failure = {:state => 'critical',
               :description => e.message,
               :host => opts[:riak_host]}
    # Report errors
    report(failure.merge(:service => 'riak'))
    core_services.each do |s|
      report(failure.merge(:service => "riak #{s}"))
    end
    fsm_types.each do |typespec|
      typespec.each do |type, prop|
        fsm_percentiles.each do |percentile|
          report(failure.merge(:service => "riak #{type} #{prop} #{percentile}"))
        end
      end
    end
    return
  end

  # Riak itself
  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'ok'
  )

  # Gets/puts/rr
  core_services.each do |s|
    rate = current[s].to_i / 60.0
    report(
      :host => opts[:riak_host],
      :service => "riak #{s}",
      :state => 'ok',
      :metric => rate,
      :description => "#{rate}/sec"
    )
  end

  # FSMs
  fsm_types.each do |typespec|
    typespec.each do |type, prop|
      fsm_percentiles.each do |percentile|
        # to_i maps missing (nil) and non-numeric ('undefined') values to 0.
        val = current[fsm_stat(type, prop, percentile)].to_i

        if prop == 'time'
          val /= 1000.0 # Convert us to ms
          state = fsm_state(type, percentile, val)
          description = "#{val} ms"
        else
          # Not a duration, so no "ms" unit is attached.
          state = "ok"
          description = val.to_s
        end

        report(
          :host => opts[:riak_host],
          :service => "riak #{type} #{prop} #{percentile}",
          :state => state,
          :metric => val,
          :description => description
        )
      end
    end
  end
end
|
320
|
+
|
321
|
+
# One monitoring pass over the node: stats, ring, disk, transfers.
def tick
  # This can utterly destroy a cluster, so we disable
  # check_keys
  [:check_stats, :check_ring, :check_disk, :check_transfers].each do |check|
    send(check)
  end
end
|
329
|
+
end
|
330
|
+
|
331
|
+
Riemann::Tools::Riak.run
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env escript
%%! -name riakstatuscheck@127.0.0.1 -hidden

%% Prints the sum of the per-vnode key counts returned by
%% riak_kv_bitcask_backend:key_counts/0 on the given node.
%% Usage: <script> [Node]   (Node defaults to riak@127.0.0.1)
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
  io:format("~w\n", [
    lists:foldl(
      fun({_VNode, Count}, Sum) -> Sum + Count end,
      0,
      rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, [])
    )
  ]).
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env escript
%%! -name riakstatuscheck -setcookie riak -hidden

%% Prints the sum of the per-vnode key counts returned by
%% riak_kv_bitcask_backend:key_counts/0 on the given node.
%% Usage: <script> [Node]   (Node defaults to riak@127.0.0.1)
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
  io:format("~w\n", [
    lists:foldl(
      fun({_VNode, Count}, Sum) -> Sum + Count end,
      0,
      rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, [])
    )
  ]).
|