riemann-tools.haf 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.markdown +18 -0
- data/bin/riemann-aws-status +64 -0
- data/bin/riemann-bench +70 -0
- data/bin/riemann-cloudant +58 -0
- data/bin/riemann-diskstats +86 -0
- data/bin/riemann-elasticsearch +86 -0
- data/bin/riemann-elb-metrics +154 -0
- data/bin/riemann-fd +66 -0
- data/bin/riemann-freeswitch +31 -0
- data/bin/riemann-haproxy +52 -0
- data/bin/riemann-health +270 -0
- data/bin/riemann-kvminstance +22 -0
- data/bin/riemann-memcached +37 -0
- data/bin/riemann-munin +36 -0
- data/bin/riemann-net +101 -0
- data/bin/riemann-nginx-status +84 -0
- data/bin/riemann-proc +48 -0
- data/bin/riemann-rabbitmq +99 -0
- data/bin/riemann-redis +71 -0
- data/bin/riemann-redis-slowlog +44 -0
- data/bin/riemann-resmon +103 -0
- data/bin/riemann-riak +237 -0
- data/bin/riemann-riak-keys +12 -0
- data/bin/riemann-riak-ring +8 -0
- data/lib/riemann/tools.rb +114 -0
- metadata +224 -0
data/bin/riemann-resmon
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
4
|
+
|
5
|
+
class Riemann::Tools::Resmon
|
6
|
+
include Riemann::Tools
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'faraday'
|
9
|
+
|
10
|
+
opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
|
11
|
+
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
|
12
|
+
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
|
13
|
+
opt :fqdn, 'Use FQDN for event host'
|
14
|
+
|
15
|
+
|
16
|
+
# Loads the list of Resmon endpoints to poll.
#
# Reads the file named by the resmon_hostfile option (one URI per line).
# Surrounding whitespace - including the "\r" left behind by CRLF files - is
# stripped and blank lines are dropped, so an empty or trailing line can no
# longer turn into an empty URI whose nil host breaks every tick.
def initialize
  @hosts = File.read(options[:resmon_hostfile]).split("\n").map(&:strip).reject(&:empty?)
  super
end
|
20
|
+
|
21
|
+
|
22
|
+
# Work out the hostname to submit with the event.
#
# With the fqdn option set the host is returned untouched. Otherwise it is
# shortened to its first dot-separated label ("web1.example.com" -> "web1").
# IPv4 literals are never shortened: their first "label" is only the first
# octet, which would report "10.0.0.5" as host "10".
#
# host - host part of the endpoint URI; nil when the URI had none.
#
# Returns the event host String, or nil when host is nil.
def get_event_host(host)
  return host if host.nil? || options[:fqdn]
  return host if host =~ /\A\d{1,3}(\.\d{1,3}){3}\z/

  host.split('.').first
end
|
29
|
+
|
30
|
+
# Performs a GET against uri without letting connection problems escape.
#
# The read and open timeouts come from the read_timeout / open_timeout
# options. Any StandardError raised while connecting or fetching is turned
# into a critical "resmon" event for event_host.
#
# Returns the response, or nil when the request failed.
def safe_get(uri, event_host)
  begin
    Faraday.new(uri).get do |request|
      request.options[:timeout] = options[:read_timeout]
      request.options[:open_timeout] = options[:open_timeout]
    end
  rescue => error
    report(:host => event_host,
           :service => "resmon",
           :state => "critical",
           :description => "HTTP connection error: #{error.class} - #{error.message}")
    nil
  end
end
|
49
|
+
|
50
|
+
# Polls every configured Resmon endpoint once and forwards what it finds.
#
# For each URI in @hosts a "resmon" event records whether the HTTP fetch
# succeeded; on success each <metric> of each <ResmonResult> element in the
# returned XML is reported as its own event.
def tick
  @hosts.each do |host|
    uri = URI(host)
    event_host = get_event_host(uri.host)

    response = safe_get(uri, event_host)
    next if response.nil? # safe_get has already reported the failure

    # Handle non-200 responses
    if response.status != 200
      report(:host => event_host,
             :service => "resmon",
             :state => "critical",
             :description => "HTTP connection error: #{response.status} - #{response.body}")
      next
    end

    report(:host => event_host,
           :service => "resmon",
           :state => "ok",
           :description => "Resmon connection ok")

    Nokogiri::XML(response.body).xpath('//ResmonResults/ResmonResult').each do |result|
      report_resmon_result(event_host, result)
    end
  end
end

# Reports one event per <metric> child of a single <ResmonResult> node.
#
# event_host - host name to put on every event.
# result     - the <ResmonResult> XML node.
def report_resmon_result(event_host, result)
  # A result without <last_update> used to raise NoMethodError and abort the
  # tick for every remaining host; now its events simply carry no :time.
  last_update = result.xpath('last_update').first
  timestamp = last_update && last_update.text.to_i

  result.xpath('metric').each do |metric|
    # Node#[] yields nil (an empty string once interpolated) for a missing
    # attribute instead of raising.
    event = {
      host: event_host,
      service: "#{result['module']}`#{result['service']}`#{metric['name']}"
    }
    event[:time] = timestamp if timestamp

    # The other branches compare whole single-character codes, so the integer
    # codes are anchored too rather than matching any type containing i/l.
    case metric['type']
    when /\A[iIlL]\z/
      event[:metric] = metric.text.to_i
    when 'n'
      event[:metric] = metric.text.to_f
    when 's'
      event[:description] = metric.text
    when '0'
      raise 'dunno what 0 is yet'
    end

    report(event)
  end
end
|
101
|
+
end
|
102
|
+
|
103
|
+
Riemann::Tools::Resmon.run
|
data/bin/riemann-riak
ADDED
@@ -0,0 +1,237 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Forwards information on a Riak node to Riemann.
|
4
|
+
|
5
|
+
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
6
|
+
|
7
|
+
require 'net/http'
|
8
|
+
require 'net/https'
|
9
|
+
require 'yajl/json_gem'
|
10
|
+
|
11
|
+
class Riemann::Tools::Riak
|
12
|
+
include Riemann::Tools
|
13
|
+
|
14
|
+
opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
|
15
|
+
opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
|
16
|
+
opt :stats_port, "Riak HTTP port for stats", :default => 8098
|
17
|
+
opt :stats_path, "Riak HTTP stats path", :default => '/stats'
|
18
|
+
opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
|
19
|
+
opt :cookie, "Riak cookie to use", :default => "riak"
|
20
|
+
|
21
|
+
opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
|
22
|
+
opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
|
23
|
+
opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
|
24
|
+
opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
|
25
|
+
opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
|
26
|
+
opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000
|
27
|
+
|
28
|
+
# Probes the environment once to decide how the checks can gather data.
#
# Sets three flags read by the check_* methods:
#   @escript    - `which escript` printed a path
#   @riakadmin  - `which riak-admin` printed a path
#   @httpstatus - a GET of the stats path on the stats port did not raise
# and exports the Erlang cookie through ERL_AFLAGS.
def initialize
  # `which` prints nothing (or only whitespace) when the tool is missing.
  @escript = `which escript` !~ /^\s*$/
  @riakadmin = `which riak-admin` !~ /^\s*$/
  @httpstatus = stats_endpoint_reachable?

  # we're going to override the emulator setting to allow users to
  # dynamically input the cookie
  # this is done only once - hopefully it doesn't get overridden.
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
end

# Returns true when a GET of opts[:stats_path] on opts[:stats_port] completes
# without raising (whatever the HTTP status), false otherwise.
def stats_endpoint_reachable?
  uri = URI.parse(opts[:riak_host])
  # A bare host name parses as a path, leaving the host empty.
  uri.host = opts[:riak_host] if uri.host.nil?
  http = Net::HTTP.new(uri.host, opts[:stats_port])
  http.use_ssl = uri.scheme == 'https'
  # NOTE(review): certificate verification is switched off for https hosts;
  # kept as-is for compatibility, but worth making configurable.
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
  http.start { |conn| conn.get opts[:stats_path] }
  true
rescue
  false
end
|
65
|
+
|
66
|
+
# Reports the "riak ring" event.
#
# The ring status text comes from the bundled riemann-riak-ring script when
# escript is available, otherwise from `riak-admin ringready`; with neither
# tool there is no text at all. Output starting a line with TRUE is 'ok',
# anything else is 'warning'.
def check_ring
  output =
    if @escript
      `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      `riak-admin ringready`
    end

  ring_state = (output =~ /^TRUE/) ? 'ok' : 'warning'

  report(
    :host => opts[:riak_host],
    :service => 'riak ring',
    :state => ring_state,
    :description => output
  )
end
|
89
|
+
|
90
|
+
# Reports the "riak keys" event from the bundled riemann-riak-keys script.
#
# Purely numeric output is reported as the metric with state 'ok'; any other
# output is passed along as the description with state 'unknown'.
def check_keys
  keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp

  event = {
    :host => opts[:riak_host],
    :service => 'riak keys',
    :description => keys
  }

  if keys =~ /^\d+$/
    event[:state] = 'ok'
    event[:metric] = keys.to_i
  else
    event[:state] = 'unknown'
  end

  report(event)
end
|
109
|
+
|
110
|
+
# Reports the size of the Riak data directory as the "riak disk" event.
#
# The metric is du's total for opts[:data_dir], converted from KiB to GB
# (KiB / 1024**2). When du prints nothing the metric is 0.0.
def check_disk
  data_dir = opts[:data_dir]

  # -k pins du to 1 KiB units; without it the unit depends on the du
  # implementation (POSIX specifies 512-byte units), which skews the figure.
  # The argv form bypasses the shell, so paths with spaces stay intact.
  du_output = IO.popen(['du', '-sk', data_dir], &:read)
  gb = du_output.split(/\s+/).first.to_i / (1024.0**2)

  report(
    :host => opts[:riak_host],
    :service => 'riak disk',
    :state => 'ok',
    :metric => gb,
    :description => "#{gb} GB in #{data_dir}"
  )
end
|
120
|
+
|
121
|
+
# Name of the Riak stat holding the FSM time for the given type
# ('get' / 'put') and percentile. The 50th percentile is published under
# the "median" suffix; other percentiles use the number itself.
def fsm_stat(type, percentile)
  suffix = percentile == 50 ? 'median' : percentile
  "node_#{type}_fsm_time_#{suffix}"
end
|
125
|
+
|
126
|
+
# Alert state for an FSM timing.
#
# The warning threshold is read from the "<type>_<percentile>_warning"
# option: values from 0 up to the threshold are 'ok', values up to twice the
# threshold are 'warning', everything else is 'critical'.
def fsm_state(type, percentile, val)
  threshold = opts[:"#{type}_#{percentile}_warning"]

  return 'ok' if (0..threshold).include?(val)
  return 'warning' if (threshold..threshold * 2).include?(val)

  'critical'
end
|
138
|
+
|
139
|
+
# Fetches the node statistics and reports them.
#
# Stats come from the HTTP stats endpoint when the start-up probe succeeded
# (@httpstatus), otherwise from `riak-admin status` when that tool exists
# (@riakadmin). If neither source is usable, or the HTTP fetch fails, a
# single critical "riak" event is reported and nothing else.
#
# On success it reports an ok "riak" event, the five throughput counters
# divided by 60 (presumably one-minute totals turned into per-second rates -
# TODO confirm), and the get/put FSM times at the 50/95/99 percentiles
# converted from microseconds to milliseconds.
def check_stats
  if @httpstatus
    begin
      uri = URI.parse(opts[:riak_host])
      # A bare host name parses as a path, leaving the host empty.
      uri.host = opts[:riak_host] if uri.host.nil?
      http = Net::HTTP.new(uri.host, opts[:stats_port])
      http.use_ssl = uri.scheme == 'https'
      # NOTE(review): certificate verification is switched off for https hosts.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
      res = http.start { |conn| conn.get opts[:stats_path] }
    rescue => e
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
      )
      return
    end

    if res.code.to_i == 200
      stats = JSON.parse(res.body)
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
      )
      return
    end
  elsif @riakadmin
    # Keep only "key : value" lines. Splitting at the first " : " and
    # skipping everything else avoids the ArgumentError Hash[] raised on
    # blank lines and on values that themselves contain " : ".
    stats = {}
    `riak-admin status`.each_line do |line|
      key, value = line.chomp.split(' : ', 2)
      stats[key] = value if key && value
    end
  else
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "error fetching Riak stats"
    )
    return
  end

  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'ok'
  )

  # Gets/puts/rr
  [
    'vnode_gets',
    'vnode_puts',
    'node_gets',
    'node_puts',
    'read_repairs'
  ].each do |s|
    rate = stats[s].to_i / 60.0
    report(
      :host => opts[:riak_host],
      :service => "riak #{s}",
      :state => 'ok',
      :metric => rate,
      :description => "#{rate}/sec"
    )
  end

  # FSMs
  ['get', 'put'].each do |type|
    [50, 95, 99].each do |percentile|
      # to_i already maps nil and the string "undefined" to 0.
      val = stats[fsm_stat(type, percentile)].to_i / 1000.0 # Convert us to ms
      report(
        :host => opts[:riak_host],
        :service => "riak #{type} #{percentile}",
        :state => fsm_state(type, percentile, val),
        :metric => val,
        :description => "#{val} ms"
      )
    end
  end
end
|
227
|
+
|
228
|
+
# One polling cycle: stats, then ring, then disk.
def tick
  # check_keys is deliberately not in this list: it can utterly destroy a
  # cluster, so it stays disabled.
  [:check_stats, :check_ring, :check_disk].map { |check| send(check) }.last
end
|
235
|
+
end
|
236
|
+
|
237
|
+
Riemann::Tools::Riak.run
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env escript
%%! -name riakstatuscheck@127.0.0.1 -hidden

%% Prints the sum of the per-vnode key counts returned by
%% riak_kv_bitcask_backend:key_counts() on the given node
%% (default riak@127.0.0.1).
%%
%% A failed rpc:call returns {badrpc, Reason} instead of a list. That used to
%% be handed straight to lists:foldl and crash without printing anything;
%% now a readable "badrpc: ..." line is printed and the script exits with
%% status 1, so the caller gets a description of what went wrong.
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
    case rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, []) of
        {badrpc, Reason} ->
            io:format("badrpc: ~p\n", [Reason]),
            halt(1);
        Counts when is_list(Counts) ->
            Total = lists:foldl(
                fun({_VNode, Count}, Sum) -> Sum + Count end,
                0,
                Counts
            ),
            io:format("~w\n", [Total])
    end.
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Riemann
|
2
|
+
module Tools
|
3
|
+
require 'rubygems'
|
4
|
+
require 'trollop'
|
5
|
+
require 'riemann/client'
|
6
|
+
require 'timeout'
|
7
|
+
|
8
|
+
# Hook run when a class includes Riemann::Tools.
#
# Gives the including class three class-level methods (run, opt, options)
# and registers the command line options every tool shares.
def self.included(base)
  class << base
    # Builds an instance of the tool and starts its loop.
    def run
      new.run
    end

    # Records an option declaration; it is replayed onto a Trollop parser
    # each time options is called.
    def opt(*declaration)
      @opts ||= []
      @opts << declaration.unshift(:opt)
    end

    # Parses ARGV against every recorded declaration.
    def options
      parser = Trollop::Parser.new
      @opts.each { |declaration| parser.send(*declaration) }
      Trollop::with_standard_exception_handling(parser) { parser.parse ARGV }
    end
  end

  base.opt :host, "Riemann host", :default => '127.0.0.1'
  base.opt :port, "Riemann port", :default => 5555
  base.opt :event_host, "Event hostname", :type => String
  base.opt :interval, "Seconds between updates", :default => 5
  base.opt :tag, "Tag to add to events", :type => String, :multi => true
  base.opt :ttl, "TTL for events", :type => Integer
  base.opt :attribute, "Attribute to add to the event", :type => String, :multi => true
  base.opt :timeout, "Timeout (in seconds) when waiting for acknowledgements", :default => 30
  base.opt :tcp, "Use TCP transport instead of UDP (improves reliability, slight overhead.", :default => true
end
|
41
|
+
|
42
|
+
# Command line options as parsed by the class-level options method.
# The result is cached on the instance after the first call; opts is an
# alias for the same method.
def options
  return @options if @options

  @options = self.class.options
end
alias :opts :options
|
47
|
+
|
48
|
+
# Custom event attributes given as "key=value" through the attribute option.
#
# Each entry is split at its first "=", so a value may itself contain "="
# ("q=a=b" -> "q" => "a=b") and "key=" yields an empty string. Entries
# without any "=" are ignored; previously they (and "key=") produced a nil
# element that made Hash[] raise ArgumentError on every report.
#
# Returns a Hash of String keys to String values, cached after the first call.
def attributes
  @attributes ||= Array(options[:attribute]).each_with_object({}) do |pair, parsed|
    key, value = pair.split('=', 2)
    parsed[key] = value if key && value
  end
end
|
56
|
+
|
57
|
+
# Sends one event to Riemann after applying the command line settings.
#
# The tag option (when set) replaces the event's :tags; :ttl defaults to the
# ttl option or twice the interval; the event_host option (when set)
# overrides :host; custom attributes are merged in last. The send is bounded
# by the timeout option, and on timeout riemann.connect is called instead.
def report(event)
  tags = options[:tag]
  # Work around a bug with beefcake which can't take frozen strings.
  event[:tags] = tags.map(&:dup) if tags

  event[:ttl] ||= options[:ttl] || options[:interval] * 2

  forced_host = options[:event_host]
  event[:host] = forced_host.dup if forced_host

  payload = event.merge(attributes)

  begin
    Timeout::timeout(options[:timeout]) { riemann << payload }
  rescue Timeout::Error
    riemann.connect
  end
end
|
79
|
+
|
80
|
+
# Builds a Riemann client for the host and port options. With the tcp
# option set the client's tcp transport is returned, otherwise the client
# itself.
def new_riemann_client
  client = Riemann::Client.new(:host => options[:host], :port => options[:port])
  options[:tcp] ? client.tcp : client
end
|
91
|
+
|
92
|
+
# The Riemann client for this instance, created on first use through
# new_riemann_client and reused afterwards; r is an alias.
def riemann
  return @riemann if @riemann

  @riemann = new_riemann_client
end
alias :r :riemann
|
96
|
+
|
97
|
+
# Main loop: calls tick forever, once per interval.
#
# An error raised by tick is printed to stderr with its backtrace and does
# not stop the loop. The sleep is sized so that wake-ups stay aligned to
# multiples of the interval counted from the moment run started.
def run
  started_at = Time.now

  loop do
    begin
      tick
    rescue => error
      $stderr.puts "#{error.class} #{error}\n#{error.backtrace.join "\n"}"
    end

    # Sleep.
    period = options[:interval]
    elapsed = Time.now - started_at
    sleep(period - (elapsed % options[:interval]))
  end
end
|
110
|
+
|
111
|
+
# Called by run once per interval. Does nothing here; the Resmon and Riak
# tools define their own tick.
def tick; end
|
113
|
+
end
|
114
|
+
end
|