riemann-tools.haf 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
# Polls a list of Resmon HTTP endpoints and forwards what they expose to
# Riemann.
#
# For every URI listed in the --resmon-hostfile file, each tick:
#   * fetches the URI with Faraday (using the configured timeouts),
#   * reports a "resmon" service event describing whether the fetch worked,
#   * reports one event per <metric> element found under
#     //ResmonResults/ResmonResult in the returned XML.
class Riemann::Tools::Resmon
  include Riemann::Tools
  require 'uri'
  require 'nokogiri'
  require 'faraday'

  opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
  opt :fqdn, 'Use FQDN for event host'

  # Loads the list of Resmon URIs from the host file.
  #
  # Surrounding whitespace (including a trailing "\r") is stripped and blank
  # lines are skipped, so an empty line in the file can no longer produce a
  # URI without a host and abort the whole tick.
  def initialize
    @hosts = File.readlines(options[:resmon_hostfile]).map(&:strip).reject(&:empty?)
    super
  end

  # Works out the hostname to submit with the event.
  #
  # Returns +host+ unchanged when --fqdn is set, otherwise only the part
  # before the first dot.
  def get_event_host(host)
    options[:fqdn] ? host : host.split('.').first
  end

  # Performs a GET on +uri+ with the configured open/read timeouts.
  #
  # Returns the Faraday response, or nil when the request raised; in that
  # case a critical "resmon" event carrying the exception class and message
  # has already been reported for +event_host+.
  def safe_get(uri, event_host)
    Faraday.new(uri).get do |req|
      req.options[:timeout] = options[:read_timeout]
      req.options[:open_timeout] = options[:open_timeout]
    end
  rescue => e
    report(
      :host => event_host,
      :service => "resmon",
      :state => "critical",
      :description => "HTTP connection error: #{e.class} - #{e.message}"
    )
    nil
  end

  # Polls every configured Resmon endpoint once.
  def tick
    @hosts.each { |host| poll_host(host) }
  end

  private

  # Fetches one endpoint, reports its reachability and forwards its metrics.
  def poll_host(host)
    uri = URI(host)
    # A line without a scheme (e.g. "box:8084") parses with a nil host; fall
    # back to the raw line so the failure is reported instead of raising.
    event_host = get_event_host(uri.host || host)

    response = safe_get(uri, event_host)
    return if response.nil?

    if response.status != 200
      report(
        :host => event_host,
        :service => "resmon",
        :state => "critical",
        :description => "HTTP connection error: #{response.status} - #{response.body}"
      )
      return
    end

    report(
      :host => event_host,
      :service => "resmon",
      :state => "ok",
      :description => "Resmon connection ok"
    )
    report_results(Nokogiri::XML(response.body), event_host)
  rescue URI::InvalidURIError => e
    # One malformed line must not stop the remaining hosts from being polled.
    report(
      :host => host,
      :service => "resmon",
      :state => "critical",
      :description => "Invalid Resmon URI: #{e.message}"
    )
  end

  # Reports one event per <metric> of every <ResmonResult> in +doc+.
  def report_results(doc, event_host)
    doc.xpath('//ResmonResults/ResmonResult').each do |result|
      last_update = result.at_xpath('last_update')
      result.xpath('metric').each do |metric|
        event = {
          host: event_host,
          service: "#{result['module']}`#{result['service']}`#{metric['name']}"
        }
        # Leave :time unset when the result carries no <last_update>.
        event[:time] = last_update.text.to_i if last_update
        apply_metric_value(event, metric)
        report(event)
      end
    end
  end

  # Copies the metric's text into +event+ according to its "type" attribute:
  # i/I/l/L become an integer :metric, n a float :metric, s a :description.
  # Any other type (or a missing one) leaves the event without a value.
  #
  # Raises RuntimeError for type "0", which is not handled yet.
  def apply_metric_value(event, metric)
    case metric['type']
    when /\A[iIlL]\z/
      event[:metric] = metric.text.to_i
    when 'n'
      event[:metric] = metric.text.to_f
    when 's'
      event[:description] = metric.text
    when '0'
      raise 'dunno what 0 is yet'
    end
  end
end
102
+
103
+ Riemann::Tools::Resmon.run
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Forwards information on a Riak node to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ require 'net/http'
8
+ require 'net/https'
9
+ require 'yajl/json_gem'
10
+
11
# Forwards information on a Riak node to Riemann: overall stats
# availability, request rates, FSM latencies, ring readiness and the disk
# usage of the data directory.
#
# Stats are read over HTTP when the stats endpoint answered at start-up,
# otherwise from `riak-admin status` when that command is installed.
class Riemann::Tools::Riak
  include Riemann::Tools

  opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
  opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
  opt :cookie, "Riak cookie to use", :default => "riak"

  opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
  opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
  opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
  opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
  opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
  opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000

  # Stats that check_stats divides by 60 and reports as "<value>/sec".
  # NOTE(review): presumably Riak reports these as one-minute totals - confirm.
  RATE_STATS = %w[vnode_gets vnode_puts node_gets node_puts read_repairs].freeze

  # Probes once which data sources are usable (escript, riak-admin, the HTTP
  # stats endpoint) and exports the Erlang cookie for the helper escripts.
  def initialize
    @escript = command_available?('escript')
    @riakadmin = command_available?('riak-admin')

    # Any failure while fetching the stats page disables the HTTP source.
    @httpstatus = begin
      fetch_stats_response
      true
    rescue StandardError
      false
    end

    # we're going to override the emulator setting to allow users to
    # dynamically input the cookie
    # this is done only once - hopefully it doesn't get overridden.
    ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
  end

  # Reports 'riak ring' as ok when the ring check output starts a line with
  # TRUE, and as warning otherwise (including when no tool is available).
  def check_ring
    str = if @escript
            `#{helper_script('riemann-riak-ring')} #{opts[:node_name]}`.chomp
          elsif @riakadmin
            `riak-admin ringready`
          end

    report(
      :host => opts[:riak_host],
      :service => 'riak ring',
      :state => (str =~ /^TRUE/ ? 'ok' : 'warning'),
      :description => str || 'ring not checked: neither escript nor riak-admin is available'
    )
  end

  # Reports the key count printed by the riemann-riak-keys helper, or an
  # 'unknown' state carrying the helper's output when it is not a number.
  def check_keys
    keys = `#{helper_script('riemann-riak-keys')} #{opts[:node_name]}`.chomp
    event = { :host => opts[:riak_host], :service => 'riak keys', :description => keys }
    if keys =~ /^\d+$/
      event[:state] = 'ok'
      event[:metric] = keys.to_i
    else
      event[:state] = 'unknown'
    end
    report(event)
  end

  # Reports the size of the data directory in GB.
  def check_disk
    # -k forces 1024-byte units; plain `du -s` uses a platform-dependent
    # block size, which would skew the conversion below.
    kilobytes = `du -sk #{opts[:data_dir]}`.split(/\s+/).first.to_i
    gb = kilobytes / (1024.0**2)
    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'ok',
      :metric => gb,
      :description => "#{gb} GB in #{opts[:data_dir]}"
    )
  end

  # Returns the riak stat for the given fsm type and percentile.
  def fsm_stat(type, percentile)
    "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}"
  end

  # Returns the alerts state for the given fsm: 'ok' up to the configured
  # warning threshold, 'warning' up to twice that, 'critical' beyond (or for
  # a negative value).
  def fsm_state(type, percentile, val)
    limit = opts["#{type}_#{percentile}_warning".to_sym]
    case val
    when 0 .. limit
      'ok'
    when limit .. limit * 2
      'warning'
    else
      'critical'
    end
  end

  # Fetches the node stats and reports the 'riak' service state, the request
  # rates and the FSM latencies. When the stats cannot be obtained a critical
  # 'riak' event is reported and nothing else is sent.
  def check_stats
    stats = fetch_stats
    return if stats.nil?

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'ok'
    )
    report_rates(stats)
    report_fsm_times(stats)
  end

  def tick
    # This can utterly destroy a cluster, so we disable
    # check_keys
    check_stats
    check_ring
    check_disk
  end

  private

  # True when `which` prints a path for +name+.
  def command_available?(name)
    !`which #{name}`.strip.empty?
  end

  # Absolute path of a helper script living next to this file.
  def helper_script(name)
    File.join(File.expand_path(File.dirname(__FILE__)), name)
  end

  # Performs the GET on the stats endpoint and returns the Net::HTTP
  # response. Exceptions propagate to the caller.
  def fetch_stats_response
    uri = URI.parse(opts[:riak_host])
    # A bare host name or IP parses without a host component.
    uri.host = opts[:riak_host] if uri.host.nil?

    http = Net::HTTP.new(uri.host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for https; kept
    # as-is so existing deployments with self-signed certificates still work.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
    http.start { |conn| conn.get(opts[:stats_path]) }
  end

  # Reports a critical 'riak' service event with the given description.
  def report_critical(description)
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => description
    )
  end

  # Returns the stats hash from the best available source, or nil after
  # reporting a critical event.
  def fetch_stats
    if @httpstatus
      stats_from_http
    elsif @riakadmin
      stats_from_riak_admin
    else
      report_critical("error fetching Riak stats")
      nil
    end
  end

  # Reads and parses the JSON stats page; nil (after a critical event) on a
  # transport error or a non-200 answer.
  def stats_from_http
    res = begin
      fetch_stats_response
    rescue => e
      report_critical("error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}")
      return nil
    end

    return JSON.parse(res.body) if res.code.to_i == 200

    report_critical("stats returned HTTP #{res.code}:\n\n#{res.body}")
    nil
  end

  # Parses "name : value" lines printed by `riak-admin status`.
  def stats_from_riak_admin
    pairs = `riak-admin status`.split(/\n/).map { |line| line.split(' : ', 2) }
    # Lines without the " : " separator are dropped: Hash[] rejects elements
    # that are not [key, value] pairs.
    Hash[pairs.select { |pair| pair.size == 2 }]
  end

  # Gets/puts/rr
  def report_rates(stats)
    RATE_STATS.each do |s|
      rate = stats[s].to_i / 60.0
      report(
        :host => opts[:riak_host],
        :service => "riak #{s}",
        :state => 'ok',
        :metric => rate,
        :description => "#{rate}/sec"
      )
    end
  end

  # FSMs
  def report_fsm_times(stats)
    ['get', 'put'].each do |type|
      [50, 95, 99].each do |percentile|
        # to_i turns a missing stat (nil) or the string "undefined" into 0.
        val = stats[fsm_stat(type, percentile)].to_i / 1000.0 # Convert us to ms
        report(
          :host => opts[:riak_host],
          :service => "riak #{type} #{percentile}",
          :state => fsm_state(type, percentile, val),
          :metric => val,
          :description => "#{val} ms"
        )
      end
    end
  end
end
236
+
237
+ Riemann::Tools::Riak.run
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env escript
2
+ %%! -name riakstatuscheck@127.0.0.1 -hidden
3
+
4
%% Prints the sum of the per-vnode counts returned by
%% riak_kv_bitcask_backend:key_counts/0 on Node (default riak@127.0.0.1).
%%
%% When the call does not return a list -- e.g. {badrpc, Reason} because the
%% node is unreachable -- a readable error line is printed and the script
%% exits with status 1 instead of crashing inside lists:foldl/3.
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
    case rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, []) of
        Counts when is_list(Counts) ->
            Total = lists:foldl(
                fun({_VNode, Count}, Sum) -> Sum + Count end,
                0,
                Counts
            ),
            io:format("~w~n", [Total]);
        Other ->
            io:format("error: key_counts on ~s returned ~p~n", [Node, Other]),
            halt(1)
    end.
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env escript
2
+ %%! -name riakstatuscheck@127.0.0.1 -hidden
3
+
4
%% Calls riak_kv_console:ringready([]) on Node (default riak@127.0.0.1) and
%% pretty-prints whatever the call returns, followed by a newline.
main([]) ->
    main(["riak@127.0.0.1"]);
main([Node]) ->
    Target = list_to_atom(Node),
    Result = rpc:call(Target, riak_kv_console, ringready, [[]]),
    io:format("~p\n", [Result]).
@@ -0,0 +1,114 @@
1
module Riemann
  # Mixin shared by the riemann-* command line tools.
  #
  # Including it gives the host class:
  #   * class-level +opt+ (declare a Trollop option), +options+ (parse ARGV)
  #     and +run+ (instantiate and start the loop),
  #   * instance-level +options+/+opts+, +report+, +riemann+/+r+ and +run+,
  #     which calls the host's +tick+ once per --interval seconds.
  module Tools
    require 'rubygems'
    require 'trollop'
    require 'riemann/client'
    require 'timeout'

    # Installs the class-level DSL on +base+ and declares the options every
    # tool shares.
    def self.included(base)
      base.instance_eval do
        # Builds an instance and enters its run loop.
        def run
          new.run
        end

        # Records a Trollop option declaration; the arguments are replayed
        # on a Trollop::Parser as `opt(*args)` when options are parsed.
        def opt(*args)
          @opts ||= []
          @opts << [:opt, *args]
        end

        # Parses ARGV against every recorded declaration and returns the
        # resulting option hash.
        def options
          parser = Trollop::Parser.new
          @opts.each { |declaration| parser.send(*declaration) }
          Trollop::with_standard_exception_handling(parser) do
            parser.parse ARGV
          end
        end

        opt :host, "Riemann host", :default => '127.0.0.1'
        opt :port, "Riemann port", :default => 5555
        opt :event_host, "Event hostname", :type => String
        opt :interval, "Seconds between updates", :default => 5
        opt :tag, "Tag to add to events", :type => String, :multi => true
        opt :ttl, "TTL for events", :type => Integer
        opt :attribute, "Attribute to add to the event", :type => String, :multi => true
        opt :timeout, "Timeout (in seconds) when waiting for acknowledgements", :default => 30
        opt :tcp, "Use TCP transport instead of UDP (improves reliability, slight overhead.", :default => true
      end
    end

    # Returns parsed options (cached) from command line.
    def options
      @options ||= self.class.options
    end
    alias :opts :options

    # Returns the custom event attributes given as --attribute key=value,
    # as a (cached) Hash with String keys.
    #
    # Only the first "=" separates key from value, so values may themselves
    # contain "=". Entries without "=" or with an empty value are ignored;
    # they are filtered out before Hash[] sees them, because Hash[] rejects
    # elements that are not [key, value] pairs.
    def attributes
      @attributes ||= begin
        pairs = Array(options[:attribute]).map { |attr| attr.split('=', 2) }
        Hash[pairs.select { |key, value| key && value && !value.empty? }]
      end
    end

    # Sends +event+ to Riemann after applying the command line overrides:
    # --tag (as :tags), a default :ttl (--ttl, else twice --interval),
    # --event-host (replaces :host) and the --attribute pairs.
    #
    # Gives up after --timeout seconds; on timeout the event is dropped and
    # the client is asked to reconnect.
    def report(event)
      if options[:tag]
        # Work around a bug with beefcake which can't take frozen strings.
        event[:tags] = options[:tag].map(&:dup)
      end

      event[:ttl] ||= (options[:ttl] || (options[:interval] * 2))

      if options[:event_host]
        event[:host] = options[:event_host].dup
      end

      event = event.merge(attributes)

      begin
        Timeout::timeout(options[:timeout]) do
          riemann << event
        end
      rescue Timeout::Error
        riemann.connect
      end
    end

    # Builds a client for --host/--port; the TCP flavour unless --tcp is off.
    def new_riemann_client
      client = Riemann::Client.new(
        :host => options[:host],
        :port => options[:port]
      )
      options[:tcp] ? client.tcp : client
    end

    # Returns the (cached) Riemann client.
    def riemann
      @riemann ||= new_riemann_client
    end
    alias :r :riemann

    # Calls +tick+ forever, once per --interval seconds.
    #
    # Exceptions raised by +tick+ are printed to stderr with their backtrace
    # and do not stop the loop.
    def run
      t0 = Time.now
      loop do
        begin
          tick
        rescue => e
          $stderr.puts "#{e.class} #{e}\n#{e.backtrace.join "\n"}"
        end

        # Sleep until the next multiple of the interval counted from t0, so
        # the time spent inside tick does not shift later ticks.
        sleep(options[:interval] - ((Time.now - t0) % options[:interval]))
      end
    end

    # Hook executed on every iteration of +run+; does nothing by default.
    def tick
    end
  end
end