riemann-tools.haf 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
# Riemann collector for Resmon. On every tick it fetches the XML status
# document from each URI listed in --resmon-hostfile, reports a "resmon"
# connectivity event for that host, and forwards every <metric> element found
# under ResmonResults/ResmonResult as its own event.
class Riemann::Tools::Resmon
  include Riemann::Tools
  require 'nokogiri'
  require 'faraday'

  opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string
  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
  opt :fqdn, 'Use FQDN for event host'

  # Loads the list of Resmon URIs from the host file.
  #
  # Each line is stripped and blank lines are dropped, so trailing newlines,
  # CRLF line endings or empty separator lines do not turn into bogus hosts
  # that would abort every tick.
  #
  # Raises ArgumentError when --resmon-hostfile was not supplied.
  def initialize
    hostfile = options[:resmon_hostfile]
    raise ArgumentError, '--resmon-hostfile is required' unless hostfile

    @hosts = File.readlines(hostfile).map(&:strip).reject(&:empty?)
    super
  end

  # Work out the hostname to submit with the event: the full name when --fqdn
  # is set, otherwise only the part before the first dot.
  def get_event_host(host)
    options[:fqdn] ? host : host.split('.').first
  end

  # Handles HTTP connections and GET requests safely.
  #
  # Returns the Faraday response, or nil after reporting a critical "resmon"
  # event for +event_host+ when the request raised.
  def safe_get(uri, event_host)
    Faraday.new(uri).get do |req|
      req.options[:timeout] = options[:read_timeout]
      req.options[:open_timeout] = options[:open_timeout]
    end
  rescue => e
    report(:host => event_host,
           :service => "resmon",
           :state => "critical",
           :description => "HTTP connection error: #{e.class} - #{e.message}"
          )
    nil
  end

  # Polls every configured host once and reports what it returned.
  def tick
    @hosts.each do |host|
      uri = URI(host)
      # A line without a scheme parses to a URI with no host component; fall
      # back to the raw line so the failure is reported against something
      # meaningful instead of raising on nil.
      event_host = get_event_host(uri.host || host)

      response = safe_get(uri, event_host)
      next if response.nil?

      # Handle non-200 responses
      if response.status != 200
        report(:host => event_host,
               :service => "resmon",
               :state => "critical",
               :description => "HTTP connection error: #{response.status} - #{response.body}"
              )
        next
      end

      report(:host => event_host,
             :service => "resmon",
             :state => "ok",
             :description => "Resmon connection ok"
            )
      report_results(Nokogiri::XML(response.body), event_host)
    end
  end

  private

  # Reports one event per <metric> element of every ResmonResult in +doc+.
  def report_results(doc, event_host)
    doc.xpath('//ResmonResults/ResmonResult').each do |result|
      timestamp = result.xpath('last_update').first.text.to_i
      result.xpath('metric').each do |metric|
        report(metric_event(result, metric, event_host, timestamp))
      end
    end
  end

  # Builds the event hash for a single <metric> element. The service name is
  # "<module>`<service>`<metric name>"; the metric's type attribute decides
  # whether its text becomes a numeric :metric or a textual :description.
  def metric_event(result, metric, event_host, timestamp)
    event = {
      host: event_host,
      service: "#{result.attributes['module'].value}`#{result.attributes['service'].value}`#{metric.attributes['name'].value}",
      time: timestamp
    }

    case metric.attributes['type'].value
    when /[iIlL]/
      event[:metric] = metric.text.to_i
    when 'n'
      event[:metric] = metric.text.to_f
    when 's'
      event[:description] = metric.text
    when '0'
      raise 'dunno what 0 is yet'
    end

    event
  end
end
102
+
103
# Script entry point: instantiates the Resmon collector and enters its tick/sleep loop.
+ Riemann::Tools::Resmon.run
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Forwards information on a Riak node to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ require 'net/http'
8
+ require 'net/https'
9
+ require 'yajl/json_gem'
10
+
11
# Forwards information on a Riak node to Riemann: overall stats availability,
# request rates, FSM latencies, ring readiness and data directory size.
class Riemann::Tools::Riak
  include Riemann::Tools

  opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
  opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}"
  opt :cookie, "Riak cookie to use", :default => "riak"

  opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000
  opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000
  opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000
  opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000
  opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000
  opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000

  # Probes, once at startup, which data sources are usable: the `escript` and
  # `riak-admin` executables and the HTTP stats endpoint.
  def initialize
    @escript    = command_available?('escript')
    @riakadmin  = command_available?('riak-admin')
    @httpstatus = stats_endpoint_reachable?

    # we're going to override the emulator setting to allow users to
    # dynamically input the cookie
    # this is done only once - hopefully it doesn't get overridden.
    ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
  end

  # Reports 'riak ring' as ok when the ring check output starts with TRUE,
  # warning otherwise. Prefers the bundled escript helper over riak-admin.
  def check_ring
    str = if @escript
            run_command(helper_script('riemann-riak-ring'), opts[:node_name]).chomp
          elsif @riakadmin
            `riak-admin ringready`
          end

    report(
      :host => opts[:riak_host],
      :service => 'riak ring',
      :state => (str =~ /^TRUE/ ? 'ok' : 'warning'),
      :description => str
    )
  end

  # Reports the total key count printed by the bundled riemann-riak-keys
  # helper, or state 'unknown' when the helper printed anything but a number.
  def check_keys
    keys = run_command(helper_script('riemann-riak-keys'), opts[:node_name]).chomp
    if keys =~ /^\d+$/
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'ok',
        :metric => keys.to_i,
        :description => keys
      )
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak keys',
        :state => 'unknown',
        :description => keys
      )
    end
  end

  # Reports the size of the data directory in GB.
  def check_disk
    # -k pins the unit to 1 KiB blocks; plain `du -s` uses a platform and
    # environment dependent block size, which skews the GB conversion.
    kilobytes = run_command('du', '-sk', opts[:data_dir]).split(/\s+/).first.to_i
    gb = kilobytes / (1024.0**2)
    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'ok',
      :metric => gb,
      :description => "#{gb} GB in #{opts[:data_dir]}"
    )
  end

  # Returns the riak stat for the given fsm type and percentile.
  def fsm_stat(type, percentile)
    "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}"
  end

  # Returns the alerts state for the given fsm: 'ok' up to the configured
  # warning threshold, 'warning' up to twice that threshold, else 'critical'.
  def fsm_state(type, percentile, val)
    limit = opts["#{type}_#{percentile}_warning".to_sym]
    case val
    when 0 .. limit
      'ok'
    when limit .. limit * 2
      'warning'
    else
      'critical'
    end
  end

  # Fetches the node stats and reports overall availability, request rates and
  # FSM latencies. Returns early once a failure has been reported.
  def check_stats
    stats = fetch_stats
    return if stats.nil?

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'ok'
    )

    # Gets/puts/rr
    %w[vnode_gets vnode_puts node_gets node_puts read_repairs].each do |s|
      rate = stats[s].to_i / 60.0
      report(
        :host => opts[:riak_host],
        :service => "riak #{s}",
        :state => 'ok',
        :metric => rate,
        :description => "#{rate}/sec"
      )
    end

    # FSMs
    %w[get put].each do |type|
      [50, 95, 99].each do |percentile|
        # to_i maps both a missing stat (nil) and the string "undefined" to 0.
        val = stats[fsm_stat(type, percentile)].to_i / 1000.0 # Convert us to ms
        report(
          :host => opts[:riak_host],
          :service => "riak #{type} #{percentile}",
          :state => fsm_state(type, percentile, val),
          :metric => val,
          :description => "#{val} ms"
        )
      end
    end
  end

  def tick
    # This can utterly destroy a cluster, so we disable
    # check_keys
    check_stats
    check_ring
    check_disk
  end

  private

  # True when `which` prints a path for +name+.
  def command_available?(name)
    !`which #{name}`.strip.empty?
  end

  # Absolute path of a helper script shipped next to this file.
  def helper_script(name)
    File.join(File.expand_path(File.dirname(__FILE__)), name)
  end

  # Runs a command given as an argument vector (no shell involved, so spaces or
  # metacharacters in option values are passed through verbatim) and returns
  # its standard output.
  def run_command(*argv)
    IO.popen(argv, &:read)
  end

  # Performs a GET of the stats path against the configured host and port and
  # returns the Net::HTTP response. Raises whatever Net::HTTP raises.
  def fetch_stats_response
    uri  = URI.parse(opts[:riak_host])
    # A bare host name or IP parses without a host component; use it as-is.
    host = uri.host || opts[:riak_host]
    http = Net::HTTP.new(host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for https hosts.
    # Kept for backward compatibility; consider making this configurable.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
    http.start { |conn| conn.get(opts[:stats_path]) }
  end

  # True when the stats endpoint could be requested without an exception.
  def stats_endpoint_reachable?
    fetch_stats_response
    true
  rescue StandardError
    false
  end

  # Returns the stats hash from HTTP when it was reachable at startup, else
  # from `riak-admin status`. Reports a critical 'riak' event and returns nil
  # when no source is usable or the fetch failed.
  def fetch_stats
    if @httpstatus
      fetch_http_stats
    elsif @riakadmin
      parse_admin_status(`riak-admin status`)
    else
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "error fetching Riak stats"
      )
      nil
    end
  end

  # Fetches and parses the JSON stats document; reports critical and returns
  # nil on a connection error or a non-200 response.
  def fetch_http_stats
    begin
      res = fetch_stats_response
    rescue => e
      report(
        :host => opts[:riak_host],
        :service => 'riak',
        :state => 'critical',
        :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
      )
      return nil
    end

    return JSON.parse(res.body) if res.code.to_i == 200

    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
    )
    nil
  end

  # Turns "name : value" lines into a hash. Lines without the separator are
  # skipped and only the first separator splits, so Hash[] always receives
  # well-formed pairs.
  def parse_admin_status(output)
    pairs = output.split(/\n/).map { |line| line.split(/ : /, 2) }
    Hash[pairs.select { |pair| pair.size == 2 }]
  end
end
+
237
# Script entry point: instantiates the Riak collector and enters its tick/sleep loop.
+ Riemann::Tools::Riak.run
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env escript
2
+ %%! -name riakstatuscheck@127.0.0.1 -hidden
3
+
4
%% Prints the total number of keys held by the bitcask backend of Node
%% (default riak@127.0.0.1) as a single integer line.
%%
%% rpc:call/4 returns {badrpc, Reason} when the node cannot be reached or the
%% call fails; that case is printed as a readable line and the script exits
%% with status 1 instead of crashing inside lists:foldl/3.
main([]) ->
    main(["riak@127.0.0.1"]);
main([Node]) ->
    case rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, []) of
        {badrpc, Reason} ->
            io:format("badrpc: ~p\n", [Reason]),
            halt(1);
        Counts ->
            Total = lists:foldl(
                      fun({_VNode, Count}, Sum) -> Sum + Count end,
                      0,
                      Counts
                     ),
            io:format("~w\n", [Total])
    end.
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env escript
2
+ %%! -name riakstatuscheck@127.0.0.1 -hidden
3
+
4
%% Calls riak_kv_console:ringready([]) on Node (default riak@127.0.0.1) over
%% rpc and pretty-prints whatever the call returned.
main([]) ->
    main(["riak@127.0.0.1"]);
main([Node]) ->
    Target = list_to_atom(Node),
    Result = rpc:call(Target, riak_kv_console, ringready, [[]]),
    io:format("~p\n", [Result]).
@@ -0,0 +1,114 @@
1
module Riemann
  # Mixin shared by the riemann-* command line tools. Including it gives a
  # class the `opt` option DSL, a class-level `run` entry point, a lazily
  # created Riemann client, `report` for sending events, and a run loop that
  # calls `tick` once per interval.
  module Tools
    require 'rubygems'
    require 'trollop'
    require 'riemann/client'
    require 'timeout'

    # Installs the class-level helpers on the including class and declares the
    # options common to every tool.
    def self.included(base)
      base.instance_eval do
        # Builds an instance and starts its run loop.
        def run
          new.run
        end

        # Records an option declaration; it is replayed onto a Trollop parser
        # when the options are parsed.
        def opt(*a)
          a.unshift :opt
          @opts ||= []
          @opts << a
        end

        # Parses ARGV against every recorded option declaration.
        def options
          parser = Trollop::Parser.new
          @opts.each do |declaration|
            parser.send(*declaration)
          end
          Trollop::with_standard_exception_handling(parser) do
            parser.parse ARGV
          end
        end

        opt :host, "Riemann host", :default => '127.0.0.1'
        opt :port, "Riemann port", :default => 5555
        opt :event_host, "Event hostname", :type => String
        opt :interval, "Seconds between updates", :default => 5
        opt :tag, "Tag to add to events", :type => String, :multi => true
        opt :ttl, "TTL for events", :type => Integer
        opt :attribute, "Attribute to add to the event", :type => String, :multi => true
        opt :timeout, "Timeout (in seconds) when waiting for acknowledgements", :default => 30
        opt :tcp, "Use TCP transport instead of UDP (improves reliability, slight overhead.", :default => true
      end
    end

    # Returns parsed options (cached) from command line.
    def options
      @options ||= self.class.options
    end
    alias :opts :options

    # Returns the --attribute key=value pairs as a hash (cached).
    #
    # Only the first "=" separates key from value, so values may themselves
    # contain "=". Entries without a key/value pair, or with an empty value,
    # are dropped before building the hash so Hash[] only sees valid pairs.
    def attributes
      @attributes ||= begin
        pairs = options[:attribute].map { |attr| attr.split('=', 2) }
        Hash[pairs.select { |key, value| key && value && !value.empty? }]
      end
    end

    # Sends +event+ to Riemann after applying the command line tags, a default
    # TTL (--ttl, else twice the interval), the --event-host override and the
    # extra attributes. Reconnects the client when the send times out.
    def report(event)
      if options[:tag]
        # Work around a bug with beefcake which can't take frozen strings.
        event[:tags] = options[:tag].map(&:dup)
      end

      event[:ttl] ||= (options[:ttl] || (options[:interval] * 2))

      if options[:event_host]
        event[:host] = options[:event_host].dup
      end

      event = event.merge(attributes)

      begin
        Timeout::timeout(options[:timeout]) do
          riemann << event
        end
      rescue Timeout::Error
        riemann.connect
      end
    end

    # Builds a client for the configured host and port, using its TCP
    # transport unless --tcp was turned off.
    def new_riemann_client
      client = Riemann::Client.new(
        :host => options[:host],
        :port => options[:port]
      )
      options[:tcp] ? client.tcp : client
    end

    # Returns the Riemann client (cached).
    def riemann
      @riemann ||= new_riemann_client
    end
    alias :r :riemann

    # Calls tick forever. Exceptions raised by tick are printed to stderr and
    # do not stop the loop.
    def run
      t0 = Time.now
      loop do
        begin
          tick
        rescue => e
          $stderr.puts "#{e.class} #{e}\n#{e.backtrace.join "\n"}"
        end

        # Sleep until the next multiple of the interval counted from t0.
        sleep(options[:interval] - ((Time.now - t0) % options[:interval]))
      end
    end

    # Hook run once per interval; tools override it.
    def tick
    end
  end
end