riemann-tools 0.2.11 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.docker/Dockerfile +7 -0
  3. data/.docker/publish.sh +35 -0
  4. data/.github/workflows/ci.yml +29 -0
  5. data/.gitignore +6 -0
  6. data/.rspec +2 -0
  7. data/.travis.yml +31 -0
  8. data/CHANGELOG.md +393 -0
  9. data/Gemfile +6 -0
  10. data/ISSUE_TEMPLATE.md +15 -0
  11. data/README.markdown +17 -1
  12. data/Rakefile +21 -0
  13. data/bin/riemann-apache-status +1 -0
  14. data/bin/riemann-bench +1 -0
  15. data/bin/riemann-cloudant +1 -0
  16. data/bin/riemann-consul +3 -2
  17. data/bin/riemann-dir-files-count +1 -0
  18. data/bin/riemann-dir-space +1 -0
  19. data/bin/riemann-diskstats +1 -0
  20. data/bin/riemann-fd +1 -0
  21. data/bin/riemann-freeswitch +1 -0
  22. data/bin/riemann-haproxy +1 -0
  23. data/bin/riemann-health +87 -10
  24. data/bin/riemann-kvminstance +1 -0
  25. data/bin/riemann-memcached +1 -0
  26. data/bin/riemann-net +3 -2
  27. data/bin/riemann-nginx-status +1 -0
  28. data/bin/riemann-ntp +1 -0
  29. data/bin/riemann-portcheck +42 -0
  30. data/bin/riemann-proc +2 -1
  31. data/bin/riemann-varnish +1 -0
  32. data/bin/riemann-zookeeper +1 -0
  33. data/lib/riemann/tools/utils.rb +17 -0
  34. data/lib/riemann/tools/version.rb +7 -0
  35. data/lib/riemann/tools.rb +15 -5
  36. data/riemann-tools.gemspec +39 -0
  37. data/tools/riemann-aws/LICENSE +21 -0
  38. data/tools/riemann-aws/README.md +54 -0
  39. data/tools/riemann-aws/Rakefile.rb +35 -0
  40. data/tools/riemann-aws/bin/riemann-aws-billing +87 -0
  41. data/tools/riemann-aws/bin/riemann-aws-rds-status +54 -0
  42. data/tools/riemann-aws/bin/riemann-aws-sqs-status +44 -0
  43. data/tools/riemann-aws/bin/riemann-aws-status +71 -0
  44. data/tools/riemann-aws/bin/riemann-elb-metrics +167 -0
  45. data/tools/riemann-aws/bin/riemann-s3-list +82 -0
  46. data/tools/riemann-aws/bin/riemann-s3-status +99 -0
  47. data/tools/riemann-chronos/LICENSE +21 -0
  48. data/tools/riemann-chronos/README.md +10 -0
  49. data/tools/riemann-chronos/Rakefile.rb +35 -0
  50. data/tools/riemann-chronos/bin/riemann-chronos +144 -0
  51. data/tools/riemann-docker/LICENSE +21 -0
  52. data/tools/riemann-docker/README.md +10 -0
  53. data/tools/riemann-docker/Rakefile.rb +34 -0
  54. data/tools/riemann-docker/bin/riemann-docker +217 -0
  55. data/tools/riemann-elasticsearch/LICENSE +21 -0
  56. data/tools/riemann-elasticsearch/README.md +10 -0
  57. data/tools/riemann-elasticsearch/Rakefile.rb +35 -0
  58. data/tools/riemann-elasticsearch/bin/riemann-elasticsearch +166 -0
  59. data/tools/riemann-marathon/LICENSE +21 -0
  60. data/tools/riemann-marathon/README.md +10 -0
  61. data/tools/riemann-marathon/Rakefile.rb +35 -0
  62. data/tools/riemann-marathon/bin/riemann-marathon +147 -0
  63. data/tools/riemann-mesos/LICENSE +21 -0
  64. data/tools/riemann-mesos/README.md +10 -0
  65. data/tools/riemann-mesos/Rakefile.rb +35 -0
  66. data/tools/riemann-mesos/bin/riemann-mesos +131 -0
  67. data/tools/riemann-munin/LICENSE +21 -0
  68. data/tools/riemann-munin/README.md +10 -0
  69. data/tools/riemann-munin/Rakefile.rb +34 -0
  70. data/tools/riemann-munin/bin/riemann-munin +37 -0
  71. data/tools/riemann-rabbitmq/LICENSE +21 -0
  72. data/tools/riemann-rabbitmq/README.md +10 -0
  73. data/tools/riemann-rabbitmq/Rakefile.rb +35 -0
  74. data/tools/riemann-rabbitmq/bin/riemann-rabbitmq +269 -0
  75. data/tools/riemann-riak/LICENSE +21 -0
  76. data/tools/riemann-riak/README.md +10 -0
  77. data/tools/riemann-riak/Rakefile.rb +34 -0
  78. data/tools/riemann-riak/bin/riemann-riak +331 -0
  79. data/tools/riemann-riak/bin/riemann-riak-keys +13 -0
  80. data/tools/riemann-riak/bin/riemann-riak-ring +9 -0
  81. data/tools/riemann-riak/riak_status/key_count.erl +13 -0
  82. data/tools/riemann-riak/riak_status/riak_status.rb +152 -0
  83. data/tools/riemann-riak/riak_status/ringready.erl +9 -0
  84. metadata +130 -16
@@ -0,0 +1,269 @@
1
+ #!/usr/bin/env ruby
2
+ Process.setproctitle($0)
3
+
4
+ require 'riemann/tools'
5
+
6
+ class Riemann::Tools::Rabbitmq
7
+ include Riemann::Tools
8
+
9
+ require 'faraday'
10
+ require 'json'
11
+ require 'uri'
12
+
13
+
14
# HTTP client timeouts handed to Faraday for every request.
opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
opt :open_timeout, 'Faraday open timeout', type: :int, default: 1

# How to reach and authenticate against the RabbitMQ management API.
opt :monitor_user, 'RabbitMQ monitoring user', type: :string
opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15_672
opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: 'localhost'
opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false

# Queue-size alerting: the threshold, and queues exempt from it.
opt :max_queue_size, 'max number of items in a queue that is acceptable', type: :int, default: 1_000_000
opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string

# Nodes to check individually; check_node only runs when this is given.
opt :node, 'Specify a node to monitor', type: :strings
27
+
28
# Builds the root URL of the RabbitMQ management API, carrying the
# monitoring credentials as URL userinfo.
#
# The user name and password are percent-encoded so that reserved characters
# in them (for example "@", ":", "/" or a space) cannot corrupt the URL or
# make URI() reject it.
#
# Returns the URL as a String without a trailing slash.
def base_url
  scheme = options[:monitor_use_tls] ? 'https' : 'http'

  user, pass = [options[:monitor_user], options[:monitor_pass]].map do |credential|
    # encode_www_form_component renders a space as "+", which only means
    # "space" in form data; "%20" is unambiguous inside userinfo.
    URI.encode_www_form_component(credential.to_s).gsub('+', '%20')
  end

  "#{scheme}://#{user}:#{pass}@#{options[:monitor_host]}:#{options[:monitor_port]}/api"
end
35
+
36
# URL of the "overview" resource below base_url.
def overview_url
  format('%s/overview', base_url)
end

# URL of the "nodes/<n>" resource below base_url for the node named +n+.
def node_url(n)
  format('%s/nodes/%s', base_url, n)
end

# URL of the "queues" resource below base_url.
def queues_url
  format('%s/queues', base_url)
end
47
+
48
# Host name attached to reported events: options[:event_host] when it is
# set, otherwise the monitored host from options[:monitor_host].
def event_host
  options[:event_host] || options[:monitor_host]
end
55
+
56
# Performs a GET on +uri+ with the configured Faraday timeouts and reports
# the outcome under the "rabbitmq monitoring" service for +event_host+.
#
# Any StandardError raised inside the request/report sequence is turned into
# a critical event instead of propagating.
#
# Returns the Faraday response, or nil when the request raised.
def safe_get(uri, event_host)
  reply = nil

  begin
    reply = Faraday.new(uri).get do |request|
      request.options[:timeout] = options[:read_timeout]
      request.options[:open_timeout] = options[:open_timeout]
    end

    report(
      host: event_host,
      service: 'rabbitmq monitoring',
      state: 'ok',
      description: 'Monitoring operational'
    )
  rescue StandardError => error
    report(
      host: event_host,
      service: 'rabbitmq monitoring',
      state: 'critical',
      description: "HTTP connection error: #{error.class} - #{error.message}"
    )
  end

  reply
end
79
+
80
# Fetches the queue list and reports, for every queue, one health event plus
# one metric event per statistic.
#
# A queue is critical when it has ready messages but no consumers, or when
# its ready-message count exceeds options[:max_queue_size] (queues whose name
# matches options[:ignore_max_size_queues] are exempt from the size check).
#
# The body is parsed as JSON only after the HTTP status has been checked, so
# a non-JSON error page produces the critical "rabbitmq.queue" event instead
# of a parse exception.
def check_queues
  response = safe_get(queues_url, event_host)
  # safe_get has already reported the connection failure.
  return if response.nil?

  if response.status != 200
    report(
      host: event_host,
      service: 'rabbitmq.queue',
      state: 'critical',
      description: "HTTP connection error to /api/queues: #{response.status} - #{response.body}"
    )
    return
  end

  report(
    host: event_host,
    service: 'rabbitmq.queue',
    state: 'ok',
    description: 'HTTP connection ok'
  )

  size_check_exempt = options[:ignore_max_size_queues] && Regexp.new(options[:ignore_max_size_queues])

  JSON.parse(response.body).each do |queue|
    svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}"
    ready = queue['messages_ready']

    errs = []
    errs << 'Queue has jobs but no consumers' if ready && ready > 0 && queue['consumers'] == 0

    size_checked = size_check_exempt.nil? || queue['name'] !~ size_check_exempt
    errs << "Queue has #{ready} jobs" if size_checked && ready && ready > options[:max_queue_size]

    report(
      host: event_host,
      service: svc,
      state: errs.empty? ? 'ok' : 'critical',
      description: errs.empty? ? 'Queue is looking good' : errs.join('; ')
    )

    stats = (queue['message_stats'] || {}).merge(
      'messages' => queue['messages'],
      'messages_details' => queue['messages_details'],
      'messages_ready' => ready,
      'messages_ready_details' => queue['messages_ready_details'],
      'messages_unacknowledged' => queue['messages_unacknowledged'],
      'messages_unacknowledged_details' => queue['messages_unacknowledged_details'],
      'consumers' => queue['consumers'],
      'memory' => queue['memory']
    )

    stats.each_pair do |key, value|
      # "*_details" entries are objects; their 'rate' member is the metric.
      metric = key.end_with?('details') && value.is_a?(Hash) ? value['rate'] : value

      # TODO: Set state via thresholds which can be configured

      report(
        host: event_host,
        service: "#{svc}.#{key}",
        metric: metric,
        description: 'RabbitMQ monitor'
      )
    end
  end
end
163
+
164
# Fetches the cluster overview and reports every entry of its
# message_stats, queue_totals and object_totals sections as a metric.
#
# The body is parsed as JSON only after the HTTP status has been checked, so
# a non-JSON error page produces the critical event instead of a parse
# exception. Sections that are absent or empty are skipped.
def check_overview
  response = safe_get(URI(overview_url), event_host)
  # safe_get has already reported the connection failure.
  return if response.nil?

  if response.status != 200
    report(
      host: event_host,
      service: 'rabbitmq',
      state: 'critical',
      description: "HTTP connection error: #{response.status} - #{response.body}"
    )
    return
  end

  # NOTE(review): success is reported under "rabbitmq monitoring" while the
  # failure above uses "rabbitmq"; kept as-is, confirm this is intended.
  report(
    host: event_host,
    service: 'rabbitmq monitoring',
    state: 'ok',
    description: 'HTTP connection ok'
  )

  overview = JSON.parse(response.body)

  %w[message_stats queue_totals object_totals].each do |stat|
    section = overview[stat]
    # Brand new servers can have blank message stats; skip those sections.
    next if section.nil? || section.empty?

    section.each_pair do |key, value|
      # "*_details" entries are objects; their 'rate' member is the metric.
      metric = key.end_with?('details') && value.is_a?(Hash) ? value['rate'] : value

      # TODO: Set state via thresholds which can be configured

      report(
        host: event_host,
        service: "rabbitmq.#{stat}.#{key}",
        metric: metric,
        description: 'RabbitMQ monitor'
      )
    end
  end
end
209
+
210
# Checks every node listed in opts[:node] and reports one
# "rabbitmq.node.<name>" event per node.
#
# Each node is evaluated independently: a failed request, an HTTP error or
# a raised alarm on one node does not stop the remaining nodes from being
# checked.
def check_node
  opts[:node].each do |n|
    response = safe_get(URI(node_url(n)), event_host)
    # safe_get has already reported the connection failure for this node.
    next if response.nil?

    state, description = node_health(response)
    report(
      host: event_host,
      service: "rabbitmq.node.#{n}",
      state: state,
      description: description
    )
  end
end

# Maps a node API response to the [state, description] pair to report.
def node_health(response)
  return ['critical', 'Node was not found in the cluster'] if response.status == 404
  return ['critical', "HTTP error: #{response.status} - #{response.body}"] if response.status != 200

  node = JSON.parse(response.body)
  if node['mem_alarm']
    ['critical', 'Memory alarm has triggered; job submission throttled']
  elsif node['disk_free_alarm']
    ['critical', 'Disk free alarm has triggered; job submission throttled']
  else
    ['ok', 'Node looks OK to me']
  end
end
261
+
262
# One monitoring pass: the cluster overview first, then the individual
# nodes (only when opts[:node] was given), then the queues.
def tick
  check_overview
  monitored_nodes = opts[:node]
  check_node if monitored_nodes
  check_queues
end
267
+ end
268
+ Riemann::Tools::Rabbitmq.run
269
+
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2011 Kyle Kingsbury
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,10 @@
1
+ # Riemann Riak
2
+
3
+ Gathers Riak statistics and submits them to Riemann.
4
+
5
+ ## Getting started
6
+
7
+ ```
8
+ gem install riemann-riak
9
+ riemann-riak --help
10
+ ```
@@ -0,0 +1,34 @@
1
require 'rubygems'
require 'rubygems/package_task'
require 'rdoc/task'
require 'find'

# Don't include resource forks in tarballs on Mac OS X.
ENV['COPY_EXTENDED_ATTRIBUTES_DISABLE'] = 'true'
ENV['COPYFILE_DISABLE'] = 'true'

# Gemspec
#
# The deprecated rubyforge_project and has_rdoc attributes are intentionally
# not set.
gemspec = Gem::Specification.new do |s|
  s.name = 'riemann-riak'
  s.version = '0.1.2'
  s.author = 'Kyle Kingsbury'
  s.email = 'aphyr@aphyr.com'
  s.homepage = 'https://github.com/riemann/riemann-tools'
  s.platform = Gem::Platform::RUBY
  s.summary = 'Submits riak stats to riemann.'
  s.license = 'MIT'

  s.add_dependency 'riemann-tools', '>= 0.2.13'
  s.add_dependency 'yajl-ruby', '>= 1.1.0'

  s.files = FileList['bin/*', 'LICENSE', 'README.md'].to_a
  # Dir.entries also yields "." and "..", which must not be listed as
  # executables of the gem.
  s.executables |= Dir.entries('bin/') - %w[. ..]

  s.required_ruby_version = '>= 1.8.7'
end

# The block is empty but kept, exactly as in the original call.
Gem::PackageTask.new gemspec do |_pkg|
end
@@ -0,0 +1,331 @@
1
+ #!/usr/bin/env ruby
2
+ Process.setproctitle($0)
3
+
4
+ # Forwards information on a Riak node to Riemann.
5
+
6
+ require 'riemann/tools'
7
+
8
+ class Riemann::Tools::Riak
9
+ include Riemann::Tools
10
+ require 'net/http'
11
+ require 'net/https'
12
+ require 'yajl/json_gem'
13
+
14
# Where to find the Riak node and how to talk to it.
opt :riak_host, 'Riak host for stats <IP> or SSL http(s)://<IP>', default: Socket.gethostname
opt :data_dir, 'Riak data directory', default: '/var/lib/riak'
opt :stats_port, 'Riak HTTP port for stats', default: 8098
opt :stats_path, 'Riak HTTP stats path', default: '/stats'
opt :node_name, 'Riak erlang node name', default: "riak@#{Socket.gethostname}"
opt :cookie, 'Riak cookie to use', default: 'riak'

# Warning thresholds for the FSM time percentiles, read by fsm_state.
opt :get_50_warning, 'FSM 50% get time warning threshold (ms)', default: 1000
opt :put_50_warning, 'FSM 50% put time warning threshold (ms)', default: 1000
opt :get_95_warning, 'FSM 95% get time warning threshold (ms)', default: 2000
opt :put_95_warning, 'FSM 95% put time warning threshold (ms)', default: 2000
opt :get_99_warning, 'FSM 99% get time warning threshold (ms)', default: 10_000
opt :put_99_warning, 'FSM 99% put time warning threshold (ms)', default: 10_000
27
+
28
# Probes the environment once at start-up.
#
# Runs detect_features, then issues a single GET for the stats path: when
# that raises, @httpstatus is set to false, otherwise to true. TLS is used
# when opts[:riak_host] has an https scheme, with certificate verification
# switched off.
def initialize
  detect_features

  @httpstatus =
    begin
      target = URI.parse(opts[:riak_host])
      # A bare host name or IP parses without a host component.
      target.host = opts[:riak_host] if target.host.nil?

      client = Net::HTTP.new(target.host, opts[:stats_port])
      client.use_ssl = (target.scheme == 'https')
      client.verify_mode = OpenSSL::SSL::VERIFY_NONE if client.use_ssl?
      client.start { |session| session.get opts[:stats_path] }
      true
    rescue StandardError
      false
    end

  # Override the emulator flags so the user-supplied cookie is used.
  # This is done only once - hopefully it doesn't get overridden.
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
end
55
+
56
# Records which external tools `which` can locate: @escript for escript and
# @riakadmin for riak-admin. A tool counts as missing when the output of
# `which` contains an empty or whitespace-only line.
def detect_features
  @escript = `which escript` !~ /^\s*$/
  @riakadmin = `which riak-admin` !~ /^\s*$/
end
69
+
70
# Reports the "riak ring" service: ok when the ring-ready output starts a
# line with TRUE, warning otherwise. The output comes from the bundled
# riemann-riak-ring script when escript is available, else from
# `riak-admin ringready`; with neither tool nothing is reported.
def check_ring
  output =
    if @escript
      `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp
    elsif @riakadmin
      `riak-admin ringready`
    end

  return if output.nil?

  report(
    :host => opts[:riak_host],
    :service => 'riak ring',
    :state => (output =~ /^TRUE/ ? 'ok' : 'warning'),
    :description => output
  )
end
97
+
98
# Reports the "riak keys" service from the output of the bundled
# riemann-riak-keys script: the count as metric when the output is a plain
# number, otherwise an "unknown" event carrying the raw output.
def check_keys
  output = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp

  unless output =~ /^\d+$/
    return report(
      :host => opts[:riak_host],
      :service => 'riak keys',
      :state => 'unknown',
      :description => output
    )
  end

  report(
    :host => opts[:riak_host],
    :service => 'riak keys',
    :state => 'ok',
    :metric => output.to_i,
    :description => output
  )
end
117
+
118
# Reports the "riak transfers" service from `riak-admin transfers`:
# critical, with the partition count as metric, when this node is waiting
# to hand off partitions; ok with metric 0 otherwise. Does nothing when
# riak-admin is not available.
def check_transfers
  return unless @riakadmin

  output = `riak-admin transfers`

  # The node name is escaped so that its dots match literally instead of
  # acting as regex wildcards.
  waiting = /'#{Regexp.escape(opts[:node_name])}' waiting to handoff (\d+) partitions/
  pending = output[waiting, 1]

  if pending
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'critical',
      :metric => pending.to_i,
      :description => "waiting to handoff #{pending} partitions"
    )
  else
    report(
      :host => opts[:riak_host],
      :service => 'riak transfers',
      :state => 'ok',
      :metric => 0,
      :description => "No pending transfers"
    )
  end
end
145
+
146
# Reports the "riak disk" service: the size of opts[:data_dir] in GB as
# measured by `du`. When du prints no size (for example because the
# directory does not exist) an "unknown" event is reported instead of a
# misleading "0 GB".
def check_disk
  dir = opts[:data_dir]

  # Argument-vector form: no shell is involved, so a path containing spaces
  # or shell metacharacters is passed to du verbatim. -k pins the unit to
  # 1024-byte blocks regardless of the platform default.
  output = IO.popen(['du', '-L', '-s', '-k', dir.to_s], &:read)
  kilobytes = output[/\A\s*(\d+)/, 1]

  if kilobytes.nil?
    report(
      :host => opts[:riak_host],
      :service => 'riak disk',
      :state => 'unknown',
      :description => "could not measure disk usage of #{dir}"
    )
    return
  end

  gb = kilobytes.to_i / (1024.0**2)
  report(
    :host => opts[:riak_host],
    :service => 'riak disk',
    :state => 'ok',
    :metric => gb,
    :description => "#{gb} GB in #{dir}"
  )
end
156
+
157
# Name of the Riak stat for an FSM +type+, +property+ and +percentile+.
# The 50th percentile is published under the suffix "median".
def fsm_stat(type, property, percentile)
  suffix = percentile == 50 ? 'median' : percentile
  "node_#{type}_fsm_#{property}_#{suffix}"
end
161
+
162
# Alert state for an FSM measurement +val+, judged against the
# "<type>_<percentile>_warning" option: ok from 0 up to the limit, warning
# up to twice the limit, critical for anything else.
def fsm_state(type, percentile, val)
  limit = opts[:"#{type}_#{percentile}_warning"]

  return 'ok' if (0..limit).cover?(val)
  return 'warning' if (limit..limit * 2).cover?(val)

  'critical'
end
174
+
175
# Get current stats via HTTP.
#
# Returns the parsed JSON document served at opts[:stats_path].
#
# Raises whatever the HTTP request raised (after reporting a critical
# "riak" event), or a RuntimeError quoting status and body when the
# response code is not 200.
def stats_http
  begin
    uri = URI.parse(opts[:riak_host])
    # A bare host name or IP parses without a host component.
    host = uri.host || opts[:riak_host]

    http = Net::HTTP.new(host, opts[:stats_port])
    http.use_ssl = uri.scheme == 'https'
    # NOTE(review): certificate verification is disabled for https hosts;
    # kept as in the original, but it accepts any certificate.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

    res = http.start { |h| h.get opts[:stats_path] }
  rescue StandardError => e
    report(
      :host => opts[:riak_host],
      :service => 'riak',
      :state => 'critical',
      :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}"
    )
    raise
  end

  return JSON.parse(res.body) if res.code.to_i == 200

  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'critical',
    :description => "stats returned HTTP #{res.code}:\n\n#{res.body}"
  )
  raise "Can't fetch stats via HTTP: #{res.code}:\n\n#{res.body}"
end
212
+
213
# Get current stats via riak-admin.
#
# Parses the "name : value" lines of `riak-admin status` into a Hash of
# String => String. Lines without a " : " separator (headers, rulers, blank
# lines) are ignored, and only the first separator on a line splits it, so
# a value may itself contain " : ".
#
# Raises RuntimeError when riak-admin exits unsuccessfully.
def stats_riak_admin
  output = `riak-admin status`
  raise "riak-admin failed" unless $?.success?

  output.each_line.each_with_object({}) do |line, parsed|
    name, value = line.chomp.split(' : ', 2)
    parsed[name.strip] = value.strip unless value.nil?
  end
end
219
+
220
# Current stats as a hash, fetched over HTTP when the start-up probe
# succeeded, else through riak-admin when that is installed. With neither
# available a critical "riak" event is reported and a RuntimeError raised.
def stats
  return stats_http if @httpstatus
  return stats_riak_admin if @riakadmin

  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'critical',
    :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
  )
  raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available."
end
236
+
237
# Names of the Riak stats that check_stats reports as "riak <name>".
def core_services
  %w[
    vnode_gets
    vnode_puts
    node_gets
    node_puts
    node_gets_set
    node_puts_set
    read_repairs
  ]
end

# FSM type => property pairs that check_stats reports, one single-entry
# hash per pair.
def fsm_types
  [%w[get time], %w[put time], %w[get set_objsize]].map do |type, property|
    { type => property }
  end
end

# Percentiles reported for every FSM type/property pair.
def fsm_percentiles
  [50, 95, 99]
end
255
+
256
# Reports current stats to Riemann.
#
# When the stats cannot be fetched, every service this method normally
# reports is sent as critical with the error message. Otherwise it reports
# "riak" as ok, each core service as a rate (value / 60.0) and each FSM
# percentile; time percentiles are converted from microseconds to
# milliseconds and rated by fsm_state, other properties are reported
# unconverted with state ok.
def check_stats
  begin
    current = stats
  rescue StandardError => e
    failure = {
      :state => 'critical',
      :description => e.message,
      :host => opts[:riak_host]
    }
    report(failure.merge(:service => 'riak'))
    core_services.each do |s|
      report(failure.merge(:service => "riak #{s}"))
    end
    each_fsm_metric do |type, prop, percentile|
      report(failure.merge(:service => "riak #{type} #{prop} #{percentile}"))
    end
    return
  end

  # Riak itself
  report(
    :host => opts[:riak_host],
    :service => 'riak',
    :state => 'ok'
  )

  # Gets/puts/rr
  core_services.each do |s|
    per_second = current[s].to_i / 60.0
    report(
      :host => opts[:riak_host],
      :service => "riak #{s}",
      :state => 'ok',
      :metric => per_second,
      :description => "#{per_second}/sec"
    )
  end

  # FSMs
  each_fsm_metric do |type, prop, percentile|
    # to_i maps both a missing stat (nil) and the string "undefined" to 0.
    val = current[fsm_stat(type, prop, percentile)].to_i
    timed = prop == 'time'
    val /= 1000.0 if timed # Convert us to ms

    report(
      :host => opts[:riak_host],
      :service => "riak #{type} #{prop} #{percentile}",
      :state => timed ? fsm_state(type, percentile, val) : 'ok',
      :metric => val,
      # Only time properties are in milliseconds.
      :description => timed ? "#{val} ms" : val.to_s
    )
  end
end

# Yields every (type, property, percentile) combination of fsm_types and
# fsm_percentiles, in reporting order.
def each_fsm_metric
  fsm_types.each do |typespec|
    typespec.each do |type, prop|
      fsm_percentiles.each do |percentile|
        yield type, prop, percentile
      end
    end
  end
end
320
+
321
# One monitoring pass over stats, ring, disk and transfers, in that order.
def tick
  # check_keys is deliberately not run here: it can utterly destroy a
  # cluster.
  check_stats
  check_ring
  check_disk
  check_transfers
end
329
+ end
330
+
331
+ Riemann::Tools::Riak.run
@@ -0,0 +1,13 @@
1
#!/usr/bin/env escript
%%! -name riakstatuscheck@127.0.0.1 -hidden

%% Prints the sum of the per-vnode counts returned by
%% riak_kv_bitcask_backend:key_counts/0 on the given node.
%% Usage: riemann-riak-keys [Node]   (Node defaults to riak@127.0.0.1)
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
    io:format("~w\n", [
        lists:foldl(
            fun({_VNode, Count}, Sum) -> Sum + Count end,
            0,
            rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, [])
        )
    ]).
@@ -0,0 +1,9 @@
1
#!/usr/bin/env escript
%%! -name riakstatuscheck@127.0.0.1 -hidden

%% Prints the result of riak_kv_console:ringready/1 called on the given node.
%% Usage: riemann-riak-ring [Node]   (Node defaults to riak@127.0.0.1)
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
    io:format("~p\n", [
        rpc:call(list_to_atom(Node), riak_kv_console, ringready, [[]])
    ]).
@@ -0,0 +1,13 @@
1
#!/usr/bin/env escript
%%! -name riakstatuscheck -setcookie riak -hidden

%% Prints the sum of the per-vnode counts returned by
%% riak_kv_bitcask_backend:key_counts/0 on the given node.
%% Usage: key_count.erl [Node]   (Node defaults to riak@127.0.0.1)
main([]) -> main(["riak@127.0.0.1"]);
main([Node]) ->
    io:format("~w\n", [
        lists:foldl(
            fun({_VNode, Count}, Sum) -> Sum + Count end,
            0,
            rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, [])
        )
    ]).