riemann-tools.haf 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2011 Kyle Kingsbury
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,18 @@
1
+ Riemann Tools
2
+ =============
3
+
4
+ Tiny programs to submit events to Riemann.
5
+
6
+ Riemann-health, for example, submits events about the current CPU, load,
7
+ memory, and disk use. Bench submits randomly distributed metrics for load
8
+ testing. I've got a whole bunch of these internally for monitoring Redis, Riak,
9
+ queues, etc. Most have internal configuration dependencies, so it'll be a while
10
+ before I can extract them for re-use.
11
+
12
+ Get started
13
+ ==========
14
+
15
+ ``` bash
16
+ gem install riemann-tools
17
+ riemann-health --host my.riemann.server
18
+ ```
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'fog'
4
+ require 'date'
5
+
6
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
7
+
8
+ $0 = __FILE__ # Let's not expose our AWS keys in the process list
9
+
# Reports scheduled AWS instance events (for example retirements) for running
# EC2 instances to Riemann. One "aws_instance_status" event is sent per
# scheduled AWS event, keyed by the instance's private DNS name.
class Riemann::Tools::AWS
  include Riemann::Tools

  opt :access_key, "AWS access key", :type => String
  opt :secret_key, "Secret access key", :type => String
  opt :region, "AWS region", :type => String, :default => 'eu-west-1'

  opt :retirement_critical, "Number of days before retirement. Defaults to 2", :default => 2
  # NOTE(review): an explicit Integer type is declared because a nil default
  # carries no type information. This assumes `opt` forwards to a
  # Trollop-style parser, which would otherwise treat the option as a boolean
  # flag -- confirm against Riemann::Tools.
  opt :event_warning, "Number of days before event. Defaults to nil (i.e. when the event appears)", :type => Integer, :default => nil

  # Builds the Fog compute connection from the command-line credentials.
  def initialize
    @compute = Fog::Compute.new(:aws_access_key_id => opts[:access_key],
                                :aws_secret_access_key => opts[:secret_key],
                                :region => opts[:region],
                                :provider => 'AWS')
  end

  # Fetches the instance status set, matches it against the running servers
  # and reports every scheduled event found for them.
  def tick
    status_set = @compute.describe_instance_status.body["instanceStatusSet"]
    status_by_id = Hash[status_set.map { |entry| [entry["instanceId"], entry] }]

    @compute.servers.select { |s| s.state == "running" }.each do |server|
      status = status_by_id[server.id]
      # A running instance may have no entry in the status set; skip it
      # instead of failing on nil.
      next unless status

      (status['eventsSet'] || []).each do |event|
        report scheduled_event(server.private_dns_name, event)
      end
    end
  end

  private

  # Builds the Riemann event hash for one scheduled AWS event on +host+.
  def scheduled_event(host, event)
    {:host => host,
     :service => "aws_instance_status",
     :description => "#{event['code']}\n\nstart #{event['notBefore']}\nend #{event['notAfter']}\n\n#{event['description']}",
     :state => event_state(event),
     :ttl => 300}
  end

  # Decides the Riemann state for a scheduled AWS event:
  #
  # * "critical" for an instance retirement starting within
  #   --retirement-critical days;
  # * "warning" as soon as the event appears when --event-warning is unset,
  #   or once the event starts within --event-warning days;
  # * "ok" when --event-warning is set and the event is still further away.
  #
  # An event without a 'notBefore' value has no start date to compare with and
  # is reported as "warning".
  def event_state(event)
    starts_on = Date.parse(event['notBefore'].to_s) if event['notBefore']
    today = Date.today
    warning_days = opts[:event_warning]

    if event['code'] == 'instance-retirement' && starts_on &&
       today >= starts_on - opts[:retirement_critical]
      "critical"
    elsif !warning_days || starts_on.nil? || today >= starts_on - warning_days
      "warning"
    else
      "ok"
    end
  end
end

Riemann::Tools::AWS.run
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Connects to a server (first arg) and populates it with a constant stream of
4
+ # events for testing.
5
+
6
+ require 'rubygems'
7
+ require 'riemann/client'
8
+ require 'pp'
9
+
# Load generator for Riemann: keeps one slowly drifting metric per
# host/service pair and streams the resulting events to a server.
class Riemann::Bench
  attr_accessor :client, :hosts, :services, :states

  # Sets up the host and service names and the Riemann client. The server is
  # taken from the first command-line argument and defaults to localhost.
  def initialize
    @hosts = %w(a b c d e f g h i j)
    @services = %w(test1 test2 test3 foo bar baz xyzzy attack cat treat)
    @states = {}
    @client = Riemann::Client.new(:host => (ARGV.first || 'localhost'))
  end

  # Returns the successor of +state+: the metric moves by a random step of at
  # most 0.05 in either direction, is clamped to 0..1, and the state string
  # is derived from the new metric value.
  def evolve(state)
    m = state[:metric] + (rand - 0.5) * 0.1
    m = [[0, m].max, 1].min

    s = case m
        when 0...0.75
          'ok'
        when 0.75...0.9
          'warning'
        when 0.9..1.0
          'critical'
        end

    {
      :metric => m,
      :state => s,
      :host => state[:host],
      :service => state[:service],
      :description => "at #{Time.now}"
    }
  end

  # Evolves every host/service pair once and sends the new event.
  def tick
    hosts.product(services).each do |key|
      client << (states[key] = evolve(states[key]))
    end
  end

  # Seeds the initial states, then ticks forever with a 50 ms pause.
  def run
    start
    loop do
      sleep 0.05
      tick
    end
  end

  # Creates the initial state for every host/service pair.
  def start
    hosts.product(services).each do |host, service|
      states[[host, service]] = {
        :metric => 0.5,
        :state => 'ok',
        :description => "Starting up",
        :host => host,
        :service => service
      }
    end
  end
end

Riemann::Bench.new.run
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Gathers load balancer statistics from Cloudant.com (shared cluster) and submits them to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
# Gathers load balancer statistics from Cloudant.com (shared cluster) and
# submits them to Riemann: one health event per node plus one metric event
# per remaining node property.
class Riemann::Tools::Cloudant
  include Riemann::Tools
  require 'net/http'
  require 'json'

  opt :cloudant_username, "Cloudant username", :type => :string, :required => true
  opt :cloudant_password, "Cloudant password", :type => :string, :required => true

  # Node properties that describe the node rather than carry a metric.
  NON_METRIC_PROPERTIES = ['pxname', 'svname', 'status', 'tracked'].freeze

  # Fetches the load balancer report and emits the events for every node.
  def tick
    json = JSON.parse(get_json().body)
    json.each do |node|
      # The BACKEND row is just a sum of all nodes. Skip only that row:
      # `next` keeps processing the remaining nodes, whereas returning here
      # would silently drop everything listed after it.
      next if node['svname'] == 'BACKEND'

      ns = "cloudant #{node['pxname']}"
      cluster_name = node['tracked'].split('.').first # ie: meritage.cloudant.com
      state = (node['status'] == 'UP' ? 'ok' : 'critical')
      tags = ['cloudant', cluster_name]

      # report health of each node.
      report(
        :service => ns,
        :state => state,
        :tags => tags
      )

      # report property->metric of each node.
      node.each do |property, metric|
        next if NON_METRIC_PROPERTIES.include?(property)

        report(
          :host => node['tracked'],
          :service => "#{ns} #{property}",
          :metric => metric.to_f,
          :state => state,
          :tags => tags
        )
      end
    end
  end

  # Performs the authenticated HTTPS GET against the Cloudant API and returns
  # the HTTP response object.
  def get_json
    http = Net::HTTP.new('cloudant.com', 443)
    http.use_ssl = true
    http.start do |h|
      get = Net::HTTP::Get.new('/api/load_balancer')
      get.basic_auth opts[:cloudant_username], opts[:cloudant_password]
      h.request get
    end
  end
end

Riemann::Tools::Cloudant.run
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ require 'rubygems'
4
+ require 'riemann/tools'
5
+
# Reads the counters in /proc/diskstats and reports their per-second rate of
# change between two consecutive ticks to Riemann.
class Riemann::Tools::Diskstats
  include Riemann::Tools

  opt :devices, "Devices to monitor", :type => :strings, :default => nil
  opt :ignore_devices, "Devices to ignore", :type => :strings, :default => nil

  # Labels applied, in order, to the counters that follow the device name on
  # each /proc/diskstats line.
  COUNTER_NAMES = [
    'reads reqs',
    'reads merged',
    'reads sector',
    'reads time',
    'writes reqs',
    'writes merged',
    'writes sector',
    'writes time',
    'io reqs',
    'io time',
    'io weighted'
  ].freeze

  def initialize
    @old_state = nil
  end

  # Returns the current counters as a Hash of "<device> <counter name>" to
  # Integer, restricted by the --devices and --ignore-devices options.
  # Lines mentioning "ram" or "loop" are ignored.
  def state
    counters = {}

    File.read('/proc/diskstats').split("\n").each do |line|
      next if line =~ /(ram|loop)/
      next unless line =~ /^(?:\s+\d+){2}\s+([\w\d]+) (.*)$/

      dev = $1
      values = $2.split(/\s+/).map { |str| str.to_i }
      # Counters beyond the known labels are dropped by zip; a label without
      # a matching counter is stored as nil.
      COUNTER_NAMES.zip(values).each do |name, value|
        counters["#{dev} #{name}"] = value
      end
    end

    # Filter devices
    if wanted = opts[:devices]
      counters = counters.select do |service, _value|
        wanted.include? service.split(' ').first
      end
    end

    if ignored = opts[:ignore_devices]
      counters = counters.reject do |service, _value|
        ignored.include? service.split(' ').first
      end
    end

    counters
  end

  # Reports the change of every counter since the previous tick, divided by
  # the polling interval. The first tick only records a baseline.
  def tick
    state = self.state

    if @old_state
      state.each do |service, metric|
        previous = @old_state[service]
        # A device that appeared since the last tick (or a counter missing
        # from its line) has no baseline yet; skip it rather than subtract
        # from nil.
        next if metric.nil? || previous.nil?

        delta = metric - previous

        report(
          :service => "diskstats " + service,
          :metric => (delta.to_f / opts[:interval]),
          :state => "ok"
        )

        if service =~ /io time$/
          # NOTE(review): presumably "io time" is in milliseconds, which
          # would make this the fraction of the interval spent on I/O --
          # confirm against the kernel documentation.
          report(:service => "diskstats " + service.sub(/time\z/, 'util'),
                 :metric => (delta.to_f / (opts[:interval] * 1000)),
                 :state => "ok")
        end
      end
    end

    @old_state = state
  end
end

Riemann::Tools::Diskstats.run
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
# Polls the Elasticsearch cluster health endpoint and forwards the overall
# cluster status plus every remaining field of the response to Riemann.
class Riemann::Tools::Elasticsearch
  include Riemann::Tools

  require 'faraday'
  require 'json'

  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
  opt :es_host, 'Elasticsearch host', default: "localhost"
  opt :es_port, 'Elasticsearch port', type: :int, default: 9200

  # Issues a GET for +uri+ with the configured timeouts. Any error raised
  # while connecting or requesting is reported to Riemann as a critical
  # "elasticsearch health" event, and nil is returned instead of a response.
  def safe_get(uri)
    Faraday.new(uri).get do |request|
      request.options[:timeout] = options[:read_timeout]
      request.options[:open_timeout] = options[:open_timeout]
    end
  rescue => error
    report(:host => uri.host,
           :service => "elasticsearch health",
           :state => "critical",
           :description => "HTTP connection error: #{error.class} - #{error.message}")
    nil
  end

  # URL of the cluster health endpoint on the configured host and port.
  def health_url
    "http://#{options[:es_host]}:#{options[:es_port]}/_cluster/health"
  end

  # Fetches the cluster health once and reports it.
  def tick
    uri = URI(health_url)
    response = safe_get(uri)

    # safe_get has already reported the failure.
    return if response.nil?

    unless response.status == 200
      return report(:host => uri.host,
                    :service => "elasticsearch health",
                    :state => "critical",
                    :description => "HTTP connection error: #{response.status} - #{response.body}")
    end

    # Assuming that a 200 will give json
    health = JSON.parse(response.body)
    cluster_name = health.delete("cluster_name")
    cluster_status = health.delete("status")

    # Any status other than these three maps to a nil state.
    state_for_status = {
      "green" => "ok",
      "yellow" => "warning",
      "red" => "critical"
    }

    report(:host => uri.host,
           :service => "elasticsearch health",
           :state => state_for_status[cluster_status],
           :description => "Elasticsearch cluster: #{cluster_name} - #{cluster_status}")

    # Every remaining field of the health document becomes its own event.
    health.each_pair do |key, value|
      report(:host => uri.host,
             :service => "elasticsearch #{key}",
             :metric => value,
             :description => "Elasticsearch cluster #{key}")
    end
  end
end

Riemann::Tools::Elasticsearch.run
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ $0 = __FILE__
6
+
# Pulls the last minute of CloudWatch statistics for a list of Elastic Load
# Balancers and reports every returned statistic to Riemann.
class Riemann::Tools::ELBMetrics
  include Riemann::Tools

  require 'fog'
  require 'time'

  opt :fog_credentials_file, "Fog credentials file", :type => String
  opt :fog_credential, "Fog credentials to use", :type => String
  opt :aws_access, "AWS Access Key", :type => String
  opt :aws_secret, "AWS Secret Key", :type => String
  opt :aws_region, "AWS Region", :type => String, :default => "eu-west-1"
  opt :aws_azs, "List of AZs to aggregate against", :type => :strings, :default => [ "all_az" ]
  opt :elbs, "List of ELBs to pull metrics from", :type => :strings, :required => true

  # Returns the per-metric query options, keyed by CloudWatch metric name.
  # Every entry uses the "Unit" key so that all metrics are queried the same
  # way.
  def standard_metrics
    # ELB metric types, from:
    # http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/CW_Support_For_AWS.html#elb-metricscollected
    {
      "Latency" => {
        "Unit" => "Seconds",
        "Statistics" => ["Maximum", "Minimum", "Average" ]
      },
      "RequestCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HealthyHostCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Minimum", "Maximum", "Average" ]
      },
      "UnHealthyHostCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Minimum", "Maximum", "Average" ]
      },
      "HTTPCode_ELB_4XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_ELB_5XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_2XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_3XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_4XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_5XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      }
    }
  end

  # Returns the query options shared by all metrics: the AWS/ELB namespace
  # and a 60 second window ending now. Both ends of the window are derived
  # from a single clock reading.
  def base_metrics
    now = Time.now.utc

    {
      "Namespace" => "AWS/ELB",
      "StartTime" => (now - 60).iso8601,
      "EndTime" => now.iso8601,
      "Period" => 60,
    }
  end

  # Queries every configured metric for every ELB and availability zone and
  # reports each statistic of the first returned datapoint.
  def tick
    connection = cloudwatch_connection

    options[:elbs].each do |lb|
      metric_options = standard_metrics
      metric_base_options = base_metrics

      options[:aws_azs].each do |az|
        metric_options.keys.sort.each do |metric_type|
          query = metric_base_options.merge(metric_options[metric_type])
          query["MetricName"] = metric_type
          query["Dimensions"] = dimensions_for(lb, az)

          result = connection.get_metric_statistics(query)
          datapoints = result.body["GetMetricStatisticsResult"]["Datapoints"]

          # "If no response codes in the category 2XX-5XX range are sent to clients within
          # the given time period, values for these metrics will not be recorded in CloudWatch"
          #
          # BUG:
          # Metrics are reported every 60 seconds, but sometimes there isn't one there yet.
          # We can skip that, or do something else?
          next if datapoints.empty?

          # We should only ever have a single data point
          datapoint = datapoints[0]
          unit = datapoint["Unit"]

          datapoint.keys.sort.each do |stat_type|
            next if stat_type == "Unit" || stat_type == "Timestamp"

            report(
              host: lb,
              service: "elb.#{az}.#{metric_type}.#{stat_type}",
              ttl: 60,
              description: "#{lb} #{metric_type} #{stat_type} (#{unit})",
              tags: [ "production", "elb_metrics" ],
              metric: datapoint[stat_type]
            )
          end
        end
      end
    end
  end

  private

  # Builds the CloudWatch connection, either from a Fog credentials file or
  # from the explicit AWS key, secret and region options.
  def cloudwatch_connection
    if options[:fog_credentials_file]
      Fog.credentials_path = options[:fog_credentials_file]
      # Only select a named credential when one was given; calling to_sym on
      # a missing option would raise.
      Fog.credential = options[:fog_credential].to_sym if options[:fog_credential]
      Fog::AWS::CloudWatch.new
    else
      Fog::AWS::CloudWatch.new({
        :aws_access_key_id => options[:aws_access],
        :aws_secret_access_key => options[:aws_secret],
        :region => options[:aws_region]
      })
    end
  end

  # Returns the CloudWatch dimensions for +lb+; the pseudo zone "all_az"
  # means no availability-zone restriction.
  def dimensions_for(lb, az)
    dimensions = [ { "Name" => "LoadBalancerName", "Value" => lb } ]
    dimensions << { "Name" => "AvailabilityZone", "Value" => az } unless az == "all_az"
    dimensions
  end
end

Riemann::Tools::ELBMetrics.run