riemann-tools 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
# Polls the Elasticsearch cluster health endpoint and reports the overall
# cluster status, plus every numeric health counter, to Riemann.
class Riemann::Tools::Elasticsearch
  include Riemann::Tools

  require 'faraday'
  require 'json'

  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
  opt :es_host, 'Elasticsearch host', default: "localhost"
  opt :es_port, 'Elasticsearch port', type: :int, default: 9200

  # Riemann state reported for each Elasticsearch cluster status colour.
  CLUSTER_STATES = {
    "green"  => "ok",
    "yellow" => "warning",
    "red"    => "critical"
  }.freeze

  # Handles HTTP connections and GET requests safely.
  #
  # Performs a GET against +uri+ using the configured open/read timeouts.
  # Returns the response, or nil when the request raised; in that case a
  # critical "elasticsearch health" event has already been reported.
  def safe_get(uri)
    connection = Faraday.new(uri)
    connection.get do |req|
      req.options[:timeout] = options[:read_timeout]
      req.options[:open_timeout] = options[:open_timeout]
    end
  rescue => e
    report_critical(uri.host, "HTTP connection error: #{e.class} - #{e.message}")
    nil
  end

  # URL of the cluster health endpoint built from the es_host/es_port options.
  def health_url
    "http://#{options[:es_host]}:#{options[:es_port]}/_cluster/health"
  end

  # Fetches the cluster health document once and turns it into events:
  # one "elasticsearch health" event carrying the cluster state, and one
  # "elasticsearch <key>" metric event per numeric field of the document.
  def tick
    uri = URI(health_url)
    response = safe_get(uri)
    return if response.nil?

    if response.status != 200
      report_critical(uri.host, "HTTP connection error: #{response.status} - #{response.body}")
      return
    end

    # A 200 is expected to carry JSON; report instead of raising when it does not.
    begin
      json = JSON.parse(response.body)
    rescue JSON::ParserError => e
      report_critical(uri.host, "Invalid JSON in health response: #{e.message}")
      return
    end

    cluster_name = json.delete("cluster_name")
    cluster_status = json.delete("status")

    report(:host => uri.host,
           :service => "elasticsearch health",
           # Anything other than green/yellow/red is surfaced as "unknown"
           # rather than as an event without a state.
           :state => CLUSTER_STATES.fetch(cluster_status, "unknown"),
           :description => "Elasticsearch cluster: #{cluster_name} - #{cluster_status}")

    json.each_pair do |k, v|
      # Only numeric fields can be sent as a metric; skip booleans and
      # strings that remain in the health document.
      next unless v.is_a?(Numeric)

      report(:host => uri.host,
             :service => "elasticsearch #{k}",
             :metric => v,
             :description => "Elasticsearch cluster #{k}")
    end
  end

  private

  # Reports a critical "elasticsearch health" event for +host+.
  def report_critical(host, description)
    report(:host => host,
           :service => "elasticsearch health",
           :state => "critical",
           :description => description)
  end
end
86
+ Riemann::Tools::Elasticsearch.run
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ $0 = __FILE__
6
+
7
# Pulls the last minute of CloudWatch statistics for a list of ELBs and
# reports each returned statistic to Riemann as a separate event.
class Riemann::Tools::ELBMetrics
  include Riemann::Tools

  require 'fog'
  require 'time'

  opt :fog_credentials_file, "Fog credentials file", :type => String
  opt :fog_credential, "Fog credentials to use", :type => String
  opt :aws_access, "AWS Access Key", :type => String
  opt :aws_secret, "AWS Secret Key", :type => String
  opt :aws_region, "AWS Region", :type => String, :default => "eu-west-1"
  opt :aws_azs, "List of AZs to aggregate against", :type => :strings, :default => [ "all_az" ]
  opt :elbs, "List of ELBs to pull metrics from", :type => :strings, :required => true

  # Returns the per-metric query options, keyed by ELB metric name.
  #
  # ELB metric types, from:
  # http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/CW_Support_For_AWS.html#elb-metricscollected
  #
  # Every entry uses the singular "Unit" key, which is the name of the
  # GetMetricStatistics request parameter.
  def standard_metrics
    {
      "Latency" => {
        "Unit" => "Seconds",
        "Statistics" => [ "Maximum", "Minimum", "Average" ]
      },
      "RequestCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HealthyHostCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Minimum", "Maximum", "Average" ]
      },
      "UnHealthyHostCount" => {
        "Unit" => "Count",
        "Statistics" => [ "Minimum", "Maximum", "Average" ]
      },
      "HTTPCode_ELB_4XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_ELB_5XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_2XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_3XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_4XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      },
      "HTTPCode_Backend_5XX" => {
        "Unit" => "Count",
        "Statistics" => [ "Sum" ]
      }
    }
  end

  # Returns the base query that all metrics share: the AWS/ELB namespace
  # and a single 60 second period ending now (UTC, ISO 8601).
  def base_metrics
    # Take one timestamp so both ends of the window derive from the same instant.
    now = Time.now.utc

    {
      "Namespace" => "AWS/ELB",
      "StartTime" => (now - 60).iso8601,
      "EndTime" => now.iso8601,
      "Period" => 60,
    }
  end

  # Queries CloudWatch for every ELB / availability zone / metric
  # combination and reports each statistic of the returned datapoint.
  def tick
    connection = cloudwatch_connection

    options[:elbs].each do |lb|
      metric_options = standard_metrics
      metric_base_options = base_metrics

      options[:aws_azs].each do |az|
        metric_options.keys.sort.each do |metric_type|
          merged_options = metric_base_options.merge(metric_options[metric_type])
          merged_options["MetricName"] = metric_type
          merged_options["Dimensions"] = dimensions_for(lb, az)

          result = connection.get_metric_statistics(merged_options)
          datapoints = result.body["GetMetricStatisticsResult"]["Datapoints"]

          # "If no response codes in the category 2XX-5XX range are sent to clients within
          # the given time period, values for these metrics will not be recorded in CloudWatch"
          #
          # BUG:
          # Metrics are reported every 60 seconds, but sometimes there isn't one there yet.
          # We can skip that, or do something else?
          next if datapoints.nil? || datapoints.empty?

          # We should only ever have a single data point
          datapoint = datapoints[0]
          unit = datapoint["Unit"]

          datapoint.keys.sort.each do |stat_type|
            # "Unit" and "Timestamp" describe the datapoint; they are not statistics.
            next if stat_type == "Unit" || stat_type == "Timestamp"

            report(
              host: lb,
              service: "elb.#{az}.#{metric_type}.#{stat_type}",
              ttl: 60,
              description: "#{lb} #{metric_type} #{stat_type} (#{unit})",
              tags: [ "production", "elb_metrics" ],
              metric: datapoint[stat_type]
            )
          end
        end
      end
    end
  end

  private

  # Builds the CloudWatch connection, either from a Fog credentials file or
  # from the explicit access key / secret key / region options.
  def cloudwatch_connection
    if options[:fog_credentials_file]
      Fog.credentials_path = options[:fog_credentials_file]
      # The named credential is optional; only select it when one was given
      # instead of calling to_sym on nil.
      Fog.credential = options[:fog_credential].to_sym if options[:fog_credential]
      Fog::AWS::CloudWatch.new
    else
      Fog::AWS::CloudWatch.new({
        :aws_access_key_id => options[:aws_access],
        :aws_secret_access_key => options[:aws_secret],
        :region => options[:aws_region]
      })
    end
  end

  # Returns the CloudWatch dimensions for +lb+; the pseudo zone "all_az"
  # means no AvailabilityZone dimension is added.
  def dimensions_for(lb, az)
    dimensions = [ { "Name" => "LoadBalancerName", "Value" => lb } ]
    dimensions << { "Name" => "AvailabilityZone", "Value" => az } unless az == "all_az"
    dimensions
  end
end
153
+
154
+ Riemann::Tools::ELBMetrics.run
data/bin/riemann-fd ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current file descriptor use to riemann.
4
+ # By default reports the total system fd usage, can also report usage of individual processes
5
+
6
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
7
+
8
# Reports open file descriptor usage to Riemann: the system-wide total and,
# optionally, the usage of individual processes given by PID.
class Riemann::Tools::Health
  include Riemann::Tools

  opt :fd_sys_warning, "open file descriptor threshold for system", :default => 800
  opt :fd_sys_critical, "open file descriptor critical threshold for system", :default => 900
  opt :fd_proc_warning, "open file descriptor threshold for process", :default => 800
  opt :fd_proc_critical, "open file descriptor critical threshold for process", :default => 900
  opt :processes, "list of processes to measure fd usage in addition to system total", :type => :ints

  # Captures the configured thresholds and selects the collector method.
  # Only the Linux collector exists; other platforms fall back to it with a warning.
  def initialize
    @limits = {
      :fd => {:critical => opts[:fd_sys_critical], :warning => opts[:fd_sys_warning]},
      :process => {:critical => opts[:fd_proc_critical], :warning => opts[:fd_proc_warning]},
    }
    ostype = `uname -s`.chomp.downcase
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"
    @fd = method :linux_fd
  end

  # Reports a single event; +state+ is stringified and +metric+ is sent as a float.
  def alert(service, state, metric, description)
    report(
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    )
  end

  # Counts lsof output lines for the whole system and for each configured
  # PID, and reports each count with a state derived from its thresholds.
  def linux_fd
    sys_used = Integer(`lsof | wc -l`)
    alert "fd sys", fd_state(sys_used, @limits[:fd]), sys_used, "system is using #{sys_used} fds"

    # Array() turns a missing --processes option (nil) into an empty list.
    Array(opts[:processes]).each do |process|
      used = Integer(`lsof -p #{process} | wc -l`)
      name = process_name(process)
      alert "fd #{name} #{process}", fd_state(used, @limits[:process]), used,
            "process #{name} #{process} is using #{used} fds"
    end
  end

  def tick
    @fd.call
  end

  private

  # Maps a usage count onto :critical / :warning / :ok for the given limits hash.
  def fd_state(used, limits)
    if used > limits[:critical]
      :critical
    elsif used > limits[:warning]
      :warning
    else
      :ok
    end
  end

  # Returns the first word of the command name of +pid+, or nil when ps
  # prints nothing. Asking ps for that PID directly avoids grepping the
  # whole process table, where the number could also match unrelated rows.
  def process_name(pid)
    `ps -p #{pid} -o comm=`.split.first
  end
end
65
+
66
+ Riemann::Tools::Health.run
data/bin/riemann-net CHANGED
@@ -30,6 +30,7 @@ class Riemann::Tools::Net
30
30
  'rx multicast',
31
31
  'tx bytes',
32
32
  'tx packets',
33
+ 'tx errs',
33
34
  'tx drops',
34
35
  'tx fifo',
35
36
  'tx colls',
@@ -49,7 +49,25 @@ class Riemann::Tools::NginxStatus
49
49
  end
50
50
 
51
51
  def tick
52
- response = Net::HTTP.get(@uri)
52
+ response = nil
53
+ begin
54
+ response = Net::HTTP.get(@uri)
55
+ rescue => e
56
+ report(
57
+ :service => "nginx health",
58
+ :state => "critical",
59
+ :description => "Connection error: #{e.class} - #{e.message}"
60
+ )
61
+ end
62
+
63
+ return if response.nil?
64
+
65
+ report(
66
+ :service => "nginx health",
67
+ :state => "ok",
68
+ :description => "Nginx status connection ok"
69
+ )
70
+
53
71
  values = @re.match(response).to_a[1,7].map { |v| v.to_i }
54
72
 
55
73
  @keys.zip(values).each do |key, value|
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+
4
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
5
+
6
# Polls the RabbitMQ management API overview endpoint and reports the
# message, queue and object statistics it contains to Riemann.
class Riemann::Tools::Rabbitmq
  include Riemann::Tools

  require 'faraday'
  require 'json'
  require 'uri'


  opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
  opt :open_timeout, 'Faraday open timeout', type: :int, default: 1

  opt :monitor_user, 'RabbitMQ monitoring user', type: :string
  opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
  opt :monitor_port, 'RabbitMQ monitoring port', default: 15672
  opt :monitor_host, 'RabbitMQ monitoring host', default: "localhost"

  # URL of the overview endpoint, with the monitoring credentials embedded.
  # NOTE(review): user and password are interpolated unescaped — characters
  # that are special in a URL would presumably break parsing; confirm.
  def monitor_url
    "http://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api/overview"
  end

  # Host name attached to events: the event_host option when set,
  # otherwise the monitored RabbitMQ host.
  def event_host
    options[:event_host] || options[:monitor_host]
  end

  # Performs a GET against +uri+ using the configured open/read timeouts.
  # Returns the response, or nil when the request raised; in that case a
  # critical "rabbitmq monitoring" event for +event_host+ has been reported.
  def safe_get(uri, event_host)
    connection = Faraday.new(uri)
    connection.get do |req|
      req.options[:timeout] = options[:read_timeout]
      req.options[:open_timeout] = options[:open_timeout]
    end
  rescue => e
    report(:host => event_host,
           :service => "rabbitmq monitoring",
           :state => "critical",
           :description => "HTTP connection error: #{e.class} - #{e.message}")
    nil
  end

  # Fetches the overview document once and reports one metric event per
  # entry of its message_stats, queue_totals and object_totals sections.
  def tick
    uri = URI(monitor_url)
    response = safe_get(uri, event_host)
    return if response.nil?

    # Check the status before touching the body: an error response is not
    # guaranteed to be JSON, and parsing it first would raise before the
    # critical event could be sent.
    if response.status != 200
      report(:host => event_host,
             :service => "rabbitmq",
             :state => "critical",
             :description => "HTTP connection error: #{response.status} - #{response.body}")
      return
    end

    begin
      json = JSON.parse(response.body)
    rescue JSON::ParserError => e
      report(:host => event_host,
             :service => "rabbitmq monitoring",
             :state => "critical",
             :description => "Invalid JSON in overview response: #{e.message}")
      return
    end

    report(:host => event_host,
           :service => "rabbitmq monitoring",
           :state => "ok",
           :description => "HTTP connection ok")

    %w( message_stats queue_totals object_totals ).each do |stat|
      stats = json[stat]
      # NOTE / BUG ?
      # Brand new servers can have blank message stats. Is this ok?
      # A section that is missing altogether is skipped the same way.
      next if stats.nil? || stats.empty?

      stats.each_pair do |k, v|
        # "*_details" entries are hashes; their "rate" field is the metric.
        metric = k =~ /details$/ ? v['rate'] : v

        # TODO: Set state via thresholds which can be configured

        report(:host => event_host,
               :service => "rabbitmq.#{stat}.#{k}",
               :metric => metric,
               :description => "RabbitMQ monitor")
      end
    end
  end
end
99
+ Riemann::Tools::Rabbitmq.run
data/bin/riemann-redis CHANGED
@@ -34,25 +34,32 @@ class Riemann::Tools::Redis
34
34
  end
35
35
 
36
36
  def tick
37
- @redis.info(@section).each do |property, value|
38
- data = {
39
- :host => opts[:redis_host],
40
- :service => "redis #{property}",
41
- :metric => value.to_f,
42
- :state => 'ok',
43
- :tags => ['redis']
44
- }
45
-
46
- if STRING_VALUES.include?(property) || property.match(/^db\d+/)
47
- if %w{ rdb_last_bgsave_status aof_last_bgrewrite_status }.include?(property)
48
- data[:state] = value
49
- else
50
- data[:description] = value
37
+ begin
38
+ @redis.info(@section).each do |property, value|
39
+ data = {
40
+ :host => opts[:redis_host],
41
+ :service => "redis #{property}",
42
+ :metric => value.to_f,
43
+ :state => value.to_s,
44
+ :tags => ['redis']
45
+ }
46
+
47
+ if STRING_VALUES.include?(property) || property.match(/^db\d+/)
48
+ if %w{ rdb_last_bgsave_status aof_last_bgrewrite_status }.include?(property)
49
+ data[:state] = value
50
+ else
51
+ data[:description] = value
52
+ end
51
53
  end
52
- end
53
54
 
54
- report(data)
55
+ report(data)
56
+ end
57
+ rescue ::Redis::CommandError => e
58
+ if e.message == "ERR operation not permitted"
59
+ @redis.auth(opts[:redis_password]) unless opts[:redis_password] == ''
60
+ end
55
61
  end
62
+
56
63
  end
57
64
 
58
65
  end
data/bin/riemann-resmon CHANGED
@@ -65,6 +65,11 @@ class Riemann::Tools::Resmon
65
65
  )
66
66
  next
67
67
  else
68
+ report(:host => event_host,
69
+ :service => "resmon",
70
+ :state => "ok",
71
+ :description => "Resmon connection ok"
72
+ )
68
73
  doc = Nokogiri::XML(response.body)
69
74
  end
70
75
 
@@ -78,8 +83,10 @@ class Riemann::Tools::Resmon
78
83
  }
79
84
 
80
85
  case metric.attributes['type'].value
81
- when /[iIlLn]/
82
- hash[:metric] = metric.text
86
+ when /[iIlL]/
87
+ hash[:metric] = metric.text.to_i
88
+ when 'n'
89
+ hash[:metric] = metric.text.to_f
83
90
  when 's'
84
91
  hash[:description] = metric.text
85
92
  when '0'
data/bin/riemann-riak CHANGED
@@ -5,12 +5,13 @@
5
5
  require File.expand_path('../../lib/riemann/tools', __FILE__)
6
6
 
7
7
  require 'net/http'
8
+ require 'net/https'
8
9
  require 'yajl/json_gem'
9
10
 
10
11
  class Riemann::Tools::Riak
11
12
  include Riemann::Tools
12
13
 
13
- opt :riak_host, "Riak host", :default => Socket.gethostname
14
+ opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
14
15
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
15
16
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
16
17
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
@@ -39,8 +40,17 @@ class Riemann::Tools::Riak
39
40
 
40
41
  if
41
42
  begin
42
- Net::HTTP.start(opts[:riak_host], opts[:stats_port]) do |http|
43
- http.get opts[:stats_path]
43
+ uri = URI.parse(opts[:riak_host])
44
+ if uri.host == nil
45
+ uri.host = opts[:riak_host]
46
+ end
47
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
48
+ http.use_ssl = uri.scheme == 'https'
49
+ if http.use_ssl?
50
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
51
+ end
52
+ http.start do |http|
53
+ http.get opts[:stats_path]
44
54
  end
45
55
  rescue => e
46
56
  @httpstatus = false
@@ -50,7 +60,7 @@ class Riemann::Tools::Riak
50
60
  # dynamically input the cookie
51
61
  # this is done only once - hopefully it doesn't get overridden.
52
62
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
53
-
63
+
54
64
  end
55
65
 
56
66
  def check_ring
@@ -129,9 +139,18 @@ class Riemann::Tools::Riak
129
139
  def check_stats
130
140
  if @httpstatus
131
141
  begin
132
- res = Net::HTTP.start(opts[:riak_host], opts[:stats_port]) do |http|
133
- http.get opts[:stats_path]
142
+ uri = URI.parse(opts[:riak_host])
143
+ if uri.host == nil
144
+ uri.host = opts[:riak_host]
134
145
  end
146
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
147
+ http.use_ssl = uri.scheme == 'https'
148
+ if http.use_ssl?
149
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
150
+ end
151
+ res = http.start do |http|
152
+ http.get opts[:stats_path]
153
+ end
135
154
  rescue => e
136
155
  report(
137
156
  :host => opts[:riak_host],
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: riemann-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-28 00:00:00.000000000 Z
12
+ date: 2013-08-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: riemann-client
@@ -148,15 +148,19 @@ executables:
148
148
  - riemann-redis-slowlog
149
149
  - riemann-bench
150
150
  - riemann-freeswitch
151
+ - riemann-elb-metrics
151
152
  - riemann-riak
152
153
  - riemann-health
153
154
  - riemann-memcached
154
155
  - riemann-haproxy
156
+ - riemann-elasticsearch
155
157
  - riemann-riak-keys
156
158
  - riemann-diskstats
159
+ - riemann-fd
157
160
  - riemann-riak-ring
158
161
  - riemann-cloudant
159
162
  - riemann-nginx-status
163
+ - riemann-rabbitmq
160
164
  - riemann-kvminstance
161
165
  - riemann-net
162
166
  - riemann-redis
@@ -168,6 +172,9 @@ files:
168
172
  - bin/riemann-bench
169
173
  - bin/riemann-cloudant
170
174
  - bin/riemann-diskstats
175
+ - bin/riemann-elasticsearch
176
+ - bin/riemann-elb-metrics
177
+ - bin/riemann-fd
171
178
  - bin/riemann-freeswitch
172
179
  - bin/riemann-haproxy
173
180
  - bin/riemann-health
@@ -176,6 +183,7 @@ files:
176
183
  - bin/riemann-munin
177
184
  - bin/riemann-net
178
185
  - bin/riemann-nginx-status
186
+ - bin/riemann-rabbitmq
179
187
  - bin/riemann-redis
180
188
  - bin/riemann-redis-slowlog
181
189
  - bin/riemann-resmon