riemann-tools 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ class Riemann::Tools::Elasticsearch
6
+ include Riemann::Tools
7
+
8
+ require 'faraday'
9
+ require 'json'
10
+
11
+ opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
12
+ opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
13
+ opt :es_host, 'Elasticsearch host', default: "localhost"
14
+ opt :es_port, 'Elasticsearch port', type: :int, default: 9200
15
+
16
+
17
# Performs a GET against +uri+ using the configured Faraday timeouts.
#
# uri - passed straight to Faraday.new; it must also respond to #host, which
#       is used as the event host when the request fails.
#
# Returns the Faraday response, or nil when the request raised. On failure a
# critical "elasticsearch health" event describing the exception is reported.
def safe_get(uri)
  Faraday.new(uri).get do |req|
    req.options[:timeout] = options[:read_timeout]
    req.options[:open_timeout] = options[:open_timeout]
  end
rescue => e
  report(:host => uri.host,
         :service => "elasticsearch health",
         :state => "critical",
         :description => "HTTP connection error: #{e.class} - #{e.message}")
  # Callers test for nil to detect "no response"; do not leak report's result.
  nil
end
36
+
37
# Builds the cluster health endpoint URL from the :es_host and :es_port
# options. Returns a String.
def health_url
  format("http://%s:%s/_cluster/health", options[:es_host], options[:es_port])
end
40
+
41
# Polls the cluster health endpoint once and reports the outcome.
#
# Events reported:
# - critical "elasticsearch health" when the response status is not 200, or
#   when a 200 body is not a JSON object;
# - otherwise one "elasticsearch health" event whose state is derived from the
#   cluster status (green => ok, yellow => warning, red => critical, anything
#   else => unknown), followed by one "elasticsearch <key>" event for every
#   remaining field of the health document.
#
# Returns without reporting when safe_get returns nil.
def tick
  uri = URI(health_url)
  response = safe_get(uri)
  return if response.nil?

  if response.status != 200
    report(:host => uri.host,
           :service => "elasticsearch health",
           :state => "critical",
           :description => "HTTP connection error: #{response.status} - #{response.body}")
    return
  end

  begin
    json = JSON.parse(response.body)
    raise JSON::ParserError, "expected a JSON object" unless json.is_a?(Hash)
  rescue JSON::ParserError => e
    # A 200 is not guaranteed to carry JSON (e.g. a proxy answering for the
    # node); surface that as an event instead of raising out of the tick.
    report(:host => uri.host,
           :service => "elasticsearch health",
           :state => "critical",
           :description => "Invalid JSON in health response: #{e.class} - #{e.message}")
    return
  end

  cluster_name = json.delete("cluster_name")
  cluster_status = json.delete("status")
  state = case cluster_status
          when "green" then "ok"
          when "yellow" then "warning"
          when "red" then "critical"
          else "unknown" # never emit a nil state for an unexpected status
          end

  report(:host => uri.host,
         :service => "elasticsearch health",
         :state => state,
         :description => "Elasticsearch cluster: #{cluster_name} - #{cluster_status}")

  # cluster_name and status were removed above; everything left is reported.
  json.each_pair do |k, v|
    event = { :host => uri.host, :service => "elasticsearch #{k}" }
    if v.is_a?(Numeric)
      event[:metric] = v
      event[:description] = "Elasticsearch cluster #{k}"
    else
      # Non-numeric fields (booleans, strings) cannot be a metric; carry the
      # value in the description instead.
      event[:description] = "Elasticsearch cluster #{k}: #{v}"
    end
    report(event)
  end
end
82
+
83
+
84
+
85
+ end
86
+ Riemann::Tools::Elasticsearch.run
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ $0 = __FILE__
6
+
7
+ class Riemann::Tools::ELBMetrics
8
+ include Riemann::Tools
9
+
10
+ require 'fog'
11
+ require 'time'
12
+
13
+ opt :fog_credentials_file, "Fog credentials file", :type => String
14
+ opt :fog_credential, "Fog credentials to use", :type => String
15
+ opt :aws_access, "AWS Access Key", :type => String
16
+ opt :aws_secret, "AWS Secret Key", :type => String
17
+ opt :aws_region, "AWS Region", :type => String, :default => "eu-west-1"
18
+ opt :aws_azs, "List of AZs to aggregate against", :type => :strings, :default => [ "all_az" ]
19
+ opt :elbs, "List of ELBs to pull metrics from", :type => :strings, :required => true
20
+
21
# Returns the per-metric CloudWatch query options for the ELB metrics this
# tool collects, keyed by metric name. Each value holds the "Unit" and the
# list of "Statistics" to request.
#
# ELB metric types, from:
# http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/CW_Support_For_AWS.html#elb-metricscollected
#
# Every entry uses the key "Unit"; previously most entries used "Units",
# which did not match the key used by Latency and RequestCount.
# A fresh Hash (with fresh inner Hashes) is built on every call.
def standard_metrics
  metrics = {
    "Latency" => {
      "Unit" => "Seconds",
      "Statistics" => ["Maximum", "Minimum", "Average"]
    },
    "RequestCount" => {
      "Unit" => "Count",
      "Statistics" => ["Sum"]
    }
  }

  %w[HealthyHostCount UnHealthyHostCount].each do |name|
    metrics[name] = {
      "Unit" => "Count",
      "Statistics" => ["Minimum", "Maximum", "Average"]
    }
  end

  %w[
    HTTPCode_ELB_4XX HTTPCode_ELB_5XX
    HTTPCode_Backend_2XX HTTPCode_Backend_3XX
    HTTPCode_Backend_4XX HTTPCode_Backend_5XX
  ].each do |name|
    metrics[name] = {
      "Unit" => "Count",
      "Statistics" => ["Sum"]
    }
  end

  metrics
end
69
+
70
# Returns the query options shared by every metric request: the AWS/ELB
# namespace, a 60 second period, and a StartTime/EndTime window covering the
# last 60 seconds (UTC, ISO 8601).
#
# The clock is sampled once so the window is always exactly 60 seconds wide;
# sampling it separately for each bound can straddle a second boundary.
def base_metrics
  now = Time.now.utc

  {
    "Namespace" => "AWS/ELB",
    "StartTime" => (now - 60).iso8601,
    "EndTime" => now.iso8601,
    "Period" => 60
  }
end
85
+
86
+
87
# Fetches the latest CloudWatch datapoint for every configured ELB, AZ and
# metric, and reports one event per returned statistic.
#
# For each load balancer in options[:elbs] and each entry in options[:aws_azs]
# ("all_az" means: no AvailabilityZone dimension), every metric from
# standard_metrics is queried with the window from base_metrics. Each
# statistic of the first datapoint becomes an event with service
# "elb.<az>.<metric>.<statistic>".
def tick
  connection = cloudwatch_connection

  options[:elbs].each do |lb|
    metric_options = standard_metrics
    metric_base_options = base_metrics

    options[:aws_azs].each do |az|
      metric_options.keys.sort.each do |metric_type|
        query = metric_base_options.merge(metric_options[metric_type])
        query["MetricName"] = metric_type
        query["Dimensions"] = elb_dimensions(lb, az)

        result = connection.get_metric_statistics(query)
        datapoints = result.body["GetMetricStatisticsResult"]["Datapoints"]

        # "If no response codes in the category 2XX-5XX range are sent to
        # clients within the given time period, values for these metrics will
        # not be recorded in CloudWatch"
        #
        # BUG (from the original author): metrics are reported every 60
        # seconds, but sometimes there isn't one there yet. Such queries are
        # skipped.
        next if datapoints.empty?

        # We should only ever have a single data point
        datapoint = datapoints[0]
        unit = datapoint["Unit"]

        datapoint.keys.sort.each do |stat_type|
          # Unit and Timestamp describe the datapoint; they are not statistics.
          next if stat_type == "Unit" || stat_type == "Timestamp"

          # NOTE(review): the "production" tag is hard-coded here.
          report(
            host: lb,
            service: "elb.#{az}.#{metric_type}.#{stat_type}",
            ttl: 60,
            description: "#{lb} #{metric_type} #{stat_type} (#{unit})",
            tags: [ "production", "elb_metrics" ],
            metric: datapoint[stat_type]
          )
        end
      end
    end
  end
end

# Builds the CloudWatch client, either from a Fog credentials file or from the
# explicit AWS key/secret/region options.
def cloudwatch_connection
  if options[:fog_credentials_file]
    Fog.credentials_path = options[:fog_credentials_file]
    # :fog_credential is optional; only override Fog's current credential
    # when one was given (calling to_sym on nil would raise).
    Fog.credential = options[:fog_credential].to_sym if options[:fog_credential]
    Fog::AWS::CloudWatch.new
  else
    Fog::AWS::CloudWatch.new({
      :aws_access_key_id => options[:aws_access],
      :aws_secret_access_key => options[:aws_secret],
      :region => options[:aws_region]
    })
  end
end

# Returns the CloudWatch dimensions for one load balancer, narrowed to a
# single availability zone unless az is the "all_az" placeholder.
def elb_dimensions(lb, az)
  dimensions = [ { "Name" => "LoadBalancerName", "Value" => lb } ]
  dimensions << { "Name" => "AvailabilityZone", "Value" => az } unless az == "all_az"
  dimensions
end

private :cloudwatch_connection, :elb_dimensions
152
+ end
153
+
154
+ Riemann::Tools::ELBMetrics.run
data/bin/riemann-fd ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current file descriptor use to riemann.
4
+ # By default reports the total system fd usage, can also report usage of individual processes
5
+
6
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
7
+
8
+ class Riemann::Tools::Health
9
+ include Riemann::Tools
10
+
11
+ opt :fd_sys_warning, "open file descriptor threshold for system", :default => 800
12
+ opt :fd_sys_critical, "open file descriptor critical threshold for system", :default => 900
13
+ opt :fd_proc_warning, "open file descriptor threshold for process", :default => 800
14
+ opt :fd_proc_critical, "open file descriptor critical threshold for process", :default => 900
15
+ opt :processes, "list of processes to measure fd usage in addition to system total", :type => :ints
16
+
17
# Captures the warning/critical thresholds from the command line options and
# selects the fd collector.
#
# Only a Linux collector exists; on any other OS (as reported by `uname -s`) a
# warning is printed and the Linux collector is used anyway. The warning goes
# to stderr so it does not mix with regular stdout output.
def initialize
  @limits = {
    :fd => { :critical => opts[:fd_sys_critical], :warning => opts[:fd_sys_warning] },
    :process => { :critical => opts[:fd_proc_critical], :warning => opts[:fd_proc_warning] }
  }

  ostype = `uname -s`.chomp.downcase
  warn "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"

  @fd = method(:linux_fd)
end
26
+
27
# Reports one event. service and state are converted with to_s, metric with
# to_f; description is passed through unchanged.
def alert(service, state, metric, description)
  event = {
    :service => service.to_s,
    :state => state.to_s,
    :metric => metric.to_f,
    :description => description
  }
  report(event)
end
35
+
36
# Reports open file descriptor usage as counted by lsof: one "fd sys" event
# for the whole system and, when opts[:processes] is set, one
# "fd <name> <pid>" event per listed pid. Each event's state comes from
# comparing the count with the matching thresholds in @limits.
def linux_fd
  sys_used = Integer(`lsof | wc -l`)
  alert "fd sys", fd_state(sys_used, @limits[:fd]), sys_used, "system is using #{sys_used} fds"

  Array(opts[:processes]).each do |process|
    pid = Integer(process) # only ever interpolate a number into the shell
    used = Integer(`lsof -p #{pid} | wc -l`)
    # Ask ps for exactly this pid. Grepping the full process list for the
    # number can match unrelated lines and yield the wrong name.
    name = `ps -p #{pid} -o comm=`.strip
    alert "fd #{name} #{process}", fd_state(used, @limits[:process]), used,
          "process #{name} #{process} is using #{used} fds"
  end
end

# Maps a descriptor count to :critical, :warning or :ok using the strict
# "greater than" comparison against limits[:critical] / limits[:warning].
def fd_state(used, limits)
  if used > limits[:critical]
    :critical
  elsif used > limits[:warning]
    :warning
  else
    :ok
  end
end

private :fd_state
60
+
61
# Invokes the collector stored in @fd and returns its result.
def tick
  @fd.()
end
64
+ end
65
+
66
+ Riemann::Tools::Health.run
data/bin/riemann-net CHANGED
@@ -30,6 +30,7 @@ class Riemann::Tools::Net
30
30
  'rx multicast',
31
31
  'tx bytes',
32
32
  'tx packets',
33
+ 'tx errs',
33
34
  'tx drops',
34
35
  'tx fifo',
35
36
  'tx colls',
@@ -49,7 +49,25 @@ class Riemann::Tools::NginxStatus
49
49
  end
50
50
 
51
51
  def tick
52
- response = Net::HTTP.get(@uri)
52
+ response = nil
53
+ begin
54
+ response = Net::HTTP.get(@uri)
55
+ rescue => e
56
+ report(
57
+ :service => "nginx health",
58
+ :state => "critical",
59
+ :description => "Connection error: #{e.class} - #{e.message}"
60
+ )
61
+ end
62
+
63
+ return if response.nil?
64
+
65
+ report(
66
+ :service => "nginx health",
67
+ :state => "ok",
68
+ :description => "Nginx status connection ok"
69
+ )
70
+
53
71
  values = @re.match(response).to_a[1,7].map { |v| v.to_i }
54
72
 
55
73
  @keys.zip(values).each do |key, value|
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+
4
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
5
+
6
+ class Riemann::Tools::Rabbitmq
7
+ include Riemann::Tools
8
+
9
+ require 'faraday'
10
+ require 'json'
11
+ require 'uri'
12
+
13
+
14
+ opt :read_timeout, 'Faraday read timeout', type: :int, default: 2
15
+ opt :open_timeout, 'Faraday open timeout', type: :int, default: 1
16
+
17
+ opt :monitor_user, 'RabbitMQ monitoring user', type: :string
18
+ opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string
19
+ opt :monitor_port, 'RabbitMQ monitoring port', default: 15672
20
+ opt :monitor_host, 'RabbitMQ monitoring host', default: "localhost"
21
+
22
# Builds the management API overview URL, embedding the monitoring
# credentials as userinfo.
#
# User and password are percent-encoded so characters such as "@", "/", ":",
# "#" or a space cannot break the URL structure. Values without such
# characters produce exactly the same URL as plain interpolation.
# NOTE(review): assumes the HTTP client percent-decodes the userinfo before
# building the auth header - confirm for the Faraday version in use.
def monitor_url
  user, pass = [options[:monitor_user], options[:monitor_pass]].map do |part|
    # Form encoding writes a space as "+"; userinfo needs "%20".
    URI.encode_www_form_component(part.to_s).gsub("+", "%20")
  end

  "http://#{user}:#{pass}@#{options[:monitor_host]}:#{options[:monitor_port]}/api/overview"
end
25
+
26
# Host name to attach to reported events: options[:event_host] when it is
# set, otherwise the monitored RabbitMQ host (options[:monitor_host]).
def event_host
  options[:event_host] || options[:monitor_host]
end
33
+
34
# Performs a GET against +uri+ using the configured Faraday timeouts.
#
# uri        - passed straight to Faraday.new.
# event_host - host to attach to the failure event.
#
# Returns the Faraday response, or nil when the request raised. On failure a
# critical "rabbitmq monitoring" event describing the exception is reported.
def safe_get(uri, event_host)
  Faraday.new(uri).get do |req|
    req.options[:timeout] = options[:read_timeout]
    req.options[:open_timeout] = options[:open_timeout]
  end
rescue => e
  report(:host => event_host,
         :service => "rabbitmq monitoring",
         :state => "critical",
         :description => "HTTP connection error: #{e.class} - #{e.message}")
  # Callers test for nil to detect "no response"; do not leak report's result.
  nil
end
52
+
53
# Polls the management API overview once and reports the outcome.
#
# Events reported:
# - critical "rabbitmq monitoring" when the status is not 200, or when a 200
#   body is not a JSON object;
# - otherwise an ok "rabbitmq monitoring" event, followed by one
#   "rabbitmq.<section>.<key>" event per entry of the message_stats,
#   queue_totals and object_totals sections.
#
# The status is checked before the body is parsed, so an error page that is
# not JSON is reported rather than raising. The non-200 event uses the same
# "rabbitmq monitoring" service as the ok event so a later success replaces
# it.
#
# Returns without reporting when safe_get returns nil.
def tick
  uri = URI(monitor_url)
  response = safe_get(uri, event_host)
  return if response.nil?

  if response.status != 200
    report(:host => event_host,
           :service => "rabbitmq monitoring",
           :state => "critical",
           :description => "HTTP connection error: #{response.status} - #{response.body}")
    return
  end

  begin
    json = JSON.parse(response.body)
    raise JSON::ParserError, "expected a JSON object" unless json.is_a?(Hash)
  rescue JSON::ParserError => e
    report(:host => event_host,
           :service => "rabbitmq monitoring",
           :state => "critical",
           :description => "Invalid JSON in overview response: #{e.class} - #{e.message}")
    return
  end

  report(:host => event_host,
         :service => "rabbitmq monitoring",
         :state => "ok",
         :description => "HTTP connection ok")

  %w( message_stats queue_totals object_totals ).each do |stat|
    section = json[stat]
    # NOTE / BUG ? (from the original author)
    # Brand new servers can have blank message stats. Is this ok?
    # A section that is absent altogether is skipped the same way.
    next if section.nil? || section.empty?

    section.each_pair do |k, v|
      # "*details" entries are objects; their 'rate' field is the metric.
      metric = (k =~ /details$/) ? v['rate'] : v

      # TODO: Set state via thresholds which can be configured

      report(:host => event_host,
             :service => "rabbitmq.#{stat}.#{k}",
             :metric => metric,
             :description => "RabbitMQ monitor")
    end
  end
end
98
+ end
99
+ Riemann::Tools::Rabbitmq.run
data/bin/riemann-redis CHANGED
@@ -34,25 +34,32 @@ class Riemann::Tools::Redis
34
34
  end
35
35
 
36
36
  def tick
37
- @redis.info(@section).each do |property, value|
38
- data = {
39
- :host => opts[:redis_host],
40
- :service => "redis #{property}",
41
- :metric => value.to_f,
42
- :state => 'ok',
43
- :tags => ['redis']
44
- }
45
-
46
- if STRING_VALUES.include?(property) || property.match(/^db\d+/)
47
- if %w{ rdb_last_bgsave_status aof_last_bgrewrite_status }.include?(property)
48
- data[:state] = value
49
- else
50
- data[:description] = value
37
+ begin
38
+ @redis.info(@section).each do |property, value|
39
+ data = {
40
+ :host => opts[:redis_host],
41
+ :service => "redis #{property}",
42
+ :metric => value.to_f,
43
+ :state => value.to_s,
44
+ :tags => ['redis']
45
+ }
46
+
47
+ if STRING_VALUES.include?(property) || property.match(/^db\d+/)
48
+ if %w{ rdb_last_bgsave_status aof_last_bgrewrite_status }.include?(property)
49
+ data[:state] = value
50
+ else
51
+ data[:description] = value
52
+ end
51
53
  end
52
- end
53
54
 
54
- report(data)
55
+ report(data)
56
+ end
57
+ rescue ::Redis::CommandError => e
58
+ if e.message == "ERR operation not permitted"
59
+ @redis.auth(opts[:redis_password]) unless opts[:redis_password] == ''
60
+ end
55
61
  end
62
+
56
63
  end
57
64
 
58
65
  end
data/bin/riemann-resmon CHANGED
@@ -65,6 +65,11 @@ class Riemann::Tools::Resmon
65
65
  )
66
66
  next
67
67
  else
68
+ report(:host => event_host,
69
+ :service => "resmon",
70
+ :state => "ok",
71
+ :description => "Resmon connection ok"
72
+ )
68
73
  doc = Nokogiri::XML(response.body)
69
74
  end
70
75
 
@@ -78,8 +83,10 @@ class Riemann::Tools::Resmon
78
83
  }
79
84
 
80
85
  case metric.attributes['type'].value
81
- when /[iIlLn]/
82
- hash[:metric] = metric.text
86
+ when /[iIlL]/
87
+ hash[:metric] = metric.text.to_i
88
+ when 'n'
89
+ hash[:metric] = metric.text.to_f
83
90
  when 's'
84
91
  hash[:description] = metric.text
85
92
  when '0'
data/bin/riemann-riak CHANGED
@@ -5,12 +5,13 @@
5
5
  require File.expand_path('../../lib/riemann/tools', __FILE__)
6
6
 
7
7
  require 'net/http'
8
+ require 'net/https'
8
9
  require 'yajl/json_gem'
9
10
 
10
11
  class Riemann::Tools::Riak
11
12
  include Riemann::Tools
12
13
 
13
- opt :riak_host, "Riak host", :default => Socket.gethostname
14
+ opt :riak_host, "Riak host for stats <IP> or SSL http(s)://<IP>", :default => Socket.gethostname
14
15
  opt :data_dir, "Riak data directory", :default => '/var/lib/riak'
15
16
  opt :stats_port, "Riak HTTP port for stats", :default => 8098
16
17
  opt :stats_path, "Riak HTTP stats path", :default => '/stats'
@@ -39,8 +40,17 @@ class Riemann::Tools::Riak
39
40
 
40
41
  if
41
42
  begin
42
- Net::HTTP.start(opts[:riak_host], opts[:stats_port]) do |http|
43
- http.get opts[:stats_path]
43
+ uri = URI.parse(opts[:riak_host])
44
+ if uri.host == nil
45
+ uri.host = opts[:riak_host]
46
+ end
47
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
48
+ http.use_ssl = uri.scheme == 'https'
49
+ if http.use_ssl?
50
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
51
+ end
52
+ http.start do |http|
53
+ http.get opts[:stats_path]
44
54
  end
45
55
  rescue => e
46
56
  @httpstatus = false
@@ -50,7 +60,7 @@ class Riemann::Tools::Riak
50
60
  # dynamically input the cookie
51
61
  # this is done only once - hopefully it doesn't get overridden.
52
62
  ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}"
53
-
63
+
54
64
  end
55
65
 
56
66
  def check_ring
@@ -129,9 +139,18 @@ class Riemann::Tools::Riak
129
139
  def check_stats
130
140
  if @httpstatus
131
141
  begin
132
- res = Net::HTTP.start(opts[:riak_host], opts[:stats_port]) do |http|
133
- http.get opts[:stats_path]
142
+ uri = URI.parse(opts[:riak_host])
143
+ if uri.host == nil
144
+ uri.host = opts[:riak_host]
134
145
  end
146
+ http = Net::HTTP.new(uri.host, opts[:stats_port])
147
+ http.use_ssl = uri.scheme == 'https'
148
+ if http.use_ssl?
149
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
150
+ end
151
+ res = http.start do |http|
152
+ http.get opts[:stats_path]
153
+ end
135
154
  rescue => e
136
155
  report(
137
156
  :host => opts[:riak_host],
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: riemann-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-28 00:00:00.000000000 Z
12
+ date: 2013-08-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: riemann-client
@@ -148,15 +148,19 @@ executables:
148
148
  - riemann-redis-slowlog
149
149
  - riemann-bench
150
150
  - riemann-freeswitch
151
+ - riemann-elb-metrics
151
152
  - riemann-riak
152
153
  - riemann-health
153
154
  - riemann-memcached
154
155
  - riemann-haproxy
156
+ - riemann-elasticsearch
155
157
  - riemann-riak-keys
156
158
  - riemann-diskstats
159
+ - riemann-fd
157
160
  - riemann-riak-ring
158
161
  - riemann-cloudant
159
162
  - riemann-nginx-status
163
+ - riemann-rabbitmq
160
164
  - riemann-kvminstance
161
165
  - riemann-net
162
166
  - riemann-redis
@@ -168,6 +172,9 @@ files:
168
172
  - bin/riemann-bench
169
173
  - bin/riemann-cloudant
170
174
  - bin/riemann-diskstats
175
+ - bin/riemann-elasticsearch
176
+ - bin/riemann-elb-metrics
177
+ - bin/riemann-fd
171
178
  - bin/riemann-freeswitch
172
179
  - bin/riemann-haproxy
173
180
  - bin/riemann-health
@@ -176,6 +183,7 @@ files:
176
183
  - bin/riemann-munin
177
184
  - bin/riemann-net
178
185
  - bin/riemann-nginx-status
186
+ - bin/riemann-rabbitmq
179
187
  - bin/riemann-redis
180
188
  - bin/riemann-redis-slowlog
181
189
  - bin/riemann-resmon