vmpooler 0.13.3 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'vmpooler/metrics/statsd'
4
+ require 'vmpooler/metrics/graphite'
5
+ require 'vmpooler/metrics/promstats'
6
+ require 'vmpooler/metrics/dummy_statsd'
7
+
8
module Vmpooler
  # Namespace and factory for the metrics backends.
  class Metrics
    # Build the metrics backend selected by the configuration.
    #
    # The first of :statsd, :graphite or :prometheus that has a truthy value in
    # +params+ wins, in that order; its value is handed to the backend
    # constructor. When none is present a DummyStatsd instance is returned.
    #
    # @param logger [Object] logger passed through to the chosen backend
    # @param params [Hash] configuration keyed by :statsd / :graphite / :prometheus
    # @return [Vmpooler::Metrics] the instantiated backend
    def self.init(logger, params)
      return Vmpooler::Metrics::Statsd.new(logger, params[:statsd]) if params[:statsd]
      return Vmpooler::Metrics::Graphite.new(logger, params[:graphite]) if params[:graphite]
      return Vmpooler::Metrics::Promstats.new(logger, params[:prometheus]) if params[:prometheus]

      Vmpooler::Metrics::DummyStatsd.new
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Vmpooler
  class Metrics
    # Metrics backend that accepts every call and records nothing.
    #
    # It exposes the same readers and reporting methods as the other backends
    # in this file; the readers are never assigned, so they return nil.
    class DummyStatsd < Metrics
      attr_reader :server, :port, :prefix

      # Accepts and ignores any constructor arguments.
      def initialize(*); end

      # Ignore the arguments and report success.
      #
      # @return [true]
      def increment(*)
        true
      end

      # gauge and timing are the same no-op as increment.
      alias_method :gauge, :increment
      alias_method :timing, :increment
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems' unless defined?(Gem)
4
+
5
module Vmpooler
  class Metrics
    # Metrics backend that writes "<prefix>.<path> <value> <epoch>" lines to a
    # Graphite server over TCP, one short-lived connection per data point.
    class Graphite < Metrics
      attr_reader :server, :port, :prefix

      # @param logger [#log] receives ('s', message) when a send fails
      # @param params [Hash] string-keyed config: 'server' (required),
      #   'port' (defaults to 2003) and 'prefix' (defaults to 'vmpooler')
      # @raise [ArgumentError] when 'server' is missing or empty
      def initialize(logger, params = {})
        raise ArgumentError, "Graphite server is required. Config: #{params.inspect}" if params['server'].nil? || params['server'].empty?

        @server = params['server']
        @port = params['port'] || 2003
        @prefix = params['prefix'] || 'vmpooler'
        @logger = logger
      end

      # Record a single occurrence of +label+ (sent as the value 1).
      def increment(label)
        log label, 1
      end

      # Record +value+ for +label+.
      def gauge(label, value)
        log label, value
      end

      # Record +duration+ for +label+.
      def timing(label, duration)
        log label, duration
      end

      # Send one data point from a background thread so the caller never blocks
      # on the network.
      #
      # Failures are handled inside the worker thread: an exception raised in a
      # thread is not seen by a rescue clause on the method that spawned it, so
      # the handlers must live in the thread body to take effect.
      #
      # @param path [String] metric path appended to the prefix
      # @param value [Object] value to report
      # @return [Thread] the worker thread (callers may join it)
      def log(path, value)
        Thread.new do
          begin
            socket = TCPSocket.new(server, port)
            begin
              socket.puts "#{prefix}.#{path} #{value} #{Time.now.to_i}"
            ensure
              socket.close
            end
          rescue Errno::EADDRNOTAVAIL => e
            warn "Could not assign address to graphite server #{server}: #{e}"
          rescue StandardError => e
            @logger.log('s', "[!] Failure logging #{path} to graphite server [#{server}:#{port}]: #{e}")
          end
        end
      rescue StandardError => e
        # Only reached when the worker thread itself could not be started.
        @logger.log('s', "[!] Failure logging #{path} to graphite server [#{server}:#{port}]: #{e}")
      end
    end
  end
end
@@ -0,0 +1,380 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'prometheus/client'
4
+
5
module Vmpooler
  class Metrics
    # Metrics backend that maps vmpooler's dotted metric labels
    # (e.g. "checkout.success.<pool>") onto Prometheus counters, gauges and
    # histograms registered in the Prometheus client registry.
    class Promstats < Metrics
      attr_reader :prefix, :endpoint, :metrics_prefix

      # Constants for Metric Types
      M_COUNTER = 1
      M_GAUGE = 2
      M_SUMMARY = 3
      M_HISTOGRAM = 4

      # Customised bucket sets used for the pooler clone/ready time histograms.
      POOLER_CLONE_TIME_BUCKETS = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 180.0, 240.0, 300.0, 600.0].freeze
      POOLER_READY_TIME_BUCKETS = [30.0, 60.0, 120.0, 180.0, 240.0, 300.0, 500.0, 800.0, 1200.0, 1600.0].freeze
      # Buckets for redis connection wait times.
      # NOTE(review): these were described as matching the Prometheus client
      # defaults, but they do not look like the library's DEFAULT_BUCKETS - confirm intent.
      # https://github.com/prometheus/client_ruby/blob/master/lib/prometheus/client/histogram.rb#L14
      REDIS_CONNECT_BUCKETS = [1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 18.0, 23.0].freeze

      # @param logger [#log] receives ('s', message) when recording a metric fails
      # @param params [Hash] string-keyed config: 'prefix', 'metrics_prefix'
      #   (both default to 'vmpooler') and 'endpoint' (defaults to '/prometheus')
      def initialize(logger, params = {})
        @prefix = params['prefix'] || 'vmpooler'
        @metrics_prefix = params['metrics_prefix'] || 'vmpooler'
        @endpoint = params['endpoint'] || '/prometheus'
        @logger = logger

        # Start with an empty table so lookups made before
        # setup_prometheus_metrics report "Invalid Metric" instead of failing on nil.
        @p_metrics = {}

        # Setup up prometheus registry and data structures
        @prometheus = Prometheus::Client.registry
      end

      # Metrics structure used to register the metrics and also translate/interpret the incoming metrics.
      #
      # Each entry is keyed by the first element of the dotted label and holds:
      #   mtype              - one of the M_* constants
      #   torun              - which services (:api / :manager) register the metric
      #   docstring          - help text for the metric
      #   prom_metric_prefix - Prometheus metric name (or name prefix when suffixes are used)
      #   metric_suffixes    - optional map of label suffix => docstring suffix; one metric per suffix
      #   buckets            - optional histogram buckets
      #   param_labels       - Prometheus label names filled from the remaining label elements
      #
      # @return [Hash{Symbol => Hash}]
      def vmpooler_metrics_table
        {
          errors: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Count of errors for pool',
            prom_metric_prefix: "#{@metrics_prefix}_errors",
            metric_suffixes: {
              markedasfailed: 'timeout waiting for instance to initialise',
              duplicatehostname: 'unable to create instance due to duplicate hostname',
              staledns: 'unable to create instance due to duplicate DNS record'
            },
            param_labels: %i[template_name]
          },
          user: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Number of pool instances this user created',
            prom_metric_prefix: "#{@metrics_prefix}_user",
            param_labels: %i[user poolname]
          },
          usage_litmus: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Pools by Litmus job usage',
            prom_metric_prefix: "#{@metrics_prefix}_usage_litmus",
            param_labels: %i[user poolname]
          },
          usage_jenkins_instance: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Pools by Jenkins instance usage',
            prom_metric_prefix: "#{@metrics_prefix}_usage_jenkins_instance",
            param_labels: %i[jenkins_instance value_stream poolname]
          },
          usage_branch_project: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Pools by branch/project usage',
            prom_metric_prefix: "#{@metrics_prefix}_usage_branch_project",
            param_labels: %i[branch project poolname]
          },
          usage_job_component: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'Pools by job/component usage',
            prom_metric_prefix: "#{@metrics_prefix}_usage_job_component",
            param_labels: %i[job_name component_to_test poolname]
          },
          checkout: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Pool checkout counts',
            prom_metric_prefix: "#{@metrics_prefix}_checkout",
            metric_suffixes: {
              nonresponsive: 'checkout failed - non responsive machine',
              empty: 'checkout failed - no machine',
              success: 'successful checkout',
              invalid: 'checkout failed - invalid template'
            },
            param_labels: %i[poolname]
          },
          delete: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Delete machine',
            prom_metric_prefix: "#{@metrics_prefix}_delete",
            metric_suffixes: {
              success: 'succeeded',
              failed: 'failed'
            },
            param_labels: []
          },
          ondemandrequest_generate: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Ondemand request',
            prom_metric_prefix: "#{@metrics_prefix}_ondemandrequest_generate",
            metric_suffixes: {
              duplicaterequests: 'failed duplicate request',
              success: 'succeeded'
            },
            param_labels: []
          },
          ondemandrequest_fail: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Ondemand request failure',
            prom_metric_prefix: "#{@metrics_prefix}_ondemandrequest_fail",
            metric_suffixes: {
              toomanyrequests: 'too many requests',
              invalid: 'invalid poolname'
            },
            param_labels: %i[poolname]
          },
          config: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'vmpooler pool configuration request',
            prom_metric_prefix: "#{@metrics_prefix}_config",
            metric_suffixes: { invalid: 'Invalid' },
            param_labels: %i[poolname]
          },
          poolreset: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Pool reset counter',
            prom_metric_prefix: "#{@metrics_prefix}_poolreset",
            metric_suffixes: { invalid: 'Invalid Pool' },
            param_labels: %i[poolname]
          },
          connect: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'vmpooler connect (to vSphere)',
            prom_metric_prefix: "#{@metrics_prefix}_connect",
            metric_suffixes: {
              open: 'Connect Succeeded',
              fail: 'Connect Failed'
            },
            param_labels: []
          },
          migrate_from: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'vmpooler machine migrated from',
            prom_metric_prefix: "#{@metrics_prefix}_migrate_from",
            param_labels: %i[host_name]
          },
          migrate_to: {
            mtype: M_COUNTER,
            torun: %i[manager],
            docstring: 'vmpooler machine migrated to',
            prom_metric_prefix: "#{@metrics_prefix}_migrate_to",
            param_labels: %i[host_name]
          },
          api_vm: {
            mtype: M_COUNTER,
            torun: %i[api],
            docstring: 'Total number of HTTP request/sub-operations handled by the Rack application under the /vm endpoint',
            prom_metric_prefix: "#{@metrics_prefix}_http_requests_vm_total",
            param_labels: %i[method subpath operation]
          },
          ready: {
            mtype: M_GAUGE,
            torun: %i[manager],
            docstring: 'vmpooler number of machines in ready State',
            prom_metric_prefix: "#{@metrics_prefix}_ready",
            param_labels: %i[poolname]
          },
          running: {
            mtype: M_GAUGE,
            torun: %i[manager],
            docstring: 'vmpooler number of machines running',
            prom_metric_prefix: "#{@metrics_prefix}_running",
            param_labels: %i[poolname]
          },
          connection_available: {
            mtype: M_GAUGE,
            torun: %i[manager],
            docstring: 'vmpooler redis connections available',
            prom_metric_prefix: "#{@metrics_prefix}_connection_available",
            param_labels: %i[type provider]
          },
          time_to_ready_state: {
            mtype: M_HISTOGRAM,
            torun: %i[manager],
            buckets: POOLER_READY_TIME_BUCKETS,
            docstring: 'Time taken for machine to reach ready state for pool',
            prom_metric_prefix: "#{@metrics_prefix}_time_to_ready_state",
            param_labels: %i[poolname]
          },
          migrate: {
            mtype: M_HISTOGRAM,
            torun: %i[manager],
            buckets: POOLER_CLONE_TIME_BUCKETS,
            docstring: 'vmpooler time taken to migrate machine for pool',
            prom_metric_prefix: "#{@metrics_prefix}_migrate",
            param_labels: %i[poolname]
          },
          clone: {
            mtype: M_HISTOGRAM,
            torun: %i[manager],
            buckets: POOLER_CLONE_TIME_BUCKETS,
            docstring: 'vmpooler time taken to clone machine',
            prom_metric_prefix: "#{@metrics_prefix}_clone",
            param_labels: %i[poolname]
          },
          destroy: {
            mtype: M_HISTOGRAM,
            torun: %i[manager],
            buckets: POOLER_CLONE_TIME_BUCKETS,
            docstring: 'vmpooler time taken to destroy machine',
            prom_metric_prefix: "#{@metrics_prefix}_destroy",
            param_labels: %i[poolname]
          },
          connection_waited: {
            mtype: M_HISTOGRAM,
            torun: %i[manager],
            buckets: REDIS_CONNECT_BUCKETS,
            docstring: 'vmpooler redis connection wait time',
            prom_metric_prefix: "#{@metrics_prefix}_connection_waited",
            param_labels: %i[type provider]
          }
        }
      end

      # Helper to add individual prom metric.
      # Allow Histograms to specify the bucket size.
      #
      # @param metric_spec [Hash] one entry of vmpooler_metrics_table
      # @param name [String, Symbol] Prometheus metric name
      # @param docstring [String] help text for the metric
      # @raise [RuntimeError] when metric_spec[:mtype] is not one of the M_* constants
      def add_prometheus_metric(metric_spec, name, docstring)
        # Resolved at call time so the Prometheus classes are only needed when registering.
        metric_class =
          case metric_spec[:mtype]
          when M_COUNTER then Prometheus::Client::Counter
          when M_GAUGE then Prometheus::Client::Gauge
          when M_SUMMARY then Prometheus::Client::Summary
          when M_HISTOGRAM then Prometheus::Client::Histogram
          else
            raise("Unable to register metric #{name} with metric type #{metric_spec[:mtype]}")
          end

        # Every metric carries a vmpooler_instance label preset to this instance's prefix.
        options = {
          docstring: docstring,
          labels: metric_spec[:param_labels] + [:vmpooler_instance],
          preset_labels: { vmpooler_instance: @prefix }
        }
        # Only histograms take buckets; without the key the client's own default applies.
        options[:buckets] = metric_spec[:buckets] if metric_spec[:mtype] == M_HISTOGRAM && metric_spec.key?(:buckets)

        @prometheus.register(metric_class.new(name.to_sym, **options))
      end

      # Top level method to register all the prometheus metrics.
      #
      # @param torun [Array<Symbol>] services being started (:api and/or :manager);
      #   only metrics whose :torun list intersects it are registered
      def setup_prometheus_metrics(torun)
        @p_metrics = vmpooler_metrics_table
        @p_metrics.each_value do |metric_spec|
          # Only register metrics appropriate to api or manager
          next if (torun & metric_spec[:torun]).empty?

          if metric_spec.key? :metric_suffixes
            # Iterate thru the suffixes if provided to register multiple counters here.
            metric_spec[:metric_suffixes].each do |suffix, suffix_docstring|
              add_prometheus_metric(
                metric_spec,
                "#{metric_spec[:prom_metric_prefix]}_#{suffix}",
                "#{metric_spec[:docstring]} #{suffix_docstring}"
              )
            end
          else
            # No Additional counter suffixes so register this as metric.
            add_prometheus_metric(
              metric_spec,
              metric_spec[:prom_metric_prefix],
              metric_spec[:docstring]
            )
          end
        end
      end

      # Locate a metric and check/interpret the sub-fields of a dotted label.
      #
      # @param label [String] e.g. "checkout.success.<poolname>"
      # @return [Hash] a copy of the table entry with :metric_name and :labels added
      # @raise [RuntimeError] when the metric key or its required suffix is unknown or missing
      def find_metric(label)
        sublabels = label.split('.')
        # to_s guards against an empty label, so the descriptive error below is raised.
        metric_key = sublabels.shift.to_s.to_sym
        raise("Invalid Metric #{metric_key} for #{label}") unless @p_metrics.key? metric_key

        metric = @p_metrics[metric_key].clone

        if metric.key? :metric_suffixes
          # Likewise a label that stops before the suffix is reported as invalid.
          metric_subkey = sublabels.shift.to_s.to_sym
          raise("Invalid Metric #{metric_key}_#{metric_subkey} for #{label}") unless metric[:metric_suffixes].key? metric_subkey

          metric[:metric_name] = "#{metric[:prom_metric_prefix]}_#{metric_subkey}"
        else
          metric[:metric_name] = metric[:prom_metric_prefix]
        end

        # Check if we are looking for a parameter value at last element.
        if metric.key? :param_labels
          metric[:labels] = {}
          # Special case processing here - if there is only one parameter label then make sure
          # we append all of the remaining contents of the metric with "." separators to ensure
          # we get full nodenames (e.g. for Migration to node operations)
          if metric[:param_labels].length == 1
            metric[:labels][metric[:param_labels].first] = sublabels.join('.')
          else
            # Fill labels from the end of the label backwards, one element each.
            metric[:param_labels].reverse_each do |param_label|
              metric[:labels][param_label] = sublabels.pop(1).first
            end
          end
        end
        metric
      end

      # Helper to look up a metric by label.
      #
      # @return [Array] the interpreted metric hash and the registered Prometheus
      #   metric object (nil when that metric was not registered)
      def get(label)
        metric = find_metric(label)
        [metric, @prometheus.get(metric[:metric_name])]
      end

      # Note - Catch and log metrics failures so they can be noted, but don't interrupt vmpooler operation.
      def increment(label)
        counter_metric, c = get(label)
        c.increment(labels: counter_metric[:labels])
      rescue StandardError => e
        @logger.log('s', "[!] prometheus error logging metric #{label} increment : #{e}")
      end

      # Set the gauge for +label+ to +value+ (converted with to_i); nil values are ignored.
      def gauge(label, value)
        return if value.nil?

        gauge_metric, g = get(label)
        g.set(value.to_i, labels: gauge_metric[:labels])
      rescue StandardError => e
        @logger.log('s', "[!] prometheus error logging gauge #{label}, value #{value}: #{e}")
      end

      # Observe +duration+ (converted with to_f) on the histogram for +label+; nil durations are ignored.
      # https://prometheus.io/docs/practices/histograms/
      def timing(label, duration)
        return if duration.nil?

        histogram_metric, hm = get(label)
        hm.observe(duration.to_f, labels: histogram_metric[:labels])
      rescue StandardError => e
        @logger.log('s', "[!] prometheus error logging timing event label #{label}, duration #{duration}: #{e}")
      end
    end
  end
end