fluent-plugin-k8s-metrics-agg 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,604 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ require 'time'
14
+
15
+ require 'fluent/plugin/input'
16
+ require 'kubeclient'
17
+ require 'multi_json'
18
+ module Fluent
19
+ module Plugin
20
+ class KubernetesMetricsAggregatorInput < Fluent::Plugin::Input
21
# Class-level accumulators. These are class variables, so they are shared by
# every instance of this plugin class; the scraper methods further down fill
# them and replace them with empty hashes after emitting.
# Limits/requests totals keyed by namespace name (UsageMetricsUnit values).
+ @@namespace_usage_metrics_map = {}
22
# Limits/requests totals keyed by node name (UsageMetricsUnit values).
+ @@node_requests_limits_metrics_map = {}
23
+
24
# Resource usage totals keyed by namespace name (ResourceUsageMetricsUnit values).
+ @@namespace_resource_usage_metrics_map = {}
25
# Resource usage totals keyed by node name (ResourceUsageMetricsUnit values).
+ @@node_resource_usage_metrics_map = {}
26
+
27
# Accumulates CPU and memory limits/requests parsed from Kubernetes resource
# quantity strings. CPU totals are kept in millicores and memory totals in
# mebibytes (MiB).
class UsageMetricsUnit
  # Leading numeric part of a quantity: integer, decimal or exponent form
  # ("100", "0.5", "1e3"). An "E" that is not followed by digits is left
  # alone so it can be read as the exa suffix.
  QUANTITY_NUMBER = /\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?/.freeze

  BYTES_PER_MEBIBYTE = 1024.0**2

  # Factor that converts a quantity with the given suffix into mebibytes.
  # Binary suffixes are powers of 1024, decimal suffixes powers of 1000.
  MEMORY_MULTIPLIERS = {
    'Ki' => 1.0 / 1024,
    'Mi' => 1,
    'Gi' => 1024,
    'Ti' => 1024**2,
    'Pi' => 1024**3,
    'Ei' => 1024**4,
    'm' => 1e-3 / BYTES_PER_MEBIBYTE,
    'k' => 1e3 / BYTES_PER_MEBIBYTE,
    'K' => 1e3 / BYTES_PER_MEBIBYTE,
    'M' => 1e6 / BYTES_PER_MEBIBYTE,
    'G' => 1e9 / BYTES_PER_MEBIBYTE,
    'T' => 1e12 / BYTES_PER_MEBIBYTE,
    'P' => 1e15 / BYTES_PER_MEBIBYTE,
    'E' => 1e18 / BYTES_PER_MEBIBYTE
  }.freeze

  attr_reader :cpu_limit, :cpu_request, :memory_limit, :memory_request

  def initialize
    @cpu_limit = 0
    @cpu_request = 0
    @memory_limit = 0
    @memory_request = 0
  end

  # Adds one set of quantities to the running totals.
  #
  # @param cpu_limit [String] CPU limit quantity, e.g. "500m" or "0.5"
  # @param cpu_request [String] CPU request quantity
  # @param memory_limit [String] memory limit quantity, e.g. "128Mi"
  # @param memory_request [String] memory request quantity
  def add_usage_metrics(cpu_limit, cpu_request, memory_limit, memory_request)
    @cpu_limit += get_cpu_or_memory_value(cpu_limit) * get_cpu_mult(cpu_limit)
    @cpu_request += get_cpu_or_memory_value(cpu_request) * get_cpu_mult(cpu_request)
    @memory_limit += get_cpu_or_memory_value(memory_limit) * get_memory_mult(memory_limit)
    @memory_request += get_cpu_or_memory_value(memory_request) * get_memory_mult(memory_request)
  end

  # Stores the millicore multiplier for +cpu+ in @cpu_mult and returns the
  # numeric part of the quantity.
  def get_cpu_value_and_multiplier(cpu)
    @cpu_mult = get_cpu_mult(cpu)
    get_cpu_or_memory_value(cpu)
  end

  # Returns the numeric part of a quantity string. Plain integers stay
  # Integers; decimal or exponent forms become Floats. Anything without a
  # leading number (including nil) yields 0.
  def get_cpu_or_memory_value(resource)
    number = resource.to_s.strip[QUANTITY_NUMBER]
    return 0 if number.nil?

    number.match?(/\A[+-]?\d+\z/) ? number.to_i : number.to_f
  end

  # Multiplier that converts a CPU quantity to millicores: quantities ending
  # in "m" are already millicores, everything else is treated as whole cores.
  def get_cpu_mult(cpu)
    cpu.to_s.end_with?('m') ? 1 : 1000
  end

  # https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory
  # Multiplier that converts a memory quantity to mebibytes. A quantity with
  # no (or an unknown) suffix is treated as a number of bytes.
  def get_memory_mult(memory)
    suffix = memory.to_s.strip.sub(QUANTITY_NUMBER, '')
    MEMORY_MULTIPLIERS.fetch(suffix, 1.0 / BYTES_PER_MEBIBYTE)
  end
end
96
+
97
# Running totals of observed CPU and memory usage. Callers in this file pass
# the `usageNanoCores` and `usageBytes` values read from pod stats.
class ResourceUsageMetricsUnit
  attr_reader :cpu_usage, :memory_usage

  def initialize
    @cpu_usage = 0
    @memory_usage = 0
  end

  # Adds one observation to the totals. A nil value is counted as 0 so that
  # a single missing stat does not raise a TypeError.
  #
  # @param cpu_usage [Numeric, nil]
  # @param memory_usage [Numeric, nil]
  def add_resource_usage_metrics(cpu_usage, memory_usage)
    @cpu_usage += cpu_usage || 0
    @memory_usage += memory_usage || 0
  end
end
108
+
109
# Registers this class under the input plugin name 'kubernetes_metrics_aggregator'.
+ Fluent::Plugin.register_input('kubernetes_metrics_aggregator', self)
110
+
111
# Timer helper: `start` schedules the scrapers with `timer_execute`.
+ helpers :timer
112
+
113
# NOTE(review): init_without_kubeconfig interpolates this value as the host in
# "https://<kubernetes_url>:<kubelet_port>/api/", so it is used as a host name
# rather than as a full URL.
+ desc 'URL of the kubernetes API server.'
114
+ config_param :kubernetes_url, :string, default: nil
115
+
116
# NOTE(review): besides the per-node stats path, this port is also used as the
# port of the API server URL built from `kubernetes_url` -- confirm intended.
+ desc 'The port that kubelet is listening to.'
117
+ config_param :kubelet_port, :integer, default: 10_250
118
+
119
# parse_tag splits this value at '*'; generate_tag puts the item name in its place.
+ desc 'The tag of the event.'
120
+ config_param :tag, :string, default: 'kubernetes.metrics.*'
121
+
122
# The same interval is passed to all three timers created in `start`.
+ desc 'How often it pulls metrics.'
123
+ config_param :interval, :time, default: "15s"
124
+
125
# When set, initialize_client builds the client with init_with_kubeconfig.
+ desc 'Path to a kubeconfig file points to a cluster the plugin should collect metrics from. Mostly useful when running fluentd outside of the cluster. When `kubeconfig` is set, `kubernetes_url`, `client_cert`, `client_key`, `ca_file`, `insecure_ssl`, `bearer_token_file`, and `secret_dir` will all be ignored.'
126
+ config_param :kubeconfig, :string, default: nil
127
+
128
+ desc 'Path to the certificate file for this client.'
129
+ config_param :client_cert, :string, default: nil
130
+
131
# The key file is loaded with OpenSSL::PKey::RSA in init_without_kubeconfig.
+ desc 'Path to the private key file for this client.'
132
+ config_param :client_key, :string, default: nil
133
+
134
# Falls back to "ca.crt" inside `secret_dir` when unset and that file exists.
+ desc 'Path to the CA file.'
135
+ config_param :ca_file, :string, default: nil
136
+
137
+ desc "If `insecure_ssl` is set to `true`, it won't verify apiserver's certificate."
138
+ config_param :insecure_ssl, :bool, default: false
139
+
140
# Falls back to "token" inside `secret_dir` when unset and that file exists.
+ desc 'Path to the file contains the API token. By default it reads from the file "token" in the `secret_dir`.'
141
+ config_param :bearer_token_file, :string, default: nil
142
+
143
+ desc "Path of the location where pod's service account's credentials are stored."
144
+ config_param :secret_dir, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount'
145
+
146
# Emitted as the 'name' label of the cluster-level metrics.
+ desc 'The name of the cluster, where the plugin is deployed.'
147
+ config_param :cluster_name, :string, default: 'cluster_name'
148
+
149
# Plugin configuration hook: after the parent class has processed +conf+,
# creates the two mutexes guarding the per-node maps, splits the tag and
# builds the Kubernetes client.
def configure(conf)
  super

  @mutex_node_req_lim, @mutex_node_res_usage = Mutex.new, Mutex.new

  parse_tag
  initialize_client
end
156
+
157
# Plugin start hook: schedules the three scrapers, each on its own timer
# firing every @interval.
def start
  super

  {
    limits_request_scraper: :scrape_limits_requests_metrics,
    node_scraper: :scrape_node_metrics,
    resource_usage_scraper: :scrape_resource_usage_metrics
  }.each do |timer_name, scraper|
    timer_execute timer_name, @interval, &method(scraper)
  end
end
165
+
166
# Plugin close hook: finishes any watchers held in @watchers (nothing in the
# visible code assigns it, so this is normally a no-op) and then defers to
# the parent class.
def close
  watchers = @watchers
  watchers.each(&:finish) if watchers

  super
end
171
+
172
+ private
173
+
174
# Splits a wildcard tag into @tag_prefix / @tag_suffix around the first '*'.
# Tags without '*' leave both unset, so generate_tag returns the tag as is.
def parse_tag
  return unless @tag.include?('*')

  # A negative limit keeps trailing empty strings, so a tag that is exactly
  # '*' still yields an (empty) prefix instead of no prefix at all.
  @tag_prefix, @tag_suffix = @tag.split('*', -1)
end
177
+
178
# Builds the tag for +item_name+ by putting it where the '*' of the
# configured tag was. Without a wildcard the configured tag itself is used.
#
# @param item_name [String] e.g. 'node', 'pod', 'container'
# @return [String] a new String on every call
def generate_tag(item_name)
  # Return a copy: callers in this file append metric suffixes with `<<`,
  # which would otherwise grow the shared @tag on every emit.
  return @tag.dup unless @tag_prefix

  [@tag_prefix, item_name, @tag_suffix].join
end
183
+
184
# Builds @client from the current context of the kubeconfig file at
# @kubeconfig. The context's SSL and auth settings are merged over +options+.
def init_with_kubeconfig(options = {})
  context = Kubeclient::Config.read(@kubeconfig).context
  client_options = options.merge(
    ssl_options: context.ssl_options,
    auth_options: context.auth_options
  )

  @client = Kubeclient::Client.new(context.api_endpoint, context.api_version, client_options)
end
197
+
198
# Builds @client from the plugin parameters, falling back to the in-pod
# service account environment and credentials.
#
# NOTE(review): +_options+ is accepted but not forwarded to the client, so
# the timeouts built by initialize_client do not apply on this path.
#
# @raise [Fluent::ConfigError] when no API URL can be determined or the
#   API validity check raises
def init_without_kubeconfig(_options = {})
  api_version = 'v1'

  # mostly borrowed from Fluentd Kubernetes Metadata Filter Plugin
  kubernetes_url_final =
    if @kubernetes_url.nil?
      # Use Kubernetes default service account if we're in a pod.
      env_host = ENV['KUBERNETES_SERVICE_HOST']
      env_port = ENV['KUBERNETES_SERVICE_PORT']
      "https://#{env_host}:#{env_port}/api/" if env_host && env_port
    else
      # NOTE(review): the kubelet port is used as the API server port here.
      "https://#{@kubernetes_url}:#{@kubelet_port}/api/"
    end

  raise Fluent::ConfigError, 'kubernetes url is not set in configuration, or environment variables' unless kubernetes_url_final

  # Use SSL certificate and bearer token from Kubernetes service account.
  if Dir.exist?(@secret_dir)
    secret_ca_file = File.join(@secret_dir, 'ca.crt')
    secret_token_file = File.join(@secret_dir, 'token')

    @ca_file = secret_ca_file if @ca_file.nil? && File.exist?(secret_ca_file)
    @bearer_token_file = secret_token_file if @bearer_token_file.nil? && File.exist?(secret_token_file)
  end

  ssl_options = {
    client_cert: @client_cert && OpenSSL::X509::Certificate.new(File.read(@client_cert)),
    client_key: @client_key && OpenSSL::PKey::RSA.new(File.read(@client_key)),
    ca_file: @ca_file,
    verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
  }

  auth_options = {}
  # Strip surrounding whitespace so a trailing newline in the token file
  # does not end up in the Authorization header.
  auth_options[:bearer_token] = File.read(@bearer_token_file).strip if @bearer_token_file

  @client = Kubeclient::Client.new(
    kubernetes_url_final, api_version,
    ssl_options: ssl_options,
    auth_options: auth_options
  )

  begin
    @client.api_valid?
  rescue KubeException => kube_error
    # The original message interpolated @api_version, which is never set.
    raise Fluent::ConfigError, "Invalid Kubernetes API #{api_version} endpoint #{kubernetes_url_final}: #{kube_error.message}"
  end
end
250
+
251
# Creates @client, from the kubeconfig file when one is configured and from
# the individual plugin parameters otherwise. Both paths receive the same
# timeout options (10 for open, nil for read).
def initialize_client
  options = { timeouts: { open: 10, read: nil } }

  return init_with_kubeconfig(options) unless @kubeconfig.nil?

  init_without_kubeconfig(options)
end
265
+
266
# Converts an ISO 8601 timestamp string into a Fluent::EventTime.
def parse_time(metric_time)
  timestamp = Time.iso8601(metric_time)
  Fluent::EventTime.from_time(timestamp)
end
269
+
270
# Converts a camelCase (or CamelCase) identifier to snake_case.
#
# @param camlcase [String]
# @return [String] a new String; every capital is lower-cased and, except
#   at the very start of the string, preceded by an underscore
def underscore(camlcase)
  camlcase.gsub(/[A-Z]/) do |upper|
    # A capital at position 0 must not produce a leading underscore.
    Regexp.last_match.begin(0).zero? ? upper.downcase : "_#{upper.downcase}"
  end
end
273
+
274
# Multiplier that converts a CPU quantity to millicores: quantities ending
# in "m" are already millicores, everything else is treated as whole cores.
def get_cpu_mult(cpu)
  cpu.to_s.end_with?('m') ? 1 : 1000
end

# Parses a Kubernetes CPU quantity ("4", "0.5", "3920m") into millicores.
#
# @param resource [String]
# @return [Integer, Float] millicores; 0 when no leading number is present
def get_cpu_value(resource)
  number = resource.to_s.strip[/\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?/]
  return 0 if number.nil?

  magnitude = number.match?(/\A[+-]?\d+\z/) ? number.to_i : number.to_f
  # Plain multiplication: the previous `+=` returned value + value * mult.
  magnitude * get_cpu_mult(resource)
end
286
+
287
# Splits a quantity string into its numeric magnitude and unit suffix.
# Plain integers stay Integers; decimal/exponent forms become Floats.
def memory_quantity_parts(resource)
  text = resource.to_s.strip
  number = text[/\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?/]
  return [0, text] if number.nil?

  magnitude = number.match?(/\A[+-]?\d+\z/) ? number.to_i : number.to_f
  [magnitude, text[number.length..-1]]
end

# Multiplier that converts a memory quantity to mebibytes (MiB). Binary
# suffixes are powers of 1024, decimal suffixes powers of 1000; a quantity
# with no (or an unknown) suffix is treated as a number of bytes.
def get_memory_mult(memory)
  bytes_per_mebibyte = 1024.0**2
  _magnitude, suffix = memory_quantity_parts(memory)

  case suffix
  when 'Ki' then 1.0 / 1024
  when 'Mi' then 1
  when 'Gi' then 1024
  when 'Ti' then 1024**2
  when 'Pi' then 1024**3
  when 'Ei' then 1024**4
  when 'm' then 1e-3 / bytes_per_mebibyte
  when 'k', 'K' then 1e3 / bytes_per_mebibyte
  when 'M' then 1e6 / bytes_per_mebibyte
  when 'G' then 1e9 / bytes_per_mebibyte
  when 'T' then 1e12 / bytes_per_mebibyte
  when 'P' then 1e15 / bytes_per_mebibyte
  when 'E' then 1e18 / bytes_per_mebibyte
  else 1.0 / bytes_per_mebibyte
  end
end

# Parses a Kubernetes memory quantity ("16384Ki", "2Gi", "1048576") into
# mebibytes.
#
# @param resource [String]
# @return [Integer, Float] mebibytes
def get_memory_value(resource)
  magnitude, _suffix = memory_quantity_parts(resource)
  # Plain multiplication: the previous `+=` returned value + value * mult.
  magnitude * get_memory_mult(resource)
end
320
+
321
# Emits the four limit/request totals held by +limits_requests_metric+, one
# event per total, tagged "<tag>.cpu.limit", "<tag>.cpu.request",
# "<tag>.memory.limit" and "<tag>.memory.request". Each record is +labels+
# plus a 'value' entry.
def emit_limits_requests_metrics(tag, scraped_at, labels, limits_requests_metric)
  {
    '.cpu.limit' => :@cpu_limit,
    '.cpu.request' => :@cpu_request,
    '.memory.limit' => :@memory_limit,
    '.memory.request' => :@memory_request
  }.each do |suffix, ivar_name|
    record = labels.merge('value' => limits_requests_metric.instance_variable_get(ivar_name))
    router.emit(tag + suffix, Fluent::EventTime.from_time(scraped_at), record)
  end
end
340
+
341
# Emits the two usage totals held by +resource_usage_metric+ as
# "<tag>.cpu.usage" and "<tag>.memory.usage". Each record is +labels+ plus a
# 'value' entry.
def emit_resource_usage_metrics(tag, scraped_at, labels, resource_usage_metric)
  {
    '.cpu.usage' => :@cpu_usage,
    '.memory.usage' => :@memory_usage
  }.each do |suffix, ivar_name|
    record = labels.merge('value' => resource_usage_metric.instance_variable_get(ivar_name))
    router.emit(tag + suffix, Fluent::EventTime.from_time(scraped_at), record)
  end
end
352
+
353
# REST resource for the '/pods' endpoint of the API server. Built and logged
# once, then reused; the original re-created it on every call even though
# it stored the result in @limits_requests_api.
def limits_requests_api
  @limits_requests_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/pods'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping limits requests metrics")
      end
    end
end
362
+
363
# Timer callback: fetches the pod list and hands the response to
# handle_limits_requests_res.
def scrape_limits_requests_metrics
  response = limits_requests_api.get(@client.headers)
  handle_limits_requests_res(response)
rescue StandardError => e
  # The request itself can raise (connection errors, HTTP errors). Log it
  # here so the failure does not escape into the timer callback.
  log.error "Failed to scrape limits requests metrics, error=#{e.inspect}"
  log.error_backtrace
end
367
+
368
+ # This method is used to handle responses from the kube apiserver api
369
# Parses a 2xx pod-list response and passes it to
# process_limits_requests_res; any other status is logged. Errors raised
# while parsing or processing are logged, not re-raised.
def handle_limits_requests_res(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_at = Time.now
    process_limits_requests_res MultiJson.load(response.body)
  else
    log.error "Expected 2xx from pods API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # Use the rescued exception directly; $ERROR_INFO is only defined when the
  # 'English' library has been required.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
381
+
382
# Walks the pod list, emits limits/requests metrics per container and per
# pod, then per namespace and for the whole cluster. Per-node totals are
# accumulated into @@node_requests_limits_metrics_map for the node scraper.
#
# @param response [Hash] parsed pod list; pods are read from 'items'
def process_limits_requests_res(response)
  @scraped_at = Time.now
  cluster_usage_metrics = UsageMetricsUnit.new

  @mutex_node_req_lim.synchronize do
    Array(response['items']).each do |pod_json|
      pod_namespace = pod_json.dig('metadata', 'namespace')
      pod_node_name = pod_json.dig('spec', 'nodeName')
      pod_usage_metrics = UsageMetricsUnit.new
      namespace_usage_metrics = (@@namespace_usage_metrics_map[pod_namespace] ||= UsageMetricsUnit.new)
      node_usage_metrics = (@@node_requests_limits_metrics_map[pod_node_name] ||= UsageMetricsUnit.new)

      Array(pod_json.dig('spec', 'containers')).each do |container_json|
        # `resources`, `limits` and `requests` may each be absent.
        limits = container_json.dig('resources', 'limits') || {}
        requests = container_json.dig('resources', 'requests') || {}
        quantities = [
          limits['cpu'] || '0',
          requests['cpu'] || '0',
          limits['memory'] || '0',
          requests['memory'] || '0'
        ]

        container_usage_metrics = UsageMetricsUnit.new
        # Feed the original quantity strings to every aggregation level
        # instead of re-serialising accumulated numbers into strings, which
        # loses fractional values when they are parsed again.
        [container_usage_metrics, pod_usage_metrics, namespace_usage_metrics,
         node_usage_metrics, cluster_usage_metrics].each do |unit|
          unit.add_usage_metrics(*quantities)
        end

        container_labels = { 'name' => container_json['name'], 'image' => container_json['image'], 'node' => pod_node_name }
        emit_limits_requests_metrics(generate_tag('container'), @scraped_at, container_labels, container_usage_metrics)
      end

      # The namespace label previously carried the pod name.
      pod_labels = { 'name' => pod_json.dig('metadata', 'name'), 'namespace' => pod_namespace, 'node' => pod_node_name }
      emit_limits_requests_metrics(generate_tag('pod'), @scraped_at, pod_labels, pod_usage_metrics)
    end
  end

  @@namespace_usage_metrics_map.each do |namespace_name, namespace_usage_metrics|
    emit_limits_requests_metrics(generate_tag('namespace'), @scraped_at, { 'name' => namespace_name }, namespace_usage_metrics)
  end

  emit_limits_requests_metrics(generate_tag('cluster'), @scraped_at, { 'name' => @cluster_name }, cluster_usage_metrics)
ensure
  # Always start the next scrape from an empty namespace map, even when this
  # one failed part-way through.
  @@namespace_usage_metrics_map = {}
end
451
+
452
# REST resource for the '/nodes' endpoint of the API server. Built and
# logged once, then reused; the original re-created it on every call even
# though it stored the result in @node_api.
def node_api
  @node_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/nodes'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping node metrics")
      end
    end
end
461
+
462
# Timer callback: fetches the node list and hands the response to
# handle_node_response.
def scrape_node_metrics
  response = node_api.get(@client.headers)
  handle_node_response(response)
rescue StandardError => e
  # The request itself can raise (connection errors, HTTP errors). Log it
  # here so the failure does not escape into the timer callback.
  log.error "Failed to scrape node metrics, error=#{e.inspect}"
  log.error_backtrace
end
466
+
467
+ # This method is used to handle responses from the kubeapiserver api
468
# Parses a 2xx node-list response and passes it to process_node_response;
# any other status is logged. Errors raised while parsing or processing are
# logged, not re-raised.
def handle_node_response(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_node_at = Time.now
    process_node_response MultiJson.load(response.body)
  else
    log.error "Expected 2xx from nodes API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # Use the rescued exception directly; $ERROR_INFO is only defined when the
  # 'English' library has been required.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
480
+
481
# Emits one node-level metric. A fresh tag string is built for every emit so
# the shared configured tag is never appended to in place.
def emit_node_metric(node_name, suffix, value)
  router.emit("#{generate_tag('node')}#{suffix}",
              Fluent::EventTime.from_time(@scraped_node_at),
              'node' => node_name, 'value' => value)
end

# Float ratio that yields 0.0 instead of Infinity/NaN for a zero denominator.
def node_ratio(numerator, denominator)
  return 0.0 if denominator.to_f.zero?

  numerator.to_f / denominator
end

# Emits capacity, allocatable, utilization and reservation metrics for every
# node in the node list, combining the node's own status with the totals the
# other two scrapers stored in the shared per-node maps.
#
# @param response [Hash] parsed node list; nodes are read from 'items'
def process_node_response(response)
  Array(response['items']).each do |node_json|
    node_name = node_json['metadata']['name']
    capacity = node_json['status']['capacity']
    allocatable = node_json['status']['allocatable']

    node_cpu_capacity = get_cpu_value(capacity['cpu'])
    emit_node_metric(node_name, '.cpu.capacity', node_cpu_capacity)
    node_cpu_allocatable = get_cpu_value(allocatable['cpu'])
    emit_node_metric(node_name, '.cpu.allocatable', node_cpu_allocatable)
    node_memory_capacity = get_memory_value(capacity['memory'])
    emit_node_metric(node_name, '.memory.capacity', node_memory_capacity)
    node_memory_allocatable = get_memory_value(allocatable['memory'])
    emit_node_metric(node_name, '.memory.allocatable', node_memory_allocatable)

    # Nodes the other scrapers have not reported on fall back to empty units.
    node_req_lim = @mutex_node_req_lim.synchronize { @@node_requests_limits_metrics_map[node_name] } || UsageMetricsUnit.new
    node_res_usage = @mutex_node_res_usage.synchronize { @@node_resource_usage_metrics_map[node_name] } || ResourceUsageMetricsUnit.new

    # https://github.com/kubernetes/heapster/blob/c78cc312ab3901acfe5c2f95f7a621909c8455ad/metrics/processors/node_autoscaling_enricher.go#L62
    # Utilization and reservation are both fractions of allocatable; the
    # original multiplied utilization by allocatable instead of dividing.
    cpu_usage_millicores = node_res_usage.instance_variable_get(:@cpu_usage).to_f / 1_000_000 # nano cores -> milli cores
    emit_node_metric(node_name, '.cpu.utilization', node_ratio(cpu_usage_millicores, node_cpu_allocatable))
    emit_node_metric(node_name, '.cpu.reservation',
                     node_ratio(node_req_lim.instance_variable_get(:@cpu_request), node_cpu_allocatable))

    # NOTE(review): assumes get_memory_value reports mebibytes, so usage in
    # bytes is divided by 1024 * 1024 to match.
    memory_usage_mebibytes = node_res_usage.instance_variable_get(:@memory_usage).to_f / 1_048_576
    emit_node_metric(node_name, '.memory.utilization', node_ratio(memory_usage_mebibytes, node_memory_allocatable))
    emit_node_metric(node_name, '.memory.reservation',
                     node_ratio(node_req_lim.instance_variable_get(:@memory_request), node_memory_allocatable))
  end

  # Reset the shared maps only after every node has been reported; doing it
  # inside the loop left all nodes after the first one with empty data.
  @mutex_node_req_lim.synchronize { @@node_requests_limits_metrics_map = {} }
  @mutex_node_res_usage.synchronize { @@node_resource_usage_metrics_map = {} }
end
524
+
525
# REST resource for the '/nodes' endpoint, used by the resource usage
# scraper to list the nodes it then queries. Built and logged once, then
# reused; the original re-created it on every call even though it stored
# the result in @resource_usage_api.
def resource_usage_api
  @resource_usage_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/nodes'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping node metrics")
      end
    end
end
534
+
535
# Timer callback: fetches the node list and hands the response to
# handle_resource_usage_response.
def scrape_resource_usage_metrics
  response = resource_usage_api.get(@client.headers)
  handle_resource_usage_response(response)
rescue StandardError => e
  # The request itself can raise (connection errors, HTTP errors). Log it
  # here so the failure does not escape into the timer callback.
  log.error "Failed to scrape resource usage metrics, error=#{e.inspect}"
  log.error_backtrace
end
539
+
540
+ # This method is used to handle the node list response from the kube apiserver api
541
# Parses a 2xx node-list response and passes it to
# process_resource_usage_res; any other status is logged. Errors raised
# while parsing or processing are logged, not re-raised.
def handle_resource_usage_response(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_at = Time.now
    process_resource_usage_res MultiJson.load(response.body)
  else
    log.error "Expected 2xx from nodes API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # Use the rescued exception directly; $ERROR_INFO is only defined when the
  # 'English' library has been required.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
553
+
554
# Fetches the stats summary of one node through the API server proxy path
# and adds every pod's CPU/memory usage to the namespace and node totals.
# Must be called while @mutex_node_res_usage is held.
def accumulate_node_resource_usage(node_name)
  @client.discover unless @client.discovered
  node_rest_client =
    @client.rest_client["/nodes/#{node_name}:#{@kubelet_port}/proxy/stats/summary"].tap do |endpoint|
      log.info("Use URL #{endpoint.url} for scraping resource usage metrics")
    end

  node_response = MultiJson.load(node_rest_client.get(@client.headers).body)
  Array(node_response['pods']).each do |pod_json|
    # Stats may be missing for a pod; count those as zero usage.
    pod_cpu_usage = pod_json.dig('cpu', 'usageNanoCores') || 0
    pod_memory_usage = pod_json.dig('memory', 'usageBytes') || 0
    pod_namespace = pod_json.dig('podRef', 'namespace')

    namespace_usage = (@@namespace_resource_usage_metrics_map[pod_namespace] ||= ResourceUsageMetricsUnit.new)
    namespace_usage.add_resource_usage_metrics(pod_cpu_usage, pod_memory_usage)

    node_usage = (@@node_resource_usage_metrics_map[node_name] ||= ResourceUsageMetricsUnit.new)
    node_usage.add_resource_usage_metrics(pod_cpu_usage, pod_memory_usage)
  end
end

# For every node in the node list, collects pod resource usage, then emits
# usage metrics per namespace and for the whole cluster. Per-node totals
# stay in @@node_resource_usage_metrics_map for the node scraper.
#
# @param response [Hash] parsed node list; nodes are read from 'items'
def process_resource_usage_res(response)
  @scraped_node_at = Time.now
  @mutex_node_res_usage.synchronize do
    Array(response['items']).each do |node_json|
      node_name = node_json['metadata']['name']
      begin
        accumulate_node_resource_usage(node_name)
      rescue StandardError => e
        # One unreachable node should not discard the metrics of the others.
        log.error "Failed to scrape resource usage metrics from node #{node_name}, error=#{e.inspect}"
      end
    end
  end

  cluster_usage_metrics = ResourceUsageMetricsUnit.new
  @@namespace_resource_usage_metrics_map.each do |namespace_name, namespace_usage|
    cluster_usage_metrics.add_resource_usage_metrics(namespace_usage.instance_variable_get(:@cpu_usage),
                                                     namespace_usage.instance_variable_get(:@memory_usage))
    emit_resource_usage_metrics(generate_tag('namespace'), @scraped_at, { 'name' => namespace_name }, namespace_usage)
  end
  emit_resource_usage_metrics(generate_tag('cluster'), @scraped_at, { 'name' => @cluster_name }, cluster_usage_metrics)
ensure
  # Always start the next scrape from an empty namespace map, even when this
  # one failed part-way through.
  @@namespace_resource_usage_metrics_map = {}
end
602
+ end
603
+ end
604
+ end