fluent-plugin-k8s-metrics-agg 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,604 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ require 'time'
14
+
15
+ require 'fluent/plugin/input'
16
+ require 'kubeclient'
17
+ require 'multi_json'
18
+ module Fluent
19
+ module Plugin
20
+ class KubernetesMetricsAggregatorInput < Fluent::Plugin::Input
21
# Class-level working state for the scrapers below. These are class variables,
# so they are shared by every instance of this plugin class.
# Pod namespace => UsageMetricsUnit with summed CPU/memory limits and requests.
+ @@namespace_usage_metrics_map = {}
22
# Node name => UsageMetricsUnit built from the pod list; read when node metrics are emitted.
+ @@node_requests_limits_metrics_map = {}
23
+
24
# Pod namespace => ResourceUsageMetricsUnit with summed CPU/memory usage.
+ @@namespace_resource_usage_metrics_map = {}
25
# Node name => ResourceUsageMetricsUnit built from the per-node stats summaries; read when node metrics are emitted.
+ @@node_resource_usage_metrics_map = {}
26
+
27
# Accumulates CPU and memory limits/requests parsed from Kubernetes resource
# quantity strings (e.g. "500m", "0.5", "128Mi", "1Gi").
#
# CPU totals are kept in millicores. Memory totals are kept in mebibytes, the
# unit in which a "Mi" quantity has multiplier 1 (callers re-feed totals as
# "<total>Mi").
class UsageMetricsUnit
  # Number of bytes represented by each supported memory suffix.
  # https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory
  MEMORY_SUFFIX_BYTES = {
    'Ki' => 1024, 'Mi' => 1024**2, 'Gi' => 1024**3,
    'Ti' => 1024**4, 'Pi' => 1024**5, 'Ei' => 1024**6,
    'k' => 1000, 'K' => 1000, 'M' => 1000**2, 'G' => 1000**3,
    'T' => 1000**4, 'P' => 1000**5, 'E' => 1000**6
  }.freeze

  BYTES_PER_MEBIBYTE = 1024**2

  # Leading numeric part of a quantity: integer, decimal or exponent form.
  QUANTITY_NUMBER = /\A\s*([+-]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?)/.freeze

  attr_reader :cpu_limit, :cpu_request, :memory_limit, :memory_request

  def initialize
    @cpu_limit = 0
    @cpu_request = 0
    @memory_limit = 0
    @memory_request = 0
  end

  # Adds one set of limits/requests to the running totals.
  #
  # @param cpu_limit [String] CPU quantity, e.g. "500m" or "2"
  # @param cpu_request [String] CPU quantity
  # @param memory_limit [String] memory quantity, e.g. "128Mi"
  # @param memory_request [String] memory quantity
  def add_usage_metrics(cpu_limit, cpu_request, memory_limit, memory_request)
    @cpu_limit += get_cpu_or_memory_value(cpu_limit) * get_cpu_mult(cpu_limit)
    @cpu_request += get_cpu_or_memory_value(cpu_request) * get_cpu_mult(cpu_request)
    @memory_limit += get_cpu_or_memory_value(memory_limit) * get_memory_mult(memory_limit)
    @memory_request += get_cpu_or_memory_value(memory_request) * get_memory_mult(memory_request)
  end

  # Stores the millicore multiplier in @cpu_mult and returns the numeric part
  # of the quantity. Kept for backward compatibility.
  def get_cpu_value_and_multiplier(cpu)
    # m cpu is assumed standard
    @cpu_mult = get_cpu_mult(cpu)
    get_cpu_or_memory_value(cpu)
  end

  # Returns the numeric part of a quantity without applying its suffix.
  # Integral inputs give an Integer ("100m" => 100); decimal or exponent
  # inputs give a Float ("0.5" => 0.5). Unparseable or nil input gives 0.
  def get_cpu_or_memory_value(resource)
    number = resource.to_s[QUANTITY_NUMBER, 1]
    return 0 if number.nil?

    number =~ /\A[+-]?\d+\z/ ? number.to_i : number.to_f
  end

  # Multiplier converting a CPU quantity to millicores: 1 when the quantity
  # already ends in "m", 1000 (whole cores) otherwise.
  def get_cpu_mult(cpu)
    cpu.to_s.strip.end_with?('m') ? 1 : 1000
  end

  # Multiplier converting a memory quantity to mebibytes. A quantity with no
  # suffix, or with an unrecognised one, is treated as plain bytes.
  # Returns an Integer when the factor is exact (Mi => 1, Gi => 1024, ...).
  def get_memory_mult(memory)
    suffix = memory.to_s.strip[/[A-Za-z]+\z/]
    bytes = MEMORY_SUFFIX_BYTES.fetch(suffix, 1)
    if (bytes % BYTES_PER_MEBIBYTE).zero?
      bytes / BYTES_PER_MEBIBYTE
    else
      bytes.to_f / BYTES_PER_MEBIBYTE
    end
  end
end
96
+
97
# Running totals of CPU and memory usage. The values are added as given;
# callers in this file pass usageNanoCores and usageBytes figures.
class ResourceUsageMetricsUnit
  attr_reader :cpu_usage, :memory_usage

  def initialize
    @cpu_usage = 0
    @memory_usage = 0
  end

  # Adds one usage sample to the totals. A nil value (a sample that carries
  # no figure) contributes nothing instead of raising TypeError.
  #
  # @param cpu_usage [Numeric, nil]
  # @param memory_usage [Numeric, nil]
  def add_resource_usage_metrics(cpu_usage, memory_usage)
    @cpu_usage += cpu_usage || 0
    @memory_usage += memory_usage || 0
  end
end
108
+
109
# Registers this class as the `kubernetes_metrics_aggregator` input plugin.
+ Fluent::Plugin.register_input('kubernetes_metrics_aggregator', self)
110
+
111
# The timer helper provides `timer_execute`, which `start` uses to schedule the scrapers.
+ helpers :timer
112
+
113
+ desc 'URL of the kubernetes API server.'
114
+ config_param :kubernetes_url, :string, default: nil
115
+
116
# NOTE(review): besides the per-node stats summary path, this port is also
# interpolated into the API URL built in init_without_kubeconfig when
# `kubernetes_url` is set -- confirm that is intended.
+ desc 'The port that kubelet is listening to.'
117
+ config_param :kubelet_port, :integer, default: 10_250
118
+
119
# A `*` in the tag is replaced by the item name (see parse_tag / generate_tag).
+ desc 'The tag of the event.'
120
+ config_param :tag, :string, default: 'kubernetes.metrics.*'
121
+
122
+ desc 'How often it pulls metrics.'
123
+ config_param :interval, :time, default: "15s"
124
+
125
+ desc 'Path to a kubeconfig file points to a cluster the plugin should collect metrics from. Mostly useful when running fluentd outside of the cluster. When `kubeconfig` is set, `kubernetes_url`, `client_cert`, `client_key`, `ca_file`, `insecure_ssl`, `bearer_token_file`, and `secret_dir` will all be ignored.'
126
+ config_param :kubeconfig, :string, default: nil
127
+
128
+ desc 'Path to the certificate file for this client.'
129
+ config_param :client_cert, :string, default: nil
130
+
131
+ desc 'Path to the private key file for this client.'
132
+ config_param :client_key, :string, default: nil
133
+
134
+ desc 'Path to the CA file.'
135
+ config_param :ca_file, :string, default: nil
136
+
137
+ desc "If `insecure_ssl` is set to `true`, it won't verify apiserver's certificate."
138
+ config_param :insecure_ssl, :bool, default: false
139
+
140
+ desc 'Path to the file contains the API token. By default it reads from the file "token" in the `secret_dir`.'
141
+ config_param :bearer_token_file, :string, default: nil
142
+
143
+ desc "Path of the location where pod's service account's credentials are stored."
144
+ config_param :secret_dir, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount'
145
+
146
+ desc 'The name of the cluster, where the plugin is deployed.'
147
+ config_param :cluster_name, :string, default: 'cluster_name'
148
+
149
# Configuration hook: after the parent class has processed `conf`, creates
# the two mutexes used around the node-level maps, splits the tag pattern
# and builds the Kubernetes client.
def configure(conf)
  super

  @mutex_node_req_lim, @mutex_node_res_usage = Mutex.new, Mutex.new

  parse_tag
  initialize_client
end
156
+
157
# Start hook: schedules the three scrapers, each firing every `@interval`.
def start
  super

  scrapers = {
    limits_request_scraper: :scrape_limits_requests_metrics,
    node_scraper: :scrape_node_metrics,
    resource_usage_scraper: :scrape_resource_usage_metrics
  }
  # Hash iteration follows insertion order, so timers are registered in the
  # order listed above.
  scrapers.each do |timer_name, scraper|
    timer_execute timer_name, @interval, &method(scraper)
  end
end
165
+
166
# Close hook: finishes every watcher when @watchers has been set, then
# defers to the parent class.
def close
  if @watchers
    @watchers.each { |watcher| watcher.finish }
  end

  super
end
171
+
172
+ private
173
+
174
# Splits `@tag` at its first `*` into @tag_prefix and @tag_suffix. Both stay
# nil when the tag has no wildcard.
#
# The split is limited to two parts so that an empty prefix or suffix is
# preserved (a tag of just "*" still counts as a wildcard) and any further
# `*` characters remain in the suffix instead of being dropped.
def parse_tag
  @tag_prefix = @tag_suffix = nil
  return unless @tag.include?('*')

  @tag_prefix, @tag_suffix = @tag.split('*', 2)
end
177
+
178
# Builds the event tag for `item_name` by substituting it for the wildcard
# in the configured tag.
#
# @param item_name [String] e.g. 'pod', 'node', 'cluster'
# @return [String] a new String on every call. Callers in this file append
#   to the result with `<<`, so when the tag has no wildcard a copy is
#   returned rather than @tag itself; otherwise each append would
#   permanently grow the configured tag.
def generate_tag(item_name)
  return @tag.dup unless @tag_prefix

  [@tag_prefix, item_name, @tag_suffix].join
end
183
+
184
# Builds @client from the current context of the kubeconfig file at
# @kubeconfig.
#
# @param options [Hash] extra client options (initialize_client passes
#   `timeouts`); the context's ssl and auth options are merged on top.
def init_with_kubeconfig(options = {})
  config = Kubeclient::Config.read @kubeconfig
  current_context = config.context

  client_options = options.merge(
    ssl_options: current_context.ssl_options,
    auth_options: current_context.auth_options
  )

  # Pass the options as keywords rather than as a trailing positional Hash:
  # Ruby 3 no longer converts a positional Hash into keyword arguments.
  @client = Kubeclient::Client.new(
    current_context.api_endpoint,
    current_context.api_version,
    **client_options
  )
end
197
+
198
# Builds @client from the plugin parameters, falling back to the in-pod
# service account environment and credential files.
# Mostly borrowed from Fluentd Kubernetes Metadata Filter Plugin.
#
# @param _options [Hash] accepted for symmetry with init_with_kubeconfig but
#   not applied here. NOTE(review): the timeouts built in initialize_client
#   therefore only take effect on the kubeconfig path -- confirm intended.
# @raise [Fluent::ConfigError] when no API URL can be determined, or when the
#   API validity check raises KubeException.
def init_without_kubeconfig(_options = {})
  api_version = 'v1'

  kubernetes_url_final =
    if @kubernetes_url.nil?
      # Use Kubernetes default service account if we're in a pod.
      env_host = ENV['KUBERNETES_SERVICE_HOST']
      env_port = ENV['KUBERNETES_SERVICE_PORT']
      "https://#{env_host}:#{env_port}/api/" if env_host && env_port
    else
      # NOTE(review): the kubelet port is used for the API server URL here.
      "https://#{@kubernetes_url}:#{@kubelet_port}/api/"
    end

  raise Fluent::ConfigError, 'kubernetes url is not set in configuration, or environment variables' unless kubernetes_url_final

  # Use SSL certificate and bearer token from Kubernetes service account.
  if Dir.exist?(@secret_dir)
    secret_ca_file = File.join(@secret_dir, 'ca.crt')
    secret_token_file = File.join(@secret_dir, 'token')

    @ca_file = secret_ca_file if @ca_file.nil? && File.exist?(secret_ca_file)
    @bearer_token_file = secret_token_file if @bearer_token_file.nil? && File.exist?(secret_token_file)
  end

  ssl_options = {
    client_cert: @client_cert && OpenSSL::X509::Certificate.new(File.read(@client_cert)),
    client_key: @client_key && OpenSSL::PKey::RSA.new(File.read(@client_key)),
    ca_file: @ca_file,
    verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
  }

  auth_options = {}
  auth_options[:bearer_token] = File.read(@bearer_token_file) if @bearer_token_file

  @client = Kubeclient::Client.new(
    kubernetes_url_final, api_version,
    ssl_options: ssl_options,
    auth_options: auth_options
  )

  begin
    @client.api_valid?
  rescue KubeException => kube_error
    # Report the version actually passed to the client; the original message
    # interpolated @api_version, which is never assigned.
    raise Fluent::ConfigError, "Invalid Kubernetes API #{api_version} endpoint #{kubernetes_url_final}: #{kube_error.message}"
  end
end
250
+
251
# Creates the Kubernetes client, choosing the kubeconfig-based or the
# parameter-based initializer depending on whether @kubeconfig is set.
# Both receive the same timeout options (10 for open, nil for read).
def initialize_client
  timeouts = { open: 10, read: nil }
  options = { timeouts: timeouts }

  return init_with_kubeconfig(options) unless @kubeconfig.nil?

  init_without_kubeconfig(options)
end
265
+
266
# Converts an ISO 8601 timestamp string into a Fluent::EventTime.
def parse_time(metric_time)
  timestamp = Time.iso8601(metric_time)
  Fluent::EventTime.from_time(timestamp)
end
269
+
270
# Rewrites every ASCII capital letter as an underscore followed by its
# lowercase form, e.g. "usageNanoCores" => "usage_nano_cores". A leading
# capital therefore produces a leading underscore.
def underscore(camlcase)
  camlcase.gsub(/[A-Z]/) do
    capital = Regexp.last_match(0)
    '_' + capital.downcase
  end
end
273
+
274
# Multiplier that converts a CPU quantity to millicores: 1 when its last
# character is "m", 1000 otherwise.
def get_cpu_mult(cpu)
  cpu[-1] == 'm' ? 1 : 1000
end
279
+
280
# Converts a CPU quantity string to millicores.
#
# The original returned value + value * multiplier (so "4" cores became 4004)
# and stripped every non-digit character (so "0.5" became 5). The numeric
# part is now parsed as written and multiplied once.
#
# @param resource [String] e.g. "4", "3920m", "0.5"
# @return [Integer, Float] millicores; 0 when no number can be parsed
def get_cpu_value(resource)
  quantity = resource.to_s
  number = quantity[/\A\s*([+-]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?)/, 1]
  return 0 if number.nil?

  cpu_val = number =~ /\A[+-]?\d+\z/ ? number.to_i : number.to_f
  cpu_val * get_cpu_mult(quantity)
end
286
+
287
# Number of bytes represented by each supported Kubernetes memory suffix.
# https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory
MEMORY_UNIT_BYTES = {
  'Ki' => 1024, 'Mi' => 1024**2, 'Gi' => 1024**3,
  'Ti' => 1024**4, 'Pi' => 1024**5, 'Ei' => 1024**6,
  'k' => 1000, 'K' => 1000, 'M' => 1000**2, 'G' => 1000**3,
  'T' => 1000**4, 'P' => 1000**5, 'E' => 1000**6
}.freeze

# Multiplier converting a memory quantity to mebibytes (the unit in which a
# "Mi" quantity has multiplier 1).
#
# The original compared the single character `memory[-2]` with two-letter
# suffixes, so those branches never matched: binary suffixes only worked by
# falling into the one-letter branches, decimal suffixes ("128M", "1G") were
# treated as bytes, "Pi" was missing and "Ei" used the Pi factor.
#
# @param memory [String] e.g. "16423892Ki", "128Mi", "1G", "1048576"
# @return [Integer, Float] Integer when the factor is exact (Mi => 1,
#   Gi => 1024, ...). No suffix or an unrecognised suffix means plain bytes.
def get_memory_mult(memory)
  suffix = memory.to_s.strip[/[A-Za-z]+\z/]
  bytes = MEMORY_UNIT_BYTES.fetch(suffix, 1)
  mebibyte = 1024**2
  (bytes % mebibyte).zero? ? bytes / mebibyte : bytes.to_f / mebibyte
end
313
+
314
# Converts a memory quantity string to the unit defined by get_memory_mult.
#
# The original returned value + value * multiplier and stripped every
# non-digit character (so "1.5Gi" was read as 15). The numeric part is now
# parsed as written and multiplied once.
#
# @param resource [String] e.g. "16423892Ki", "128Mi"
# @return [Integer, Float] 0 when no number can be parsed
def get_memory_value(resource)
  quantity = resource.to_s
  number = quantity[/\A\s*([+-]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?)/, 1]
  return 0 if number.nil?

  mem_val = number =~ /\A[+-]?\d+\z/ ? number.to_i : number.to_f
  mem_val * get_memory_mult(quantity)
end
320
+
321
# Emits four events (.cpu.limit, .cpu.request, .memory.limit,
# .memory.request) under `tag`, each carrying `labels` plus a 'value' read
# from the matching instance variable of `limits_requests_metric`.
def emit_limits_requests_metrics(tag,
                                 scraped_at,
                                 labels,
                                 limits_requests_metric)
  series = [
    ['.cpu.limit', :@cpu_limit],
    ['.cpu.request', :@cpu_request],
    ['.memory.limit', :@memory_limit],
    ['.memory.request', :@memory_request]
  ]

  emitted = series.map do |suffix, ivar|
    router.emit tag + suffix,
                Fluent::EventTime.from_time(scraped_at),
                labels.merge('value' => limits_requests_metric.instance_variable_get(ivar))
  end
  emitted.last
end
340
+
341
# Emits two events (.cpu.usage and .memory.usage) under `tag`, each carrying
# `labels` plus a 'value' read from the matching instance variable of
# `resource_usage_metric`.
def emit_resource_usage_metrics(tag,
                                scraped_at,
                                labels,
                                resource_usage_metric)
  series = [
    ['.cpu.usage', :@cpu_usage],
    ['.memory.usage', :@memory_usage]
  ]

  emitted = series.map do |suffix, ivar|
    router.emit tag + suffix,
                Fluent::EventTime.from_time(scraped_at),
                labels.merge('value' => resource_usage_metric.instance_variable_get(ivar))
  end
  emitted.last
end
352
+
353
# REST endpoint for the `/pods` listing, used to read container limits and
# requests.
#
# Memoized with `||=`: the original assigned with `=`, so the endpoint was
# rebuilt and the info line logged again on every scrape even though the
# result was stored in an instance variable.
def limits_requests_api
  @limits_requests_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/pods'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping limits requests metrics")
      end
    end
end
362
+
363
# Timer callback: fetches the pod list and hands the response to
# handle_limits_requests_res.
#
# The request itself is now rescued. Previously any error raised by the HTTP
# call escaped the callback; Fluentd's timer helper stops a timer whose
# callback raises, so one transient API failure ended this scraper until the
# plugin was restarted.
def scrape_limits_requests_metrics
  response = limits_requests_api.get(@client.headers)
  handle_limits_requests_res(response)
rescue StandardError => e
  log.error "Failed to scrape limits and requests metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
367
+
368
# Handles the response of the `/pods` request: a 2xx response is parsed as
# JSON and passed to process_limits_requests_res, anything else is logged.
# Any StandardError raised while parsing or processing is logged and
# swallowed.
def handle_limits_requests_res(response)
  # Only 2xx codes count as a successful GET.
  if (200..299).cover?(response.code)
    @scraped_at = Time.now
    process_limits_requests_res MultiJson.load(response.body)
  else
    # The original text was garbled ("ExMultiJson.load(response.body)
    # expected ...") and named the summary API.
    log.error "Expected 2xx from the pods API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # $ERROR_INFO is nil unless 'English' is required, so report the rescued
  # exception directly.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
381
+
382
# Aggregates container limits/requests from a parsed `/pods` listing and
# emits them per container, pod, namespace and for the whole cluster. The
# per-node totals are stored in @@node_requests_limits_metrics_map for
# process_node_response.
#
# Changes from the original:
# * the pod 'namespace' label now carries the namespace (it was the pod name);
# * every aggregate is fed the container's own quantity strings instead of
#   re-encoding totals as "<n>m" / "<n>Mi", which corrupted fractional totals;
# * the per-node map is rebuilt and swapped in on each scrape instead of
#   accumulated, so nodes are not double counted when the node scraper has
#   not reset the map in between;
# * a container without a `resources` section no longer raises.
#
# @param response [Hash] parsed JSON of the pod list
def process_limits_requests_res(response)
  @scraped_at = Time.now
  cluster_usage_metrics = UsageMetricsUnit.new
  node_usage_metrics_map = {}

  @mutex_node_req_lim.synchronize do
    @@namespace_usage_metrics_map = {}

    Array(response['items']).each do |pod_json|
      pod_namespace = pod_json['metadata']['namespace']
      pod_node_name = pod_json['spec']['nodeName']

      @@namespace_usage_metrics_map[pod_namespace] ||= UsageMetricsUnit.new
      node_usage_metrics_map[pod_node_name] ||= UsageMetricsUnit.new
      pod_usage_metrics = UsageMetricsUnit.new
      aggregates = [
        pod_usage_metrics,
        @@namespace_usage_metrics_map[pod_namespace],
        node_usage_metrics_map[pod_node_name],
        cluster_usage_metrics
      ]

      Array(pod_json['spec']['containers']).each do |container_json|
        quantities = container_limits_requests(container_json)
        container_usage_metrics = UsageMetricsUnit.new
        container_usage_metrics.add_usage_metrics(*quantities)
        aggregates.each { |unit| unit.add_usage_metrics(*quantities) }

        container_labels = { 'name' => container_json['name'], 'image' => container_json['image'], 'node' => pod_node_name }
        emit_limits_requests_metrics(generate_tag('container'), @scraped_at, container_labels, container_usage_metrics)
      end

      pod_labels = { 'name' => pod_json['metadata']['name'], 'namespace' => pod_namespace, 'node' => pod_node_name }
      emit_limits_requests_metrics(generate_tag('pod'), @scraped_at, pod_labels, pod_usage_metrics)
    end

    @@node_requests_limits_metrics_map = node_usage_metrics_map
  end

  @@namespace_usage_metrics_map.each do |namespace, namespace_usage_metrics|
    emit_limits_requests_metrics(generate_tag('namespace'), @scraped_at, { 'name' => namespace }, namespace_usage_metrics)
  end
  emit_limits_requests_metrics(generate_tag('cluster'), @scraped_at, { 'name' => @cluster_name }, cluster_usage_metrics)

  @@namespace_usage_metrics_map = {}
end

# Returns [cpu_limit, cpu_request, memory_limit, memory_request] quantity
# strings for one container spec, using '0' for anything not declared.
def container_limits_requests(container_json)
  resources = container_json['resources'] || {}
  limits = resources['limits'] || {}
  requests = resources['requests'] || {}

  [
    limits['cpu'] || '0',
    requests['cpu'] || '0',
    limits['memory'] || '0',
    requests['memory'] || '0'
  ]
end
451
+
452
# REST endpoint for the `/nodes` listing, used to read node capacity and
# allocatable resources.
#
# Memoized with `||=`: the original assigned with `=`, so the endpoint was
# rebuilt and the info line logged again on every scrape.
def node_api
  @node_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/nodes'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping node metrics")
      end
    end
end
461
+
462
# Timer callback: fetches the node list and hands the response to
# handle_node_response.
#
# The request itself is now rescued. Previously any error raised by the HTTP
# call escaped the callback; Fluentd's timer helper stops a timer whose
# callback raises, so one transient API failure ended this scraper.
def scrape_node_metrics
  response = node_api.get(@client.headers)
  handle_node_response(response)
rescue StandardError => e
  log.error "Failed to scrape node metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
466
+
467
# Handles the response of the `/nodes` request: a 2xx response is parsed as
# JSON and passed to process_node_response, anything else is logged.
# Any StandardError raised while parsing or processing is logged and
# swallowed.
def handle_node_response(response)
  # Only 2xx codes count as a successful GET.
  if (200..299).cover?(response.code)
    @scraped_node_at = Time.now
    process_node_response MultiJson.load(response.body)
  else
    # The original text was garbled ("ExMultiJson.load(response.body)
    # expected ...") and named the summary API.
    log.error "Expected 2xx from the nodes API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # $ERROR_INFO is nil unless 'English' is required, so report the rescued
  # exception directly.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
480
+
481
# Emits capacity, allocatable, utilization and reservation metrics for every
# node in a parsed `/nodes` listing, combining the node status with the
# per-node totals collected by the other two scrapers.
#
# Changes from the original:
# * the shared per-node maps are cleared once, after all nodes are processed;
#   they used to be cleared inside the loop, so only the first node ever saw
#   its requests and usage;
# * utilization is usage divided by allocatable (the original expression
#   `usage / 1_000_000 * allocatable` multiplied instead);
# * tags are built with `+`, so the string returned by generate_tag is never
#   mutated;
# * a zero allocatable value yields 0.0 instead of Infinity/NaN.
#
# @param response [Hash] parsed JSON of the node list
def process_node_response(response)
  event_time = Fluent::EventTime.from_time(@scraped_node_at)

  Array(response['items']).each do |node_json|
    node_name = node_json['metadata']['name']
    capacity = node_json['status']['capacity']
    allocatable = node_json['status']['allocatable']

    node_cpu_capacity = get_cpu_value(capacity['cpu'])
    node_cpu_allocatable = get_cpu_value(allocatable['cpu'])
    node_memory_capacity = get_memory_value(capacity['memory'])
    node_memory_allocatable = get_memory_value(allocatable['memory'])

    node_req_lim = @mutex_node_req_lim.synchronize { @@node_requests_limits_metrics_map[node_name] }
    node_req_lim ||= UsageMetricsUnit.new
    node_res_usage = @mutex_node_res_usage.synchronize { @@node_resource_usage_metrics_map[node_name] }
    node_res_usage ||= ResourceUsageMetricsUnit.new

    # https://github.com/kubernetes/heapster/blob/c78cc312ab3901acfe5c2f95f7a621909c8455ad/metrics/processors/node_autoscaling_enricher.go#L62
    # Nano cores -> milli cores, matching the unit of get_cpu_value.
    cpu_usage = node_res_usage.instance_variable_get(:@cpu_usage).to_f / 1_000_000
    # Bytes -> mebibytes. Assumes get_memory_value reports mebibytes (a "Mi"
    # quantity has multiplier 1) -- TODO confirm if that helper changes.
    memory_usage = node_res_usage.instance_variable_get(:@memory_usage).to_f / 1_048_576

    node_metrics = [
      ['.cpu.capacity', node_cpu_capacity],
      ['.cpu.allocatable', node_cpu_allocatable],
      ['.memory.capacity', node_memory_capacity],
      ['.memory.allocatable', node_memory_allocatable],
      ['.cpu.utilization', node_ratio(cpu_usage, node_cpu_allocatable)],
      ['.cpu.reservation', node_ratio(node_req_lim.instance_variable_get(:@cpu_request), node_cpu_allocatable)],
      ['.memory.utilization', node_ratio(memory_usage, node_memory_allocatable)],
      ['.memory.reservation', node_ratio(node_req_lim.instance_variable_get(:@memory_request), node_memory_allocatable)]
    ]
    node_metrics.each do |suffix, value|
      router.emit(generate_tag('node') + suffix, event_time, { 'node' => node_name, 'value' => value })
    end
  end

  # Both maps have been consumed; clear them so the next round starts empty.
  @mutex_node_req_lim.synchronize { @@node_requests_limits_metrics_map = {} }
  @mutex_node_res_usage.synchronize { @@node_resource_usage_metrics_map = {} }
end

# Fraction of `allocatable` taken by `amount`; 0.0 when nothing is allocatable.
def node_ratio(amount, allocatable)
  allocatable.to_f.zero? ? 0.0 : amount.to_f / allocatable
end
524
+
525
# REST endpoint for the `/nodes` listing; the resource usage scraper uses the
# node names from it to query each node's stats summary.
#
# Memoized with `||=`: the original assigned with `=`, so the endpoint was
# rebuilt and the info line logged again on every scrape.
def resource_usage_api
  @resource_usage_api ||=
    begin
      @client.discover unless @client.discovered
      @client.rest_client['/nodes'].tap do |endpoint|
        log.info("Use URL #{endpoint.url} for scraping node metrics")
      end
    end
end
534
+
535
# Timer callback: fetches the node list and hands the response to
# handle_resource_usage_response.
#
# The request itself is now rescued. Previously any error raised by the HTTP
# call escaped the callback; Fluentd's timer helper stops a timer whose
# callback raises, so one transient API failure ended this scraper.
def scrape_resource_usage_metrics
  response = resource_usage_api.get(@client.headers)
  handle_resource_usage_response(response)
rescue StandardError => e
  log.error "Failed to scrape resource usage metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
539
+
540
# Handles the response of the `/nodes` request made by the resource usage
# scraper: a 2xx response is parsed as JSON and passed to
# process_resource_usage_res, anything else is logged.
# Any StandardError raised while parsing or processing is logged and
# swallowed.
def handle_resource_usage_response(response)
  # Only 2xx codes count as a successful GET.
  if (200..299).cover?(response.code)
    @scraped_at = Time.now
    process_resource_usage_res MultiJson.load(response.body)
  else
    # The original text was garbled ("ExMultiJson.load(response.body)
    # expected ...") and named the summary API.
    log.error "Expected 2xx from the nodes API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  # $ERROR_INFO is nil unless 'English' is required, so report the rescued
  # exception directly.
  log.error "Failed to scrape metrics, error=#{e.message}, #{e.inspect}"
  log.error_backtrace
end
553
+
554
# For every node in a parsed `/nodes` listing, fetches that node's stats
# summary through the API server proxy and sums pod CPU/memory usage per
# namespace, per node and for the cluster. Namespace and cluster totals are
# emitted here; per-node totals are stored in
# @@node_resource_usage_metrics_map for process_node_response.
#
# Changes from the original:
# * a node whose summary cannot be fetched or parsed is logged and skipped
#   instead of aborting the scrape with half-filled shared maps;
# * pods without a cpu/memory section, or without a usage figure, count as 0;
# * the HTTP requests are no longer made while holding the mutex, and the
#   per-node map is swapped in once at the end instead of accumulated;
# * @scraped_node_at, which belongs to the node scraper, is no longer
#   overwritten; events are stamped with @scraped_at as before.
#
# @param response [Hash] parsed JSON of the node list
def process_resource_usage_res(response)
  node_usage_metrics_map = {}
  cluster_usage_metrics = ResourceUsageMetricsUnit.new
  @@namespace_resource_usage_metrics_map = {}

  @client.discover unless @client.discovered

  Array(response['items']).each do |node_json|
    node_name = node_json['metadata']['name']
    begin
      node_response = fetch_node_summary(node_name)
    rescue StandardError => e
      log.error "Failed to scrape resource usage of node #{node_name}, error=#{e.message}, #{e.inspect}"
      next
    end

    Array(node_response['pods']).each do |pod_json|
      pod_cpu_usage = (pod_json['cpu'] || {})['usageNanoCores'] || 0
      pod_memory_usage = (pod_json['memory'] || {})['usageBytes'] || 0
      pod_namespace = pod_json['podRef']['namespace']

      @@namespace_resource_usage_metrics_map[pod_namespace] ||= ResourceUsageMetricsUnit.new
      node_usage_metrics_map[node_name] ||= ResourceUsageMetricsUnit.new
      [
        @@namespace_resource_usage_metrics_map[pod_namespace],
        node_usage_metrics_map[node_name],
        cluster_usage_metrics
      ].each { |unit| unit.add_resource_usage_metrics(pod_cpu_usage, pod_memory_usage) }
    end
  end

  @mutex_node_res_usage.synchronize do
    @@node_resource_usage_metrics_map = node_usage_metrics_map
  end

  @@namespace_resource_usage_metrics_map.each do |namespace, namespace_usage_metrics|
    emit_resource_usage_metrics(generate_tag('namespace'), @scraped_at, { 'name' => namespace }, namespace_usage_metrics)
  end
  emit_resource_usage_metrics(generate_tag('cluster'), @scraped_at, { 'name' => @cluster_name }, cluster_usage_metrics)

  @@namespace_resource_usage_metrics_map = {}
end

# Fetches and parses the stats summary of one node via the API server proxy.
def fetch_node_summary(node_name)
  endpoint = @client.rest_client["/nodes/#{node_name}:#{@kubelet_port}/proxy/stats/summary"]
  log.info("Use URL #{endpoint.url} for scraping resource usage metrics")
  MultiJson.load(endpoint.get(@client.headers).body)
end
602
+ end
603
+ end
604
+ end