fluent-plugin-kubernetes-metrics-hbrewster 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,713 @@
1
+ #
2
+ # Copyright 2018- Splunk Inc
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ require 'time'
17
+ require 'fluent/plugin/input'
18
+ require 'kubeclient'
19
+ require 'multi_json'
20
+
21
+ module Fluent
22
+ module Plugin
23
+ class KubernetesMetricsInput < Fluent::Plugin::Input
24
+ Fluent::Plugin.register_input('kubernetes_metrics', self)
25
+
26
+ helpers :timer
27
+
28
# Plugin configuration parameters. Each `desc` string documents the
# `config_param` that immediately follows it.
desc 'The tag of the event.'
config_param :tag, :string, default: 'kubernetes.metrics.*'

desc 'How often it pulls metrics.'
config_param :interval, :time, default: '15s'

desc 'Path to a kubeconfig file that points to the cluster the plugin should collect metrics from. Mostly useful when running fluentd outside of the cluster. When `kubeconfig` is set, `kubernetes_url`, `client_cert`, `client_key`, `ca_file`, `insecure_ssl`, `bearer_token_file`, and `secret_dir` will all be ignored.'
config_param :kubeconfig, :string, default: nil

desc 'URL of the kubernetes API server.'
config_param :kubernetes_url, :string, default: nil

desc 'Path to the certificate file for this client.'
config_param :client_cert, :string, default: nil

desc 'Path to the private key file for this client.'
config_param :client_key, :string, default: nil

desc 'Path to the CA file.'
config_param :ca_file, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'

desc "If `insecure_ssl` is set to `true`, it won't verify apiserver's certificate."
config_param :insecure_ssl, :bool, default: false

desc 'Path to the file that contains the API token. By default it reads from the file "token" in the `secret_dir`.'
config_param :bearer_token_file, :string, default: nil

desc "Path of the location where pod's service account's credentials are stored."
config_param :secret_dir, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount'

desc 'Name of the node that this plugin should collect metrics from.'
config_param :node_name, :string, default: nil

desc 'Names of the nodes that this plugin should collect metrics from.'
config_param :node_names, :array, default: [], value_type: :string

desc 'The hostname or IP address that kubelet will use to connect to. If not supplied, status.hostIP of the node is used to fetch metrics from the Kubelet API (via the $KUBERNETES_NODE_IP environment variable)'
# The interpolation is kept on purpose: it turns an unset variable into ''
# rather than nil, which is what the rest of the plugin currently receives.
config_param :kubelet_address, :string, default: "#{ENV['KUBERNETES_NODE_IP']}"

desc 'The port that kubelet is listening to.'
config_param :kubelet_port, :integer, default: 10_250

desc 'Use the rest client to get the metrics from summary api on each kubelet'
config_param :use_rest_client, :bool, default: true

desc 'Use SSL for rest client.'
config_param :use_rest_client_ssl, :bool, default: true
75
+
76
# Validates the configuration, then prepares the tag template and the client.
#
# With `use_rest_client` enabled a single `node_name` is mandatory; otherwise
# `node_names` must contain at least one entry.
#
# conf is handed to the parent class through `super`.
# Raises Fluent::ConfigError when the required node setting is missing.
def configure(conf)
  super

  if @use_rest_client
    raise Fluent::ConfigError, 'node_name is required' if @node_name.nil? || @node_name.empty?
  elsif @node_names.nil? || @node_names.empty?
    raise Fluent::ConfigError, 'node_names is required'
  end

  parse_tag
  initialize_client
end
88
+
89
# Starts the plugin and registers one timer per scraper, all firing every
# `@interval`.
def start
  super

  {
    metric_scraper: :scrape_metrics,
    stats_metric_scraper: :scrape_stats_metrics,
    cadvisor_metric_scraper: :scrape_cadvisor_metrics
  }.each do |timer_name, scraper|
    timer_execute(timer_name, @interval, &method(scraper))
  end
end
96
+
97
# Finishes every registered watcher (when any exist) before delegating to the
# parent class.
def close
  if @watchers
    @watchers.each { |watcher| watcher.finish }
  end

  super
end
102
+
103
+ private
104
+
105
# Splits a wildcard tag around '*' into @tag_prefix and @tag_suffix.
# Tags without '*' leave both variables untouched.
def parse_tag
  return unless @tag.include?('*')

  @tag_prefix, @tag_suffix = @tag.split('*')
end
108
+
109
# Builds the event tag for item_name: the item is placed between the parsed
# prefix and suffix, or the configured tag is returned verbatim when the tag
# had no wildcard.
def generate_tag(item_name)
  if @tag_prefix
    "#{@tag_prefix}#{item_name}#{@tag_suffix}"
  else
    @tag
  end
end
114
+
115
# Creates @client from the current context of the kubeconfig file at
# @kubeconfig. The context's SSL and auth settings are merged over options.
def init_with_kubeconfig(options = {})
  context = Kubeclient::Config.read(@kubeconfig).context
  endpoint = context.api_endpoint
  version = context.api_version
  client_options = options.merge(
    ssl_options: context.ssl_options,
    auth_options: context.auth_options
  )

  @client = Kubeclient::Client.new(endpoint, version, client_options)
end
128
+
129
# Creates @client from explicit settings or, when running inside a pod, from
# the service-account environment (mostly borrowed from the Fluentd Kubernetes
# Metadata Filter Plugin).
#
# _options is accepted for signature parity with init_with_kubeconfig but is
# not used here.
# Raises Fluent::ConfigError when no API URL can be determined or the API
# endpoint fails validation.
def init_without_kubeconfig(_options = {})
  api_version = 'v1'

  if @kubernetes_url.nil?
    # Use Kubernetes default service account if we're in a pod.
    env_host = ENV['KUBERNETES_SERVICE_HOST']
    env_port = ENV['KUBERNETES_SERVICE_PORT']
    @kubernetes_url = "https://#{env_host}:#{env_port}/api/" if env_host && env_port
  end

  raise Fluent::ConfigError, 'kubernetes url is not set' unless @kubernetes_url

  # Use SSL certificate and bearer token from Kubernetes service account.
  if Dir.exist?(@secret_dir)
    secret_ca_file = File.join(@secret_dir, 'ca.crt')
    secret_token_file = File.join(@secret_dir, 'token')

    @ca_file = secret_ca_file if @ca_file.nil? && File.exist?(secret_ca_file)
    @bearer_token_file = secret_token_file if @bearer_token_file.nil? && File.exist?(secret_token_file)
  end

  ssl_options = {
    client_cert: @client_cert && OpenSSL::X509::Certificate.new(File.read(@client_cert)),
    client_key: @client_key && OpenSSL::PKey::RSA.new(File.read(@client_key)),
    ca_file: @ca_file,
    verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
  }

  auth_options = {}
  # Strip so a trailing newline in the token file cannot leak into the header.
  auth_options[:bearer_token] = File.read(@bearer_token_file).strip if @bearer_token_file

  @client = Kubeclient::Client.new(
    @kubernetes_url, api_version,
    ssl_options: ssl_options,
    auth_options: auth_options
  )

  begin
    @client.api_valid?
  rescue KubeException => kube_error
    # The message previously interpolated @api_version, which is never assigned.
    raise Fluent::ConfigError, "Invalid Kubernetes API #{api_version} endpoint #{@kubernetes_url}: #{kube_error.message}"
  end
end
178
+
179
# Chooses how metrics will be fetched: directly from the kubelet (REST client)
# or through a Kubeclient client built with or without a kubeconfig file.
def initialize_client
  return initialize_rest_client if @use_rest_client

  options = { timeouts: { open: 10, read: nil } }
  if @kubeconfig.nil?
    init_without_kubeconfig(options)
  else
    init_with_kubeconfig(options)
  end
end
197
+
198
# Derives the kubelet summary, stats and cadvisor URLs from the configured
# kubelet address/port, falls back to the service-account CA and token files
# found in @secret_dir, and logs the resulting URLs.
def initialize_rest_client
  host = @kubelet_address
  port = @kubelet_port

  if host && port
    scheme = @use_rest_client_ssl ? 'https' : 'http'
    base_url = "#{scheme}://#{host}:#{port}"
    @kubelet_url = "#{base_url}/stats/summary"
    @kubelet_url_stats = "#{base_url}/stats"
    @cadvisor_url = "#{base_url}/metrics/cadvisor"
  end

  if Dir.exist?(@secret_dir)
    default_ca = File.join(@secret_dir, 'ca.crt')
    default_token = File.join(@secret_dir, 'token')
    @ca_file = default_ca if @ca_file.nil? && File.exist?(default_ca)
    @bearer_token_file = default_token if @bearer_token_file.nil? && File.exist?(default_token)
  end

  log.info("Use URL #{@kubelet_url} for creating client to query kubelet summary api")
  log.info("Use URL #{@kubelet_url_stats} for creating client to query kubelet stats api")
  log.info("Use URL #{@cadvisor_url} for creating client to query cadvisor metrics api")
end
228
+
229
# Builds the TLS/auth part of the REST request options.
#
# Returns an empty hash when SSL is disabled. Otherwise returns the CA file,
# the verification mode and, only when a bearer token file is configured, an
# Authorization header. The token is stripped so a trailing newline in the
# file cannot end up inside the header value.
def set_ssl_options
  return {} unless @use_rest_client_ssl

  ssl_options = {
    ssl_ca_file: @ca_file,
    verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
  }
  # Without this guard a missing token file made File.read(nil) raise TypeError.
  if @bearer_token_file
    ssl_options[:headers] = { Authorization: "Bearer #{File.read(@bearer_token_file).strip}" }
  end
  ssl_options
end
241
+
242
# Request options for the kubelet summary API: a GET on @kubelet_url combined
# with whatever set_ssl_options returns.
def request_options
  { method: 'get', url: @kubelet_url }.merge(set_ssl_options)
end
248
+
249
# Request options for the kubelet stats API: a GET on @kubelet_url_stats
# combined with whatever set_ssl_options returns.
def request_options_stats
  { method: 'get', url: @kubelet_url_stats }.merge(set_ssl_options)
end
255
+
256
# Request options for the cadvisor metrics API: a GET on @cadvisor_url
# combined with whatever set_ssl_options returns.
def cadvisor_request_options
  { method: 'get', url: @cadvisor_url }.merge(set_ssl_options)
end
262
+
263
# Returns the REST resource for a node's proxied summary endpoint and stores
# it in @summary_api.
# @client.proxy_url only returns the url, but we need the resource, not just the url
def summary_proxy_api(node)
  @client.discover unless @client.discovered
  endpoint = @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/stats/summary"]
  log.info("Use URL #{endpoint.url} for scraping metrics")
  @summary_api = endpoint
end
273
+
274
# Returns the REST resource for a node's proxied stats endpoint and stores it
# in @stats_api.
def stats_proxy_api(node)
  @client.discover unless @client.discovered
  endpoint = @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/stats"]
  log.info("Use URL #{endpoint.url} for scraping stats metrics")
  @stats_api = endpoint
end
283
+
284
# Returns the REST resource for a node's proxied cadvisor endpoint and stores
# it in @cadvisor_api.
def cadvisor_proxy_api(node)
  @client.discover unless @client.discovered
  endpoint = @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/metrics/cadvisor"]
  log.info("Use URL #{endpoint.url} for scraping metrics")
  @cadvisor_api = endpoint
end
293
+
294
# Converts an ISO 8601 timestamp string into a Fluent::EventTime.
def parse_time(metric_time)
  parsed = Time.iso8601(metric_time)
  Fluent::EventTime.from_time(parsed)
end
297
+
298
# Converts a camelCase name to snake_case by prefixing every ASCII capital
# with '_' and lower-casing it.
def underscore(camlcase)
  camlcase.gsub(/[A-Z]/) { |capital| '_' + capital.downcase }
end
301
+
302
# Emits "<tag>.uptime": the seconds between start_time (ISO 8601 string) and
# the moment of the last scrape (@scraped_at). Nothing is emitted when
# start_time is nil.
def emit_uptime(tag:, start_time:, labels:)
  return if start_time.nil?

  uptime = @scraped_at - Time.iso8601(start_time)
  uptime_tag = generate_tag("#{tag}.uptime")
  router.emit(uptime_tag, Fluent::EventTime.from_time(@scraped_at), labels.merge('value' => uptime))
end
308
+
309
# Emits "<tag>.cpu.usage_rate" (usageNanoCores divided by 1_000_000) and
# "<tag>.cpu.usage" (raw usageNanoCores). Nothing is emitted when the sample
# has no 'time' or no 'usageNanoCores'.
# NOTE(review): both metrics read 'usageNanoCores'; confirm that
# "cpu.usage" is not meant to come from a different field.
def emit_cpu_metrics(tag:, metrics:, labels:)
  return if metrics['time'].nil?

  time = parse_time(metrics['time'])
  nano_cores = metrics['usageNanoCores']
  return unless nano_cores

  router.emit(generate_tag("#{tag}.cpu.usage_rate"), time, labels.merge('value' => nano_cores / 1_000_000))
  router.emit(generate_tag("#{tag}.cpu.usage"), time, labels.merge('value' => nano_cores))
end
320
+
321
# Emits one "<tag>.memory.<snake_case_name>" event per memory field present
# in metrics. Nothing is emitted when the sample has no 'time'.
def emit_memory_metrics(tag:, metrics:, labels:)
  return if metrics['time'].nil?

  time = parse_time(metrics['time'])
  %w[availableBytes usageBytes workingSetBytes rssBytes pageFaults majorPageFaults].each do |name|
    value = metrics[name]
    next unless value

    router.emit(generate_tag("#{tag}.memory.#{underscore(name)}"), time, labels.merge('value' => value))
  end
end
331
+
332
# Emits "<tag>.network.<counter>" for every rx/tx counter present on every
# interface listed in metrics['interfaces'], labelling each event with the
# interface name. Nothing is emitted when the sample has no 'time'.
def emit_network_metrics(tag:, metrics:, labels:)
  return if metrics['time'].nil?

  time = parse_time(metrics['time'])
  Array(metrics['interfaces']).each do |interface|
    interface_name = interface['name']
    %w[rxBytes rxErrors txBytes txErrors].each do |counter|
      value = interface[counter]
      next unless value

      record = labels.merge('value' => value, 'interface' => interface_name)
      router.emit(generate_tag("#{tag}.network.#{underscore(counter)}"), time, record)
    end
  end
end
345
+
346
# Emits "<tag>.<snake_case_name>" for each filesystem field present in
# metrics. Nothing is emitted when the sample has no 'time'.
def emit_fs_metrics(tag:, metrics:, labels:)
  return if metrics['time'].nil?

  time = parse_time(metrics['time'])
  %w[availableBytes capacityBytes usedBytes inodesFree inodes inodesUsed].each do |field|
    value = metrics[field]
    next unless value

    router.emit(generate_tag("#{tag}.#{underscore(field)}"), time, labels.merge('value' => value))
  end
end
356
+
357
# Emits the node's 'maxpid' and 'curproc' rlimit values, when present, under
# the "node.runtime.imagefs.<name>" tags. Nothing is emitted when rlimit has
# no 'time'.
def emit_node_rlimit_metrics(node_name, rlimit)
  return if rlimit['time'].nil?

  time = parse_time(rlimit['time'])
  %w[maxpid curproc].each do |limit_name|
    value = rlimit[limit_name]
    if value
      router.emit(generate_tag("node.runtime.imagefs.#{limit_name}"), time,
                  'value' => value,
                  'node' => node_name)
    end
  end
end
369
+
370
# Emits uptime, CPU and memory metrics for one system container under the
# 'sys-container' tag. Containers without a 'startTime' are skipped entirely.
def emit_system_container_metrics(node_name, container)
  labels = { 'node' => node_name, 'name' => container['name'] }
  started_at = container['startTime']
  return if started_at.nil?

  tag = 'sys-container'
  cpu = container['cpu']
  memory = container['memory']

  emit_uptime(tag: tag, start_time: started_at, labels: labels)
  emit_cpu_metrics(tag: tag, metrics: cpu, labels: labels) unless cpu.nil?
  emit_memory_metrics(tag: tag, metrics: memory, labels: labels) unless memory.nil?
end
379
+
380
# Emits node-level metrics from the most recent entry (the last element) of
# the stats list, dispatching each present section to its emitter.
#
# Returns without emitting when the list is empty or the latest entry carries
# no 'timestamp'. Events are labelled with @node_name.
def emit_stats_breakdown(stats)
  stats_latest = stats.last
  # An empty list used to fail with NoMethodError on nil.
  return if stats_latest.nil? || stats_latest['timestamp'].nil?

  stats_timestamp = parse_time(stats_latest['timestamp'])
  labels = { 'node' => @node_name }

  {
    'cpu' => :emit_cpu_metrics_stats,
    'diskio' => :emit_diskio_metrics_stats,
    'memory' => :emit_memory_metrics_stats,
    'network' => :emit_network_metrics_stats,
    'filesystem' => :emit_filesystem_metrics_stats,
    'task_stats' => :emit_tasks_stats_metrics_stats
  }.each do |section, emitter|
    section_metrics = stats_latest[section]
    next if section_metrics.nil?

    send(emitter, tag: 'node', metrics: section_metrics, labels: labels, time: stats_timestamp)
  end
end
411
+
412
# Emits node CPU statistics: usage total/user/system (each divided by
# 1_000_000), the CFS counters and the load average. A missing 'usage' or
# 'cfs' section is skipped instead of raising, so the remaining values are
# still emitted.
def emit_cpu_metrics_stats(tag:, metrics:, labels:, time:)
  %w[total user system].each do |name|
    value = metrics.dig('usage', name)
    next unless value

    router.emit generate_tag("#{tag}.cpu.usage.#{name}"), time, labels.merge('value' => value / 1_000_000)
  end

  %w[periods throttled_periods throttled_time].each do |name|
    value = metrics.dig('cfs', name)
    next unless value

    router.emit generate_tag("#{tag}.cpu.cfs.#{name}"), time, labels.merge('value' => value)
  end

  load_average = metrics['load_average']
  return unless load_average

  router.emit generate_tag("#{tag}.cpu.load_average"), time, labels.merge('value' => load_average)
end
436
+
437
# Emits per-device disk I/O statistics. For every known diskio metric and
# every device entry it emits:
#   <tag>.diskio.<metric>.major / .minor   the device numbers, when present
#   <tag>.diskio.<metric>.stats.<key>      one event per entry of 'stats'
# Each event is labelled with the device name.
#
# The major/minor tags were previously built without the '.' after "diskio"
# and with a trailing '.'; they now follow the same pattern as the stats tags.
# A device without a 'stats' section is skipped instead of raising.
def emit_diskio_metrics_stats(tag:, metrics:, labels:, time:)
  %w[io_service_bytes io_serviced io_queued sectors io_service_time io_wait_time io_merged io_time].each do |metric_name|
    devices = metrics[metric_name]
    next unless devices

    base_tag = "#{tag}.diskio.#{metric_name}"
    devices.each do |device|
      device_labels = labels.merge('device' => device['device'])

      %w[major minor].each do |number|
        value = device[number]
        next unless value

        router.emit generate_tag("#{base_tag}.#{number}"), time, device_labels.merge('value' => value)
      end

      (device['stats'] || {}).each do |stat_name, stat_value|
        router.emit generate_tag("#{base_tag}.stats.#{stat_name}"), time, device_labels.merge('value' => stat_value)
      end
    end
  end
end
456
+
457
# Emits "<tag>.memory.<name>" for each top-level memory counter present, then
# "<tag>.memory.<group>.<key>" for every entry of the 'container_data' and
# 'hierarchical_data' groups.
def emit_memory_metrics_stats(tag:, metrics:, labels:, time:)
  prefix = "#{tag}.memory."

  %w[usage max_usage cache rss swap working_set failcnt].each do |counter|
    value = metrics[counter]
    next unless value

    router.emit(generate_tag(prefix + counter), time, labels.merge('value' => value))
  end

  %w[container_data hierarchical_data].each do |group_name|
    group = metrics[group_name]
    next unless group

    group.each do |key, value|
      router.emit(generate_tag(prefix + group_name + '.' + key), time, labels.merge('value' => value))
    end
  end
end
472
+
473
# Emits network statistics in three passes:
#   1. the top-level rx/tx counters, tagged with metrics['name'];
#   2. the same counters for every entry of metrics['interfaces'];
#   3. every key/value pair of the tcp, tcp6, udp and udp6 groups.
def emit_network_metrics_stats(tag:, metrics:, labels:, time:)
  counters = %w[rx_bytes rx_packets rx_errors rx_dropped tx_bytes tx_packets tx_errors tx_dropped]
  prefix = "#{tag}.network."
  network_name = metrics['name']

  counters.each do |counter|
    value = metrics[counter]
    next unless value

    router.emit(generate_tag(prefix + network_name + '.' + counter), time, labels.merge('value' => value))
  end

  (metrics['interfaces'] || []).each do |interface|
    interface_name = interface['name']
    counters.each do |counter|
      value = interface[counter]
      next unless value

      router.emit(generate_tag(prefix + interface_name + '.' + counter), time, labels.merge('value' => value))
    end
  end

  %w[tcp tcp6 udp udp6].each do |protocol|
    protocol_stats = metrics[protocol]
    next unless protocol_stats

    protocol_stats.each do |stat_name, stat_value|
      router.emit(generate_tag(prefix + protocol + '.' + stat_name), time, labels.merge('value' => stat_value))
    end
  end
end
501
+
502
# Emits "<tag>.filesystem.<key>" for every field of every filesystem entry,
# except the descriptive fields 'device', 'type' and 'has_inodes'. Each event
# is labelled with the entry's device and type.
def emit_filesystem_metrics_stats(tag:, metrics:, labels:, time:)
  descriptive_fields = %w[device type has_inodes]

  metrics.each do |file_system|
    fs_labels = labels.merge('device' => file_system['device'], 'type' => file_system['type'])
    file_system.each do |field, value|
      next if descriptive_fields.include?(field)

      router.emit(generate_tag("#{tag}.filesystem." + field), time, fs_labels.merge('value' => value))
    end
  end
end
514
+
515
# Emits "<tag>.tasks_stats.<key>" for every key/value pair in metrics.
def emit_tasks_stats_metrics_stats(tag:, metrics:, labels:, time:)
  metrics.each do |task_key, task_value|
    task_tag = generate_tag("#{tag}.tasks_stats." + task_key)
    router.emit(task_tag, time, labels.merge('value' => task_value))
  end
end
521
+
522
# Emits every node-level metric of a summary response: uptime, CPU, memory,
# network, root filesystem, image filesystem, rlimits and system containers.
# Nodes without a 'startTime' are skipped entirely; each section is emitted
# only when present. A missing 'runtime' section no longer raises.
def emit_node_metrics(node)
  return if node['startTime'].nil?

  node_name = node['nodeName']
  tag = 'node'
  labels = { 'node' => node_name }

  emit_uptime tag: tag, start_time: node['startTime'], labels: labels
  emit_cpu_metrics tag: tag, metrics: node['cpu'], labels: labels unless node['cpu'].nil?
  emit_memory_metrics tag: tag, metrics: node['memory'], labels: labels unless node['memory'].nil?
  emit_network_metrics tag: tag, metrics: node['network'], labels: labels unless node['network'].nil?
  emit_fs_metrics tag: "#{tag}.fs", metrics: node['fs'], labels: labels unless node['fs'].nil?

  # dig tolerates a summary without a 'runtime' section.
  image_fs = node.dig('runtime', 'imageFs')
  emit_fs_metrics tag: "#{tag}.imagefs", metrics: image_fs, labels: labels unless image_fs.nil?

  emit_node_rlimit_metrics node_name, node['rlimit'] unless node['rlimit'].nil?

  Array(node['systemContainers']).each do |container|
    emit_system_container_metrics node_name, container unless container.nil?
  end
end
554
+
555
# Emits uptime, CPU, memory, rootfs and logs metrics for one container, using
# the pod's labels plus a 'container-name' label. Containers without a
# 'startTime' are skipped entirely.
def emit_container_metrics(pod_labels, container)
  labels = pod_labels.merge('container-name' => container['name'])
  started_at = container['startTime']
  return if started_at.nil?

  tag = 'container'
  cpu = container['cpu']
  memory = container['memory']
  rootfs = container['rootfs']
  logs = container['logs']

  emit_uptime(tag: tag, start_time: started_at, labels: labels)
  emit_cpu_metrics(tag: tag, metrics: cpu, labels: labels) unless cpu.nil?
  emit_memory_metrics(tag: tag, metrics: memory, labels: labels) unless memory.nil?
  emit_fs_metrics(tag: "#{tag}.rootfs", metrics: rootfs, labels: labels) unless rootfs.nil?
  emit_fs_metrics(tag: "#{tag}.logs", metrics: logs, labels: labels) unless logs.nil?
end
566
+
567
# Emits all metrics of one pod: uptime, CPU, memory, network,
# ephemeral storage, each volume and each container. Labels are the podRef
# fields prefixed with 'pod-' plus the node name. Pods without a 'startTime'
# are skipped entirely.
def emit_pod_metrics(node_name, pod)
  tag = 'pod'
  labels = pod['podRef'].transform_keys { |key| 'pod-' + key }
  labels['node'] = node_name
  return if pod['startTime'].nil?

  emit_uptime(tag: tag, start_time: pod['startTime'], labels: labels)
  emit_cpu_metrics(tag: tag, metrics: pod['cpu'], labels: labels) if pod['cpu']
  emit_memory_metrics(tag: tag, metrics: pod['memory'], labels: labels) if pod['memory']
  emit_network_metrics(tag: tag, metrics: pod['network'], labels: labels) unless pod['network'].nil?

  ephemeral_storage = pod['ephemeral-storage']
  unless ephemeral_storage.nil?
    emit_fs_metrics(tag: "#{tag}.ephemeral-storage", metrics: ephemeral_storage, labels: labels)
  end

  Array(pod['volume']).each do |volume|
    next if volume.nil?

    emit_fs_metrics(tag: "#{tag}.volume", metrics: volume, labels: labels.merge('name' => volume['name']))
  end

  Array(pod['containers']).each do |container|
    emit_container_metrics(labels, container) unless container.nil?
  end
end
590
+
591
# Emits the node metrics and then the metrics of every pod contained in a
# parsed summary response.
#
# Pods are labelled with the node's 'nodeName'. When the response carries pods
# but no 'node' section the pods are still emitted (with a nil node name)
# instead of failing with NoMethodError.
def emit_metrics(metrics)
  node = metrics['node']
  emit_node_metrics(node) unless node.nil?

  node_name = node && node['nodeName']
  Array(metrics['pods']).each { |pod| emit_pod_metrics(node_name, pod) }
end
595
+
596
# Hands the 'stats' list of a parsed stats response to emit_stats_breakdown;
# does nothing when the response has no 'stats' entry.
def emit_stats_metrics(metrics)
  stats = metrics['stats']
  return if stats.nil?

  emit_stats_breakdown(stats)
end
599
+
600
# Parses the text body of the cadvisor metrics endpoint and emits one event
# per sample line that carries a non-empty container_name label.
#
# Each record holds the pod_name, image and namespace labels, the sample
# value (as a Float) and the node name. Lines whose container_name is "POD"
# additionally carry a 'container_name' label and keep the metric name as tag;
# all other lines are tagged "pod" + metric name with "container" removed.
# NOTE(review): that branch assignment is preserved from the original code;
# confirm it is the intended mapping.
#
# Fixes: the container_name label was merged into a discarded copy of the
# record, and a raw Time was emitted where the rest of the plugin emits
# Fluent::EventTime.
def emit_cadvisor_metrics(metrics)
  event_time = Fluent::EventTime.from_time(@scraped_at_cadvisor)
  # Extracts the value of a `name="value"` label from a sample line, or nil.
  label_value = ->(line, name) { line[/#{name}="([^"]*)"/, 1] }

  metrics.split("\n").each do |metric|
    next unless metric.include?('container_name=')
    next if metric.start_with?('#') || metric.include?('container_name=""')

    metric_str, metric_val = metric.split(' ')
    metric_val = metric_val.to_f if metric_val.is_a?(String)
    metric_name = metric_str.split('{', 2).first
    metric_labels = {
      'pod_name' => label_value.call(metric, 'pod_name'),
      'image' => label_value.call(metric, 'image'),
      'namespace' => label_value.call(metric, 'namespace'),
      'value' => metric_val,
      'node' => @node_name
    }

    if metric.include?('container_name="POD"')
      metric_labels['container_name'] = label_value.call(metric, 'container_name')
      tag = generate_tag(metric_name.tr('_', '.'))
    else
      tag = generate_tag("pod#{metric_name.tr('_', '.')}").gsub('container', '')
    end

    router.emit tag, event_time, metric_labels
  end
end
632
+
633
# Timer callback: fetches the kubelet summary API (directly, or through the
# API server proxy for every configured node) and hands each response to
# handle_response.
#
# Request failures are logged instead of raised: Fluentd's timer helper
# detaches a timer whose callback raises, which would stop scraping for good.
# In proxy mode a failing node does not prevent the remaining nodes from
# being scraped.
def scrape_metrics
  if @use_rest_client
    handle_response(RestClient::Request.execute(request_options))
  else
    @node_names.each do |node|
      begin
        handle_response(summary_proxy_api(node).get(@client.headers))
      rescue StandardError => e
        log.error "Failed to scrape metrics from node #{node}, error=#{e.inspect}"
        log.error_backtrace
      end
    end
  end
rescue StandardError => e
  log.error "Failed to scrape metrics, error=#{e.inspect}"
  log.error_backtrace
end
644
+
645
# Timer callback: fetches the kubelet stats API (directly, or through the API
# server proxy for every configured node) and hands each response to
# handle_stats_response. In proxy mode @node_name is set to the node being
# scraped before its request is made.
#
# Request failures are logged instead of raised: Fluentd's timer helper
# detaches a timer whose callback raises, which would stop scraping for good.
# In proxy mode a failing node does not prevent the remaining nodes from
# being scraped.
def scrape_stats_metrics
  if @use_rest_client
    handle_stats_response(RestClient::Request.execute(request_options_stats))
  else
    @node_names.each do |node|
      begin
        @node_name = node
        handle_stats_response(stats_proxy_api(node).get(@client.headers))
      rescue StandardError => e
        log.error "Failed to scrape stats metrics from node #{node}, error=#{e.inspect}"
        log.error_backtrace
      end
    end
  end
rescue StandardError => e
  log.error "Failed to scrape stats metrics, error=#{e.inspect}"
  log.error_backtrace
end
657
+
658
# Timer callback: fetches the cadvisor metrics endpoint (directly, or through
# the API server proxy for every configured node) and hands each response to
# handle_cadvisor_response.
#
# Request failures are logged instead of raised: Fluentd's timer helper
# detaches a timer whose callback raises, which would stop scraping for good.
# In proxy mode a failing node does not prevent the remaining nodes from
# being scraped.
def scrape_cadvisor_metrics
  if @use_rest_client
    handle_cadvisor_response(RestClient::Request.execute(cadvisor_request_options))
  else
    @node_names.each do |node|
      begin
        handle_cadvisor_response(cadvisor_proxy_api(node).get(@client.headers))
      rescue StandardError => e
        log.error "Failed to scrape cadvisor metrics from node #{node}, error=#{e.inspect}"
        log.error_backtrace
      end
    end
  end
rescue StandardError => e
  log.error "Failed to scrape cadvisor metrics, error=#{e.inspect}"
  log.error_backtrace
end
669
+
670
# Handles a response from the kubelet summary API.
#
# On a 2xx status the scrape time is recorded in @scraped_at and the JSON body
# is parsed and emitted; any other status is logged as an error. Exceptions
# raised while parsing or emitting are logged and swallowed so one bad
# response cannot break the scraper.
def handle_response(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_at = Time.now
    emit_metrics MultiJson.load(response.body)
  else
    log.error "Expected 2xx from summary API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  log.error "Failed to scrape metrics, error=#{e.inspect}"
  log.error_backtrace
end
683
+
684
# Handles a response from the kubelet stats API.
#
# On a 2xx status the scrape time is recorded in @scraped_at and the JSON body
# is parsed and emitted; any other status is logged as an error. Exceptions
# raised while parsing or emitting are logged and swallowed so one bad
# response cannot break the scraper.
def handle_stats_response(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_at = Time.now
    emit_stats_metrics MultiJson.load(response.body)
  else
    log.error "Expected 2xx from stats API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  log.error "Failed to scrape metrics, error=#{e.inspect}"
  log.error_backtrace
end
697
+
698
# Handles a response from the cadvisor metrics API.
#
# On a 2xx status the scrape time is recorded in @scraped_at_cadvisor and the
# text body is parsed and emitted; any other status is logged as an error.
# Exceptions raised while emitting are logged and swallowed. The log message
# no longer interpolates $ERROR_INFO, which is only defined once the
# 'English' library is loaded and merely duplicated the rescued exception.
def handle_cadvisor_response(response)
  # Checking response codes only for a successful GET request viz., 2XX codes
  if response.code.between?(200, 299)
    @scraped_at_cadvisor = Time.now
    emit_cadvisor_metrics response.body
  else
    log.error "Expected 2xx from cadvisor metrics API, but got #{response.code}. Response body = #{response.body}"
  end
rescue StandardError => e
  log.error "Failed to scrape metrics, error=#{e.inspect}"
  log.error_backtrace
end
711
+ end
712
+ end
713
+ end