fluent-plugin-k8s-metrics 1.1.1

@@ -0,0 +1,114 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluentd-metrics-conf
  labels:
    app: metrics-to-splunk
data:
  fluent.conf: |
    <system>
      log_level debug
    </system>

    <source>
      @type kubernetes_metrics
      tag kube.*
      insecure_ssl true
      node_name "#{ENV['MY_NODE_NAME']}"
    </source>

    <filter kube.**>
      @type record_modifier
      <record>
        metric_name ${tag}
      </record>
    </filter>

    <filter kube.node.**>
      @type record_modifier
      <record>
        source ${record['node']}
      </record>
    </filter>

    <filter kube.pod.**>
      @type record_modifier
      <record>
        source ${record['node']}/${record['pod-name']}
      </record>
    </filter>

    <filter kube.sys-container.**>
      @type record_modifier
      <record>
        source ${record['node']}/${record['pod-name']}/${record['name']}
      </record>
    </filter>

    <filter kube.container.**>
      @type record_modifier
      <record>
        source ${record['node']}/${record['pod-name']}/${record['container-name']}
      </record>
    </filter>

    <match kube.**>
      @type splunk_hec
      protocol https
      hec_host my.splunk.host
      hec_port 8088
      hec_token my.hec.token
      data_type metric
      metric_name_key metric_name
      metric_value_key value
      host "#{ENV['MY_NODE_NAME']}"
      source_key source
      insecure_ssl true
      interval 15s
      <buffer>
        @type memory
        total_limit_size 100m
        chunk_limit_size 10m
        flush_interval 15s
        flush_thread_count 1
        overflow_action block
        retry_max_times 3
      </buffer>
    </match>

---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: metrics-to-splunk
  labels:
    app: metrics-to-splunk
    engine: fluentd
spec:
  template:
    metadata:
      labels:
        app: metrics-to-splunk
        engine: fluentd
      annotations:
    spec:
      containers:
      - name: fluentd
        image: splunk/connect-for-kubernetes:v1.0.0-beta
        imagePullPolicy: Never
        env:
        - name: MY_NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        resources:
          requests:
            cpu: 200m
            memory: 200Mi
        volumeMounts:
        - name: conf-configmap
          mountPath: /fluentd/etc
      volumes:
      - name: conf-configmap
        configMap:
          name: fluentd-metrics-conf
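To make the filter chain above concrete, here is a rough Ruby sketch (not part of the gem; the helper name is made up) of what the record_modifier filters do to a node-level event before splunk_hec reads metric_name, value, and source from it:

# Illustrative only: mirrors the <filter kube.**> and <filter kube.node.**> blocks above.
def shape_node_record(tag, record)
  record.merge(
    'metric_name' => tag,            # <filter kube.**> copies the tag into metric_name
    'source'      => record['node']  # <filter kube.node.**> uses the node name as source
  )
end

shape_node_record('kube.node.cpu.usage_rate', 'node' => 'worker-1', 'value' => 250)
# => {"node"=>"worker-1", "value"=>250, "metric_name"=>"kube.node.cpu.usage_rate", "source"=>"worker-1"}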
@@ -0,0 +1,29 @@
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

Gem::Specification.new do |spec|
  spec.name          = 'fluent-plugin-k8s-metrics'
  spec.version       = File.read('VERSION')
  spec.authors       = ['Splunk Inc.']
  spec.email         = ['DataEdge@splunk.com']
  spec.summary       = 'A fluentd input plugin that collects kubernetes cluster metrics.'
  spec.description   = 'A fluentd input plugin that collects node and container metrics from a kubernetes cluster.'
  spec.homepage      = 'https://github.com/splunk/fluent-plugin-kubernetes-metrics'
  spec.license       = 'Apache-2.0'
  test_files, files  = `git ls-files -z`.split("\x0").partition do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.files         = files
  spec.executables   = files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = test_files
  spec.require_paths = ['lib']
  spec.add_development_dependency 'bundler', '~> 2.0.0'
  spec.add_development_dependency 'rake', '~> 12.3.2'
  spec.add_development_dependency 'simplecov', '~> 0.16.1'
  spec.add_development_dependency 'test-unit', '~> 3.3.0'
  spec.add_development_dependency 'webmock', '~> 3.5.1'
  spec.add_runtime_dependency 'fluentd', '~> 1.4.0'
  spec.add_runtime_dependency 'kubeclient', '~> 4.2.2'
  spec.add_runtime_dependency 'multi_json', '~> 1.13.1'
  spec.add_runtime_dependency 'oj', '~> 3.7.9'
end
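The gemspec only declares the input plugin's own dependencies; the splunk_hec output referenced in the ConfigMap ships separately. A minimal Gemfile for an image that bundles both might look like the sketch below (gem list and version pins are illustrative, not taken from this release):

source 'https://rubygems.org'

gem 'fluentd', '~> 1.4'
gem 'fluent-plugin-k8s-metrics'   # this gem: provides the kubernetes_metrics input
gem 'fluent-plugin-splunk-hec'    # provides the splunk_hec output used in fluent.conf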
@@ -0,0 +1,689 @@
#
# Copyright 2018- Splunk Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'time'
require 'fluent/plugin/input'
require 'kubeclient'
require 'multi_json'
require 'rest-client' # RestClient is called directly below; it is pulled in by kubeclient

module Fluent
  module Plugin
    class KubernetesMetricsInput < Fluent::Plugin::Input
      Fluent::Plugin.register_input('kubernetes_metrics', self)

26
      helpers :timer

      desc 'The tag of the event.'
      config_param :tag, :string, default: 'kubernetes.metrics.*'

      desc 'How often it pulls metrics.'
      config_param :interval, :time, default: '15s'

      desc 'Path to a kubeconfig file that points to the cluster the plugin should collect metrics from. Mostly useful when running fluentd outside of the cluster. When `kubeconfig` is set, `kubernetes_url`, `client_cert`, `client_key`, `ca_file`, `insecure_ssl`, `bearer_token_file`, and `secret_dir` will all be ignored.'
      config_param :kubeconfig, :string, default: nil

      desc 'URL of the kubernetes API server.'
      config_param :kubernetes_url, :string, default: nil

      desc 'Path to the certificate file for this client.'
      config_param :client_cert, :string, default: nil

      desc 'Path to the private key file for this client.'
      config_param :client_key, :string, default: nil

      desc 'Path to the CA file.'
      config_param :ca_file, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'

      desc "If `insecure_ssl` is set to `true`, the apiserver's certificate will not be verified."
      config_param :insecure_ssl, :bool, default: false

      desc 'Path to the file that contains the API token. By default it reads from the file "token" in the `secret_dir`.'
      config_param :bearer_token_file, :string, default: nil

      desc "Path to the directory where the pod's service account credentials are stored."
      config_param :secret_dir, :string, default: '/var/run/secrets/kubernetes.io/serviceaccount'

      desc 'Name of the node that this plugin should collect metrics from.'
      config_param :node_name, :string, default: nil

      desc 'Names of the nodes that this plugin should collect metrics from.'
      config_param :node_names, :array, default: [], value_type: :string

      desc 'The hostname or IP address to use when connecting to the kubelet. If not supplied, the status.hostIP of the node, exposed through the KUBERNETES_NODE_IP environment variable, is used to reach the kubelet API.'
      config_param :kubelet_address, :string, default: "#{ENV['KUBERNETES_NODE_IP']}"

      desc 'The port that the kubelet is listening on.'
      config_param :kubelet_port, :integer, default: 10_250

      desc 'Use the rest client to get the metrics from the summary API on each kubelet.'
      config_param :use_rest_client, :bool, default: true

      desc 'Use SSL for the rest client.'
      config_param :use_rest_client_ssl, :bool, default: true

76
      def configure(conf)
        super

        if @use_rest_client
          raise Fluent::ConfigError, 'node_name is required' if @node_name.nil? || @node_name.empty?
        else
          raise Fluent::ConfigError, 'node_names is required' if @node_names.nil? || @node_names.empty?
        end

        parse_tag
        initialize_client
      end

      def start
        super

        timer_execute :metric_scraper, @interval, &method(:scrape_metrics)
        timer_execute :stats_metric_scraper, @interval, &method(:scrape_stats_metrics)
        timer_execute :cadvisor_metric_scraper, @interval, &method(:scrape_cadvisor_metrics)
      end

      def close
        @watchers.each &:finish if @watchers

        super
      end

      private

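      # The configured tag may contain a single '*' wildcard. parse_tag splits the
      # tag around it, and generate_tag splices the metric name into that position,
      # e.g. with the default tag 'kubernetes.metrics.*' an item name of
      # 'node.cpu.usage_rate' is emitted as 'kubernetes.metrics.node.cpu.usage_rate'.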
105
      def parse_tag
        @tag_prefix, @tag_suffix = @tag.split('*') if @tag.include?('*')
      end

      def generate_tag(item_name)
        return @tag unless @tag_prefix

        [@tag_prefix, item_name, @tag_suffix].join
      end

      def init_with_kubeconfig(options = {})
        config = Kubeclient::Config.read @kubeconfig
        current_context = config.context

        @client = Kubeclient::Client.new(
          current_context.api_endpoint,
          current_context.api_version,
          options.merge(
            ssl_options: current_context.ssl_options,
            auth_options: current_context.auth_options
          )
        )
      end

      def init_without_kubeconfig(_options = {})
        # mostly borrowed from Fluentd Kubernetes Metadata Filter Plugin
        if @kubernetes_url.nil?
          # Use Kubernetes default service account if we're in a pod.
          env_host = ENV['KUBERNETES_SERVICE_HOST']
          env_port = ENV['KUBERNETES_SERVICE_PORT']
          if env_host && env_port
            @kubernetes_url = "https://#{env_host}:#{env_port}/api/"
          end
        end

        raise Fluent::ConfigError, 'kubernetes url is not set' unless @kubernetes_url

        # Use SSL certificate and bearer token from Kubernetes service account.
        if Dir.exist?(@secret_dir)
          secret_ca_file = File.join(@secret_dir, 'ca.crt')
          secret_token_file = File.join(@secret_dir, 'token')

          if @ca_file.nil? && File.exist?(secret_ca_file)
            @ca_file = secret_ca_file
          end

          if @bearer_token_file.nil? && File.exist?(secret_token_file)
            @bearer_token_file = secret_token_file
          end
        end

        ssl_options = {
          client_cert: @client_cert && OpenSSL::X509::Certificate.new(File.read(@client_cert)),
          client_key: @client_key && OpenSSL::PKey::RSA.new(File.read(@client_key)),
          ca_file: @ca_file,
          verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
        }

        auth_options = {}
        auth_options[:bearer_token] = File.read(@bearer_token_file) if @bearer_token_file

        @client = Kubeclient::Client.new(
          @kubernetes_url, 'v1',
          ssl_options: ssl_options,
          auth_options: auth_options
        )

        begin
          @client.api_valid?
        rescue KubeException => kube_error
          raise Fluent::ConfigError, "Invalid Kubernetes API #{@api_version} endpoint #{@kubernetes_url}: #{kube_error.message}"
        end
      end

      def initialize_client
        if @use_rest_client
          initialize_rest_client
        else
          options = {
            timeouts: {
              open: 10,
              read: nil
            }
          }

          if @kubeconfig.nil?
            init_without_kubeconfig options
          else
            init_with_kubeconfig options
          end
        end
      end

      def initialize_rest_client
        env_host = @kubelet_address
        env_port = @kubelet_port

        if env_host && env_port
          if @use_rest_client_ssl
            @kubelet_url = "https://#{env_host}:#{env_port}/stats/summary"
            @kubelet_url_stats = "https://#{env_host}:#{env_port}/stats/"
            @cadvisor_url = "https://#{env_host}:#{env_port}/metrics/cadvisor"
          else
            @kubelet_url = "http://#{env_host}:#{env_port}/stats/summary"
            @kubelet_url_stats = "http://#{env_host}:#{env_port}/stats/"
            @cadvisor_url = "http://#{env_host}:#{env_port}/metrics/cadvisor"
          end
        end

        if Dir.exist?(@secret_dir)
          secret_ca_file = File.join(@secret_dir, 'ca.crt')
          secret_token_file = File.join(@secret_dir, 'token')
          if @ca_file.nil? && File.exist?(secret_ca_file)
            @ca_file = secret_ca_file
          end
          if @bearer_token_file.nil? && File.exist?(secret_token_file)
            @bearer_token_file = secret_token_file
          end
        end
        log.info("Use URL #{@kubelet_url} for creating client to query kubelet summary api")
        log.info("Use URL #{@kubelet_url_stats} for creating client to query kubelet stats api")
        log.info("Use URL #{@cadvisor_url} for creating client to query cadvisor metrics api")
      end

      def set_ssl_options
        if @use_rest_client_ssl
          ssl_options = {
            ssl_ca_file: @ca_file,
            verify_ssl: @insecure_ssl ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER,
            headers: { Authorization: 'Bearer ' + File.read(@bearer_token_file) }
          }
        else
          ssl_options = {}
        end
        ssl_options
      end

      # This method is used to set the options for sending a request to the kubelet api
      def request_options
        options = { method: 'get', url: @kubelet_url }
        options = options.merge(set_ssl_options)
        options
      end

      # This method is used to set the options for sending a request to the stats api
      def request_options_stats
        options = { method: 'get', url: @kubelet_url_stats }
        options = options.merge(set_ssl_options)
        options
      end

      # This method is used to set the options for sending a request to the cadvisor api
      def cadvisor_request_options
        options = { method: 'get', url: @cadvisor_url }
        options = options.merge(set_ssl_options)
        options
      end

      # @client.proxy_url only returns the url, but we need the resource, not just the url
      def summary_proxy_api(node)
        @summary_api =
          begin
            @client.discover unless @client.discovered
            @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/stats/summary"].tap do |endpoint|
              log.info("Use URL #{endpoint.url} for scraping metrics")
            end
          end
      end

      def stats_proxy_api(node)
        @stats_api =
          begin
            @client.discover unless @client.discovered
            @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/stats/"].tap do |endpoint|
              log.info("Use URL #{endpoint.url} for scraping stats metrics")
            end
          end
      end

      def cadvisor_proxy_api(node)
        @cadvisor_api =
          begin
            @client.discover unless @client.discovered
            @client.rest_client["/nodes/#{node}:#{@kubelet_port}/proxy/metrics/cadvisor"].tap do |endpoint|
              log.info("Use URL #{endpoint.url} for scraping metrics")
            end
          end
      end

      def parse_time(metric_time)
        Fluent::EventTime.from_time Time.iso8601(metric_time)
      end

      def underscore(camelcase)
        camelcase.gsub(/[A-Z]/) { |c| "_#{c.downcase}" }
      end

      def emit_uptime(tag:, start_time:, labels:)
        uptime = @scraped_at - Time.iso8601(start_time)
        router.emit generate_tag("#{tag}.uptime"), Fluent::EventTime.from_time(@scraped_at), labels.merge('value' => uptime)
      end

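      # The summary API reports CPU in nanocores; dividing by 1,000,000 converts the
      # usage rate to millicores (1 core = 1,000,000,000 nanocores = 1,000 millicores).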
      def emit_cpu_metrics(tag:, metrics:, labels:)
        time = parse_time metrics['time']
        if usage_rate = metrics['usageNanoCores']
          router.emit generate_tag("#{tag}.cpu.usage_rate"), time, labels.merge('value' => usage_rate / 1_000_000)
        end
        if usage = metrics['usageNanoCores']
          router.emit generate_tag("#{tag}.cpu.usage"), time, labels.merge('value' => usage)
        end
      end

      def emit_memory_metrics(tag:, metrics:, labels:)
        time = parse_time metrics['time']
        %w[availableBytes usageBytes workingSetBytes rssBytes pageFaults majorPageFaults].each do |name|
          if value = metrics[name]
            router.emit generate_tag("#{tag}.memory.#{underscore name}"), time, labels.merge('value' => value)
          end
        end
      end

      def emit_network_metrics(tag:, metrics:, labels:)
        time = parse_time metrics['time']
        Array(metrics['interfaces']).each do |it|
          it_name = it['name']
          %w[rxBytes rxErrors txBytes txErrors].each do |metric_name|
            if value = it[metric_name]
              router.emit generate_tag("#{tag}.network.#{underscore metric_name}"), time, labels.merge('value' => value, 'interface' => it_name)
            end
          end
        end
      end

      def emit_fs_metrics(tag:, metrics:, labels:)
        time = parse_time metrics['time']
        %w[availableBytes capacityBytes usedBytes inodesFree inodes inodesUsed].each do |metric_name|
          if value = metrics[metric_name]
            router.emit generate_tag("#{tag}.#{underscore metric_name}"), time, labels.merge('value' => value)
          end
        end
      end

      def emit_node_rlimit_metrics(node_name, rlimit)
        time = parse_time rlimit['time']
        %w[maxpid curproc].each do |metric_name|
          next unless value = rlimit[metric_name]

          router.emit(generate_tag("node.runtime.imagefs.#{metric_name}"), time,
                      'value' => value,
                      'node' => node_name)
        end
      end

      def emit_system_container_metrics(node_name, container)
        tag = 'sys-container'
        labels = { 'node' => node_name, 'name' => container['name'] }
        emit_uptime tag: tag, start_time: container['startTime'], labels: labels
        emit_cpu_metrics tag: tag, metrics: container['cpu'], labels: labels
        emit_memory_metrics tag: tag, metrics: container['memory'], labels: labels
      end

      def emit_stats_breakdown(stats)
        stats_latest = stats[-1]
        tag = 'node'
        labels = { 'node' => @node_name }
        stats_timestamp = parse_time stats_latest['timestamp']
        unless stats_latest['cpu'].nil?
          emit_cpu_metrics_stats tag: tag, metrics: stats_latest['cpu'], labels: labels, time: stats_timestamp
        end

        unless stats_latest['diskio'].nil?
          emit_diskio_metrics_stats tag: tag, metrics: stats_latest['diskio'], labels: labels, time: stats_timestamp
        end

        unless stats_latest['memory'].nil?
          emit_memory_metrics_stats tag: tag, metrics: stats_latest['memory'], labels: labels, time: stats_timestamp
        end

        unless stats_latest['network'].nil?
          emit_network_metrics_stats tag: tag, metrics: stats_latest['network'], labels: labels, time: stats_timestamp
        end

        unless stats_latest['filesystem'].nil?
          emit_filesystem_metrics_stats tag: tag, metrics: stats_latest['filesystem'], labels: labels, time: stats_timestamp
        end

        unless stats_latest['task_stats'].nil?
          emit_tasks_stats_metrics_stats tag: tag, metrics: stats_latest['task_stats'], labels: labels, time: stats_timestamp
        end
      end

      def emit_cpu_metrics_stats(tag:, metrics:, labels:, time:)
        if cpu_usage_total = metrics['usage']['total']
          router.emit generate_tag("#{tag}.cpu.usage.total"), time, labels.merge('value' => cpu_usage_total / 1_000_000)
        end
        if cpu_usage_user = metrics['usage']['user']
          router.emit generate_tag("#{tag}.cpu.usage.user"), time, labels.merge('value' => cpu_usage_user / 1_000_000)
        end
        if cpu_usage_system = metrics['usage']['system']
          router.emit generate_tag("#{tag}.cpu.usage.system"), time, labels.merge('value' => cpu_usage_system / 1_000_000)
        end

        if cpu_cfs_periods = metrics['cfs']['periods']
          router.emit generate_tag("#{tag}.cpu.cfs.periods"), time, labels.merge('value' => cpu_cfs_periods)
        end
        if cpu_cfs_throttled_periods = metrics['cfs']['throttled_periods']
          router.emit generate_tag("#{tag}.cpu.cfs.throttled_periods"), time, labels.merge('value' => cpu_cfs_throttled_periods)
        end
        if cpu_cfs_throttled_time = metrics['cfs']['throttled_time']
          router.emit generate_tag("#{tag}.cpu.cfs.throttled_time"), time, labels.merge('value' => cpu_cfs_throttled_time)
        end
        if cpu_load_average = metrics['load_average']
          router.emit generate_tag("#{tag}.cpu.load_average"), time, labels.merge('value' => cpu_load_average)
        end
      end

      def emit_diskio_metrics_stats(tag:, metrics:, labels:, time:)
        %w[io_service_bytes io_serviced io_queued sectors io_service_time io_wait_time io_merged io_time].each do |metric_name|
          next unless current_io_metric = metrics[metric_name]

          current_io_metric.each do |device|
            if diskio_io_service_bytes_major = device['major']
              router.emit generate_tag("#{tag}.diskio.".concat(metric_name).concat('.major')), time, labels.merge('device' => device['device'], 'value' => diskio_io_service_bytes_major)
            end
            if diskio_io_service_bytes_minor = device['minor']
              router.emit generate_tag("#{tag}.diskio.".concat(metric_name).concat('.minor')), time, labels.merge('device' => device['device'], 'value' => diskio_io_service_bytes_minor)
            end
            device_stats = device['stats']
            device_stats.each do |device_stat|
              device_key, device_value = device_stat
              router.emit generate_tag("#{tag}.diskio.".concat(metric_name).concat('.stats.').concat(device_key)), time, labels.merge('device' => device['device'], 'value' => device_value)
            end
          end
        end
      end

      def emit_memory_metrics_stats(tag:, metrics:, labels:, time:)
        %w[usage max_usage cache rss swap working_set failcnt].each do |metric_name|
          if current_memory_metric = metrics[metric_name]
            router.emit generate_tag("#{tag}.memory.".concat(metric_name)), time, labels.merge('value' => current_memory_metric)
          end
        end
        %w[container_data hierarchical_data].each do |metric_name_group|
          next unless current_memory_metric_group = metrics[metric_name_group]

          current_memory_metric_group.each do |metric_name|
            metric_key, metric_value = metric_name
            router.emit generate_tag("#{tag}.memory.".concat(metric_name_group).concat('.').concat(metric_key)), time, labels.merge('value' => metric_value)
          end
        end
      end

      def emit_network_metrics_stats(tag:, metrics:, labels:, time:)
        network_name = metrics['name']
        %w[rx_bytes rx_packets rx_errors rx_dropped tx_bytes tx_packets tx_errors tx_dropped].each do |metric_name|
          if current_network_metric = metrics[metric_name]
            router.emit generate_tag("#{tag}.network.".concat(network_name).concat('.').concat(metric_name)), time, labels.merge('value' => current_network_metric)
          end
        end

        if network_interfaces = metrics['interfaces']
          network_interfaces.each do |current_interface|
            name = current_interface['name']
            %w[rx_bytes rx_packets rx_errors rx_dropped tx_bytes tx_packets tx_errors tx_dropped].each do |current_metric|
              if metric_value = current_interface[current_metric]
                router.emit generate_tag("#{tag}.network.".concat(name).concat('.').concat(current_metric)), time, labels.merge('value' => metric_value)
              end
            end
          end
        end

        %w[tcp tcp6 udp udp6].each do |metric_name_group|
          next unless metric_group = metrics[metric_name_group]

          metric_group.each do |current_metric|
            metric_key, metric_value = current_metric
            router.emit generate_tag("#{tag}.network.".concat(metric_name_group).concat('.').concat(metric_key)), time, labels.merge('value' => metric_value)
          end
        end
      end

      def emit_filesystem_metrics_stats(tag:, metrics:, labels:, time:)
        metrics.each do |file_system|
          device = file_system['device']
          type = file_system['type']
          file_system.each do |file_metric|
            file_key, file_value = file_metric
            unless %w[device type has_inodes].include? file_key
              router.emit generate_tag("#{tag}.filesystem.".concat(file_key)), time, labels.merge('device' => device, 'type' => type, 'value' => file_value)
            end
          end
        end
      end

      def emit_tasks_stats_metrics_stats(tag:, metrics:, labels:, time:)
        metrics.each do |task_stats|
          task_key, task_value = task_stats
          router.emit generate_tag("#{tag}.tasks_stats.".concat(task_key)), time, labels.merge('value' => task_value)
        end
      end

      def emit_node_metrics(node)
        node_name = node['nodeName']
        tag = 'node'
        labels = { 'node' => node_name }

        unless node['startTime'].nil?
          emit_uptime tag: tag, start_time: node['startTime'], labels: labels
        end
        unless node['cpu'].nil?
          emit_cpu_metrics tag: tag, metrics: node['cpu'], labels: labels
        end
        unless node['memory'].nil?
          emit_memory_metrics tag: tag, metrics: node['memory'], labels: labels
        end
        unless node['network'].nil?
          emit_network_metrics tag: tag, metrics: node['network'], labels: labels
        end
        unless node['fs'].nil?
          emit_fs_metrics tag: "#{tag}.fs", metrics: node['fs'], labels: labels
        end
        unless node['runtime']['imageFs'].nil?
          emit_fs_metrics tag: "#{tag}.imagefs", metrics: node['runtime']['imageFs'], labels: labels
        end
        unless node['rlimit'].nil?
          emit_node_rlimit_metrics node_name, node['rlimit']
        end
        unless node['systemContainers'].nil?
          node['systemContainers'].each do |c|
            emit_system_container_metrics node_name, c
          end
        end
      end

      def emit_container_metrics(pod_labels, container)
        tag = 'container'
        labels = pod_labels.merge 'container-name' => container['name']
        emit_uptime tag: tag, start_time: container['startTime'], labels: labels
        emit_cpu_metrics tag: tag, metrics: container['cpu'], labels: labels
        emit_memory_metrics tag: tag, metrics: container['memory'], labels: labels
        emit_fs_metrics tag: "#{tag}.rootfs", metrics: container['rootfs'], labels: labels
        emit_fs_metrics tag: "#{tag}.logs", metrics: container['logs'], labels: labels
      end

      def emit_pod_metrics(node_name, pod)
        tag = 'pod'
        # Prefix every podRef key, e.g. 'name' => 'pod-name', 'namespace' => 'pod-namespace'.
        labels = pod['podRef'].transform_keys &'pod-'.method(:+)
        labels['node'] = node_name

        emit_uptime tag: tag, start_time: pod['startTime'], labels: labels
        emit_cpu_metrics tag: tag, metrics: pod['cpu'], labels: labels if pod['cpu']
        emit_memory_metrics tag: tag, metrics: pod['memory'], labels: labels if pod['memory']
        emit_network_metrics tag: tag, metrics: pod['network'], labels: labels
        emit_fs_metrics tag: "#{tag}.ephemeral-storage", metrics: pod['ephemeral-storage'], labels: labels
        Array(pod['volume']).each do |volume|
          emit_fs_metrics tag: "#{tag}.volume", metrics: volume, labels: labels.merge('name' => volume['name'])
        end
        Array(pod['containers']).each do |container|
          emit_container_metrics labels, container
        end
      end

      def emit_metrics(metrics)
        emit_node_metrics(metrics['node']) unless metrics['node'].nil?
        Array(metrics['pods']).each &method(:emit_pod_metrics).curry.call(metrics['node']['nodeName']) unless metrics['pods'].nil?
      end

      def emit_stats_metrics(metrics)
        emit_stats_breakdown(metrics['stats']) unless metrics['stats'].nil?
      end

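      # /metrics/cadvisor returns the Prometheus text exposition format, one sample per
      # line: metric_name{label="value",...} 123.0. Only samples that carry a non-empty
      # container_name label are emitted; underscores in the metric name are rewritten
      # to dots so the name fits the dotted tag hierarchy used above.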
      def emit_cadvisor_metrics(metrics)
        metrics = metrics.split("\n")
        metrics.each do |metric|
          next unless metric.include? 'container_name='

          next unless metric.match(/^((?!container_name="").)*$/) && metric[0] != '#'

          metric_str, metric_val = metric.split(' ')
          metric_val = metric_val.to_f if metric_val.is_a? String
          first_occur = metric_str.index('{')
          metric_name = metric_str[0..first_occur - 1]
          pod_name = metric.match(/pod_name="\S*"/).to_s
          pod_name = pod_name.split('"')[1]
          image_name = metric.match(/image="\S*"/).to_s
          image_name = image_name.split('"')[1]
          namespace = metric.match(/namespace="\S*"/).to_s
          namespace = namespace.split('"')[1]
          metric_labels = { 'pod_name' => pod_name, 'image' => image_name, 'namespace' => namespace, 'value' => metric_val, 'node' => @node_name }
          if metric =~ /^((?!container_name="POD").)*$/
            tag = 'pod'
            tag = generate_tag("#{tag}#{metric_name.tr('_', '.')}")
            tag = tag.gsub('container', '')
          else
            container_name = metric.match(/container_name="\S*"/).to_s
            container_name = container_name.split('"')[1]
            container_label = { 'container_name' => container_name }
            metric_labels.merge!(container_label)
            tag = generate_tag(metric_name.tr('_', '.').to_s)
          end
          router.emit tag, @scraped_at_cadvisor, metric_labels
        end
      end

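      # Timer callbacks. In rest-client mode each scraper queries this node's kubelet
      # directly; otherwise it iterates node_names and goes through the API server's
      # node proxy endpoints set up above.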
      def scrape_metrics
        if @use_rest_client
          response = RestClient::Request.execute request_options
          handle_response(response)
        else
          @node_names.each do |node|
            response = summary_proxy_api(node).get(@client.headers)
            handle_response(response)
          end
        end
      end

      def scrape_stats_metrics
        if @use_rest_client
          response_stats = RestClient::Request.execute request_options_stats
          handle_stats_response(response_stats)
        else
          @node_names.each do |node|
            @node_name = node
            response_stats = stats_proxy_api(node).get(@client.headers)
            handle_stats_response(response_stats)
          end
        end
      end

      def scrape_cadvisor_metrics
        if @use_rest_client
          response_cadvisor = RestClient::Request.execute cadvisor_request_options
          handle_cadvisor_response(response_cadvisor)
        else
          @node_names.each do |node|
            response_cadvisor = cadvisor_proxy_api(node).get(@client.headers)
            handle_cadvisor_response(response_cadvisor)
          end
        end
      end

      # This method is used to handle responses from the kubelet summary api
      def handle_response(response)
        # Only 2xx response codes indicate a successful GET request
        if (response.code < 300) && (response.code > 199)
          @scraped_at = Time.now
          emit_metrics MultiJson.load(response.body)
        else
          log.error "Expected 2xx from summary API, but got #{response.code}. Response body = #{response.body}"
        end
      rescue StandardError => error
        log.error "Failed to scrape metrics, error=#{error.inspect}"
        log.error_backtrace
      end

      # This method is used to handle responses from the kubelet stats api
      def handle_stats_response(response)
        # Only 2xx response codes indicate a successful GET request
        if (response.code < 300) && (response.code > 199)
          @scraped_at = Time.now
          emit_stats_metrics MultiJson.load(response.body)
        else
          log.error "Expected 2xx from stats API, but got #{response.code}. Response body = #{response.body}"
        end
      rescue StandardError => error
        log.error "Failed to scrape metrics, error=#{error.inspect}"
        log.error_backtrace
      end

      # This method is used to handle responses from the cadvisor api
      def handle_cadvisor_response(response)
        # Only 2xx response codes indicate a successful GET request
        if (response.code < 300) && (response.code > 199)
          @scraped_at_cadvisor = Time.now
          emit_cadvisor_metrics response.body
        else
          log.error "Expected 2xx from cadvisor metrics API, but got #{response.code}. Response body = #{response.body}"
        end
      rescue StandardError => e
        log.error "Failed to scrape metrics, error=#{e.inspect}"
        log.error_backtrace
      end
    end
  end
end
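A quick way to exercise the input outside a cluster is Fluentd's bundled test driver. The sketch below is illustrative only: it assumes the plugin file is loadable as fluent/plugin/in_kubernetes_metrics, that a kubelet is reachable at the placeholder address, and that a service account token and CA exist at the default secret_dir; it is not part of the gem or its test suite.

require 'fluent/test'
require 'fluent/test/driver/input'
require 'fluent/plugin/in_kubernetes_metrics' # assumed require path for this plugin

Fluent::Test.setup

# All values below are placeholders for a locally reachable kubelet.
conf = %(
  tag kube.*
  node_name my-node
  kubelet_address 127.0.0.1
  kubelet_port 10250
  insecure_ssl true
  interval 5s
)

driver = Fluent::Test::Driver::Input.new(Fluent::Plugin::KubernetesMetricsInput)
driver.configure(conf)

# Run long enough for at least one scrape, then inspect what was emitted.
driver.run(timeout: 15, expect_emits: 1)
driver.events.each do |tag, _time, record|
  puts "#{tag}: #{record.inspect}"
end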