fluent-plugin-kubernetes_metadata_filter 2.1.4 → 2.9.4

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +57 -0
  3. data/.gitignore +0 -1
  4. data/.rubocop.yml +57 -0
  5. data/Gemfile +4 -2
  6. data/Gemfile.lock +158 -0
  7. data/README.md +48 -28
  8. data/Rakefile +15 -11
  9. data/fluent-plugin-kubernetes_metadata_filter.gemspec +25 -28
  10. data/lib/fluent/plugin/filter_kubernetes_metadata.rb +185 -131
  11. data/lib/fluent/plugin/kubernetes_metadata_cache_strategy.rb +27 -20
  12. data/lib/fluent/plugin/kubernetes_metadata_common.rb +59 -33
  13. data/lib/fluent/plugin/kubernetes_metadata_stats.rb +6 -6
  14. data/lib/fluent/plugin/kubernetes_metadata_test_api_adapter.rb +68 -0
  15. data/lib/fluent/plugin/kubernetes_metadata_util.rb +53 -0
  16. data/lib/fluent/plugin/kubernetes_metadata_watch_namespaces.rb +121 -27
  17. data/lib/fluent/plugin/kubernetes_metadata_watch_pods.rb +138 -29
  18. data/release_notes.md +42 -0
  19. data/test/cassettes/kubernetes_docker_metadata_annotations.yml +0 -34
  20. data/test/cassettes/{kubernetes_docker_metadata_dotted_labels.yml → kubernetes_docker_metadata_dotted_slashed_labels.yml} +0 -34
  21. data/test/cassettes/kubernetes_get_api_v1.yml +193 -0
  22. data/test/cassettes/kubernetes_get_api_v1_using_token.yml +195 -0
  23. data/test/cassettes/kubernetes_get_namespace_default.yml +69 -0
  24. data/test/cassettes/kubernetes_get_namespace_default_using_token.yml +71 -0
  25. data/test/cassettes/{kubernetes_docker_metadata.yml → kubernetes_get_pod.yml} +0 -82
  26. data/test/cassettes/{metadata_with_namespace_id.yml → kubernetes_get_pod_container_init.yml} +3 -134
  27. data/test/cassettes/{kubernetes_docker_metadata_using_bearer_token.yml → kubernetes_get_pod_using_token.yml} +5 -105
  28. data/test/cassettes/metadata_from_tag_and_journald_fields.yml +0 -255
  29. data/test/cassettes/metadata_from_tag_journald_and_kubernetes_fields.yml +0 -255
  30. data/test/cassettes/{non_kubernetes_docker_metadata.yml → valid_kubernetes_api_server_using_token.yml} +4 -44
  31. data/test/helper.rb +20 -2
  32. data/test/plugin/test_cache_stats.rb +10 -13
  33. data/test/plugin/test_cache_strategy.rb +158 -160
  34. data/test/plugin/test_filter_kubernetes_metadata.rb +480 -320
  35. data/test/plugin/test_utils.rb +56 -0
  36. data/test/plugin/test_watch_namespaces.rb +209 -55
  37. data/test/plugin/test_watch_pods.rb +302 -103
  38. data/test/plugin/watch_test.rb +52 -33
  39. metadata +69 -72
  40. data/circle.yml +0 -17
data/lib/fluent/plugin/kubernetes_metadata_common.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #
 # Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
 # Kubernetes metadata
@@ -18,6 +20,11 @@
 #
 module KubernetesMetadata
   module Common
+    class GoneError < StandardError
+      def initialize(msg = '410 Gone')
+        super
+      end
+    end
 
     def match_annotations(annotations)
       result = {}
@@ -32,61 +39,81 @@ module KubernetesMetadata
     end
 
     def parse_namespace_metadata(namespace_object)
-      labels = syms_to_strs(namespace_object['metadata']['labels'].to_h)
-      annotations = match_annotations(syms_to_strs(namespace_object['metadata']['annotations'].to_h))
+      labels = ''
+      labels = syms_to_strs(namespace_object[:metadata][:labels].to_h) unless @skip_labels
+
+      annotations = match_annotations(syms_to_strs(namespace_object[:metadata][:annotations].to_h))
       if @de_dot
-        self.de_dot!(labels)
-        self.de_dot!(annotations)
+        de_dot!(labels) unless @skip_labels
+        de_dot!(annotations)
+      end
+      if @de_slash
+        de_slash!(labels) unless @skip_labels
+        de_slash!(annotations)
       end
       kubernetes_metadata = {
-        'namespace_id' => namespace_object['metadata']['uid'],
-        'creation_timestamp' => namespace_object['metadata']['creationTimestamp']
+        'namespace_id' => namespace_object[:metadata][:uid],
+        'creation_timestamp' => namespace_object[:metadata][:creationTimestamp]
       }
       kubernetes_metadata['namespace_labels'] = labels unless labels.empty?
       kubernetes_metadata['namespace_annotations'] = annotations unless annotations.empty?
-      return kubernetes_metadata
+      kubernetes_metadata
     end
 
     def parse_pod_metadata(pod_object)
-      labels = syms_to_strs(pod_object['metadata']['labels'].to_h)
-      annotations = match_annotations(syms_to_strs(pod_object['metadata']['annotations'].to_h))
+      labels = ''
+      labels = syms_to_strs(pod_object[:metadata][:labels].to_h) unless @skip_labels
+
+      annotations = match_annotations(syms_to_strs(pod_object[:metadata][:annotations].to_h))
       if @de_dot
-        self.de_dot!(labels)
-        self.de_dot!(annotations)
+        de_dot!(labels) unless @skip_labels
+        de_dot!(annotations)
+      end
+      if @de_slash
+        de_slash!(labels) unless @skip_labels
+        de_slash!(annotations)
       end
 
-      # collect container informations
+      # collect container information
       container_meta = {}
       begin
-        pod_object['status']['containerStatuses'].each do|container_status|
-          # get plain container id (eg. docker://hash -> hash)
-          container_id = container_status['containerID'].sub /^[-_a-zA-Z0-9]+:\/\//, ''
-          container_meta[container_id] = {
-            'name' => container_status['name'],
-            'image' => container_status['image'],
-            'image_id' => container_status['imageID']
-          }
-        end
-      rescue
-        log.debug("parsing container meta information failed for: #{pod_object['metadata']['namespace']}/#{pod_object['metadata']['name']} ")
+        pod_object[:status][:containerStatuses].each do |container_status|
+          container_id = (container_status[:containerID]||"").sub(%r{^[-_a-zA-Z0-9]+://}, '')
+          key = container_status[:name]
+          container_meta[key] = if @skip_container_metadata
+                                  {
+                                    'name' => container_status[:name]
+                                  }
+                                else
+                                  {
+                                    'name' => container_status[:name],
+                                    'image' => container_status[:image],
+                                    'image_id' => container_status[:imageID],
+                                    :containerID => container_id
+                                  }
+                                end
+        end if pod_object[:status] && pod_object[:status][:containerStatuses]
+      rescue StandardError=>e
+        log.warn("parsing container meta information failed for: #{pod_object[:metadata][:namespace]}/#{pod_object[:metadata][:name]}: #{e}")
      end
 
       kubernetes_metadata = {
-        'namespace_name' => pod_object['metadata']['namespace'],
-        'pod_id' => pod_object['metadata']['uid'],
-        'pod_name' => pod_object['metadata']['name'],
-        'containers' => syms_to_strs(container_meta),
-        'labels' => labels,
-        'host' => pod_object['spec']['nodeName'],
-        'master_url' => @kubernetes_url
+        'namespace_name' => pod_object[:metadata][:namespace],
+        'pod_id' => pod_object[:metadata][:uid],
+        'pod_name' => pod_object[:metadata][:name],
+        'pod_ip' => pod_object[:status][:podIP],
+        'containers' => syms_to_strs(container_meta),
+        'host' => pod_object[:spec][:nodeName]
       }
       kubernetes_metadata['annotations'] = annotations unless annotations.empty?
-      return kubernetes_metadata
+      kubernetes_metadata['labels'] = labels unless labels.empty?
+      kubernetes_metadata['master_url'] = @kubernetes_url unless @skip_master_url
+      kubernetes_metadata
     end
 
     def syms_to_strs(hsh)
       newhsh = {}
-      hsh.each_pair do |kk,vv|
+      hsh.each_pair do |kk, vv|
         if vv.is_a?(Hash)
           vv = syms_to_strs(vv)
         end
@@ -98,6 +125,5 @@ module KubernetesMetadata
       end
       newhsh
     end
-
   end
 end
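A quick probe of the new parse_pod_metadata (a minimal sketch: the instance variables below are normally set by the filter's configure step, match_annotations is stubbed so the sketch stays self-contained, and the pod hash is made-up data):

    require 'fluent/plugin/kubernetes_metadata_common'

    class PodMetaProbe
      include KubernetesMetadata::Common

      def initialize
        @skip_labels = false             # stand-ins for the filter's config flags
        @de_dot = false
        @de_slash = false
        @skip_container_metadata = false
        @skip_master_url = true
      end

      # Stub: the real method filters annotations against configured regexps.
      def match_annotations(_annotations)
        {}
      end
    end

    pod = {
      metadata: { namespace: 'default', name: 'mypod', uid: 'abc-123',
                  labels: { app: 'web' } },
      spec: { nodeName: 'node-1' },
      status: {
        podIP: '10.0.0.5',
        containerStatuses: [{ name: 'web', image: 'nginx:1.21',
                              imageID: 'sha256:0123', containerID: 'docker://feed' }]
      }
    }

    PodMetaProbe.new.parse_pod_metadata(pod)
    # Containers are now keyed by container *name* rather than container id,
    # and 'labels'/'master_url' are emitted only when not skipped.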
data/lib/fluent/plugin/kubernetes_metadata_stats.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #
 # Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
 # Kubernetes metadata
@@ -19,17 +21,16 @@
 require 'lru_redux'
 module KubernetesMetadata
   class Stats
-
     def initialize
       @stats = ::LruRedux::TTL::ThreadSafeCache.new(1000, 3600)
     end
 
     def bump(key)
-        @stats[key] = @stats.getset(key) { 0 } + 1
+      @stats[key] = @stats.getset(key) { 0 } + 1
     end
 
     def set(key, value)
-        @stats[key] = value
+      @stats[key] = value
     end
 
     def [](key)
@@ -37,10 +38,9 @@ module KubernetesMetadata
     end
 
     def to_s
-      "stats - " + [].tap do |a|
-        @stats.each {|k,v| a << "#{k.to_s}: #{v}"}
+      'stats - ' + [].tap do |a|
+        @stats.each { |k, v| a << "#{k}: #{v}" }
       end.join(', ')
     end
-
   end
 end
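Usage is as small as the class (a sketch; ordering in to_s follows the cache's internal order, and entries age out after the one-hour TTL set above):

    require 'fluent/plugin/kubernetes_metadata_stats'

    stats = KubernetesMetadata::Stats.new
    stats.bump(:pod_cache_watch_updates)   # increments, starting from 0
    stats.bump(:pod_cache_watch_updates)
    stats.set(:pod_cache_size, 42)

    stats[:pod_cache_watch_updates] # => 2
    stats.to_s # => e.g. "stats - pod_cache_watch_updates: 2, pod_cache_size: 42"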
data/lib/fluent/plugin/kubernetes_metadata_test_api_adapter.rb (new file)
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+#
+# Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
+# Kubernetes metadata
+#
+# Copyright 2021 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+require 'kubeclient'
+
+module KubernetesMetadata
+  module TestApiAdapter
+
+    def api_valid?
+      true
+    end
+    def get_namespace(namespace_name)
+      return {
+        metadata: {
+          name: namespace_name,
+          uid: namespace_name + 'uuid',
+          labels: {
+            foo_ns: 'bar_ns'
+          }
+        }
+      }
+    end
+
+    def get_pod(pod_name, namespace_name)
+      return {
+        metadata: {
+          name: pod_name,
+          namespace: namespace_name,
+          uid: namespace_name + namespace_name + "uuid",
+          labels: {
+            foo: 'bar'
+          }
+        },
+        spec: {
+          nodeName: 'aNodeName',
+          containers: [{
+            name: 'foo',
+            image: 'bar'
+          }, {
+            name: 'bar',
+            image: 'foo'
+          }]
+        },
+        status: {
+          podIP: '172.17.0.8'
+        }
+      }
+    end
+
+  end
+end
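Since the adapter is a plain module, tests can graft its canned responses onto any client object instead of hitting a live API server (a sketch; `client` is just a stand-in object here):

    require 'fluent/plugin/kubernetes_metadata_test_api_adapter'

    client = Object.new.extend(KubernetesMetadata::TestApiAdapter)

    client.api_valid?                                     # => true
    client.get_namespace('default')[:metadata][:uid]      # => "defaultuuid"
    client.get_pod('mypod', 'default')[:spec][:nodeName]  # => "aNodeName"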
data/lib/fluent/plugin/kubernetes_metadata_util.rb (new file)
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+#
+# Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
+# Kubernetes metadata
+#
+# Copyright 2021 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+module KubernetesMetadata
+  module Util
+    def create_time_from_record(record, internal_time)
+      time_key = @time_fields.detect { |ii| record.key?(ii) }
+      time = record[time_key]
+      if time.nil? || time.is_a?(String) && time.chop.empty?
+        # `internal_time` is a Fluent::EventTime, it can't compare with Time.
+        return Time.at(internal_time.to_f)
+      end
+
+      if ['_SOURCE_REALTIME_TIMESTAMP', '__REALTIME_TIMESTAMP'].include?(time_key)
+        timei = time.to_i
+        return Time.at(timei / 1_000_000, timei % 1_000_000)
+      end
+      return Time.at(time) if time.is_a?(Numeric)
+
+      Time.parse(time)
+    end
+  end
+end
+
+#https://stackoverflow.com/questions/5622435/how-do-i-convert-a-ruby-class-name-to-a-underscore-delimited-symbol
+class String
+  def underscore
+    word = self.dup
+    word.gsub!(/::/, '_')
+    word.gsub!(/([A-Z]+)([A-Z][a-z])/,'\1_\2')
+    word.gsub!(/([a-z\d])([A-Z])/,'\1_\2')
+    word.tr!("-", "_")
+    word.downcase!
+    word
+  end
+end
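Two quick checks of the helpers above (a sketch; the TimeProbe harness is hypothetical, standing in for the filter, which sets @time_fields in its configure step):

    require 'time'
    require 'fluent/plugin/kubernetes_metadata_util'

    'KubernetesMetadata::WatchPods'.underscore
    # => "kubernetes_metadata_watch_pods"

    class TimeProbe
      include KubernetesMetadata::Util

      def initialize(time_fields)
        @time_fields = time_fields
      end
    end

    probe = TimeProbe.new(['_SOURCE_REALTIME_TIMESTAMP', 'time'])
    # journald timestamps are microseconds since the epoch, hence the
    # seconds/usec split in the method above:
    probe.create_time_from_record({ '_SOURCE_REALTIME_TIMESTAMP' => '1609459200000000' }, nil)
    # => 2021-01-01 00:00:00 UTC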
data/lib/fluent/plugin/kubernetes_metadata_watch_namespaces.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #
 # Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
 # Kubernetes metadata
@@ -16,45 +18,137 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# TODO: this is mostly copy-paste from kubernetes_metadata_watch_pods.rb unify them
 require_relative 'kubernetes_metadata_common'
 
 module KubernetesMetadata
   module WatchNamespaces
-
     include ::KubernetesMetadata::Common
 
+    def set_up_namespace_thread
+      # Any failures / exceptions in the initial setup should raise
+      # Fluent:ConfigError, so that users can inspect potential errors in
+      # the configuration.
+      namespace_watcher = start_namespace_watch
+      Thread.current[:namespace_watch_retry_backoff_interval] = @watch_retry_interval
+      Thread.current[:namespace_watch_retry_count] = 0
+
+      # Any failures / exceptions in the followup watcher notice
+      # processing will be swallowed and retried. These failures /
+      # exceptions could be caused by Kubernetes API being temporarily
+      # down. We assume the configuration is correct at this point.
+      loop do
+        namespace_watcher ||= get_namespaces_and_start_watcher
+        process_namespace_watcher_notices(namespace_watcher)
+      rescue GoneError => e
+        # Expected error. Quietly go back through the loop in order to
+        # start watching from the latest resource versions
+        @stats.bump(:namespace_watch_gone_errors)
+        log.info('410 Gone encountered. Restarting namespace watch to reset resource versions.', e)
+        namespace_watcher = nil
+      rescue StandardError => e
+        @stats.bump(:namespace_watch_failures)
+        if Thread.current[:namespace_watch_retry_count] < @watch_retry_max_times
+          # Instead of raising exceptions and crashing Fluentd, swallow
+          # the exception and reset the watcher.
+          log.info(
+            'Exception encountered parsing namespace watch event. ' \
+            'The connection might have been closed. Sleeping for ' \
+            "#{Thread.current[:namespace_watch_retry_backoff_interval]} " \
+            'seconds and resetting the namespace watcher.', e
+          )
+          sleep(Thread.current[:namespace_watch_retry_backoff_interval])
+          Thread.current[:namespace_watch_retry_count] += 1
+          Thread.current[:namespace_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
+          namespace_watcher = nil
+        else
+          # Since retries failed for many times, log as errors instead
+          # of info and raise exceptions and trigger Fluentd to restart.
+          message =
+            'Exception encountered parsing namespace watch event. The ' \
+            'connection might have been closed. Retried ' \
+            "#{@watch_retry_max_times} times yet still failing. Restarting."
+          log.error(message, e)
+          raise Fluent::UnrecoverableError, message
+        end
+      end
+    end
+
     def start_namespace_watch
-      begin
-        resource_version = @client.get_namespaces.resourceVersion
-        watcher = @client.watch_namespaces(resource_version)
-      rescue Exception=>e
-        message = "start_namespace_watch: Exception encountered setting up namespace watch from Kubernetes API #{@apiVersion} endpoint #{@kubernetes_url}: #{e.message}"
-        message += " (#{e.response})" if e.respond_to?(:response)
-        log.debug(message)
-        raise Fluent::ConfigError, message
-      end
+      get_namespaces_and_start_watcher
+    rescue StandardError => e
+      message = 'start_namespace_watch: Exception encountered setting up ' \
+                "namespace watch from Kubernetes API #{@apiVersion} endpoint " \
+                "#{@kubernetes_url}: #{e.message}"
+      message += " (#{e.response})" if e.respond_to?(:response)
+      log.debug(message)
+
+      raise Fluent::ConfigError, message
+    end
+
+    # List all namespaces, record the resourceVersion and return a watcher
+    # starting from that resourceVersion.
+    def get_namespaces_and_start_watcher
+      options = {
+        resource_version: '0' # Fetch from API server cache instead of etcd quorum read
+      }
+      namespaces = @client.get_namespaces(options)
+      namespaces[:items].each do |namespace|
+        cache_key = namespace[:metadata][:uid]
+        @namespace_cache[cache_key] = parse_namespace_metadata(namespace)
+        @stats.bump(:namespace_cache_host_updates)
+      end
+
+      # continue watching from most recent resourceVersion
+      options[:resource_version] = namespaces[:metadata][:resourceVersion]
+
+      watcher = @client.watch_namespaces(options)
+      reset_namespace_watch_retry_stats
+      watcher
+    end
+
+    # Reset namespace watch retry count and backoff interval as there is a
+    # successful watch notice.
+    def reset_namespace_watch_retry_stats
+      Thread.current[:namespace_watch_retry_count] = 0
+      Thread.current[:namespace_watch_retry_backoff_interval] = @watch_retry_interval
+    end
+
+    # Process a watcher notice and potentially raise an exception.
+    def process_namespace_watcher_notices(watcher)
       watcher.each do |notice|
-        case notice.type
-        when 'MODIFIED'
-          cache_key = notice.object['metadata']['uid']
-          cached = @namespace_cache[cache_key]
-          if cached
-            @namespace_cache[cache_key] = parse_namespace_metadata(notice.object)
-            @stats.bump(:namespace_cache_watch_updates)
-          else
-            @stats.bump(:namespace_cache_watch_misses)
-          end
-        when 'DELETED'
-          # ignore and let age out for cases where
-          # deleted but still processing logs
-          @stats.bump(:namespace_cache_watch_deletes_ignored)
-        else
-          # Don't pay attention to creations, since the created namespace may not
-          # be used by any pod on this node.
-          @stats.bump(:namespace_cache_watch_ignored)
+        case notice[:type]
+        when 'MODIFIED'
+          reset_namespace_watch_retry_stats
+          cache_key = notice[:object][:metadata][:uid]
+          cached = @namespace_cache[cache_key]
+          if cached
+            @namespace_cache[cache_key] = parse_namespace_metadata(notice[:object])
+            @stats.bump(:namespace_cache_watch_updates)
+          else
+            @stats.bump(:namespace_cache_watch_misses)
+          end
+        when 'DELETED'
+          reset_namespace_watch_retry_stats
+          # ignore and let age out for cases where
+          # deleted but still processing logs
+          @stats.bump(:namespace_cache_watch_deletes_ignored)
+        when 'ERROR'
+          if notice[:object] && notice[:object][:code] == 410
+            @stats.bump(:namespace_watch_gone_notices)
+            raise GoneError
+          else
+            @stats.bump(:namespace_watch_error_type_notices)
+            message = notice[:object][:message] if notice[:object] && notice[:object][:message]
+            raise "Error while watching namespaces: #{message}"
+          end
+        else
+          reset_namespace_watch_retry_stats
+          # Don't pay attention to creations, since the created namespace may not
+          # be used by any namespace on this node.
+          @stats.bump(:namespace_cache_watch_ignored)
         end
       end
     end
-
   end
 end
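The retry path sleeps, then multiplies the backoff interval by the exponential base, so the wait grows geometrically until the retry budget is exhausted. A sketch of the resulting schedule, assuming an interval of 1 second, base 2 and 10 retries (assumed values; the real ones come from the plugin's watch_retry_* configuration, assuming the usual Fluentd mapping of config params to the instance variables used above):

    interval = 1   # assumed watch_retry_interval
    base     = 2   # assumed watch_retry_exponential_backoff_base
    retries  = 10  # assumed watch_retry_max_times

    # Seconds slept before each retry; #tap returns the pre-multiplication value.
    schedule = retries.times.map { interval.tap { interval *= base } }
    # => [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]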
data/lib/fluent/plugin/kubernetes_metadata_watch_pods.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #
 # Fluentd Kubernetes Metadata Filter Plugin - Enrich Fluentd events with
 # Kubernetes metadata
@@ -16,46 +18,153 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# TODO: this is mostly copy-paste from kubernetes_metadata_watch_namespaces.rb unify them
 require_relative 'kubernetes_metadata_common'
 
 module KubernetesMetadata
   module WatchPods
-
     include ::KubernetesMetadata::Common
 
+    def set_up_pod_thread
+      # Any failures / exceptions in the initial setup should raise
+      # Fluent:ConfigError, so that users can inspect potential errors in
+      # the configuration.
+      pod_watcher = start_pod_watch
+
+      Thread.current[:pod_watch_retry_backoff_interval] = @watch_retry_interval
+      Thread.current[:pod_watch_retry_count] = 0
+
+      # Any failures / exceptions in the followup watcher notice
+      # processing will be swallowed and retried. These failures /
+      # exceptions could be caused by Kubernetes API being temporarily
+      # down. We assume the configuration is correct at this point.
+      loop do
+        pod_watcher ||= get_pods_and_start_watcher
+        process_pod_watcher_notices(pod_watcher)
+      rescue GoneError => e
+        # Expected error. Quietly go back through the loop in order to
+        # start watching from the latest resource versions
+        @stats.bump(:pod_watch_gone_errors)
+        log.info('410 Gone encountered. Restarting pod watch to reset resource versions.', e)
+        pod_watcher = nil
+      rescue StandardError => e
+        @stats.bump(:pod_watch_failures)
+        if Thread.current[:pod_watch_retry_count] < @watch_retry_max_times
+          # Instead of raising exceptions and crashing Fluentd, swallow
+          # the exception and reset the watcher.
+          log.info(
+            'Exception encountered parsing pod watch event. The ' \
+            'connection might have been closed. Sleeping for ' \
+            "#{Thread.current[:pod_watch_retry_backoff_interval]} " \
+            'seconds and resetting the pod watcher.', e
+          )
+          sleep(Thread.current[:pod_watch_retry_backoff_interval])
+          Thread.current[:pod_watch_retry_count] += 1
+          Thread.current[:pod_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
+          pod_watcher = nil
+        else
+          # Since retries failed for many times, log as errors instead
+          # of info and raise exceptions and trigger Fluentd to restart.
+          message =
+            'Exception encountered parsing pod watch event. The ' \
+            'connection might have been closed. Retried ' \
+            "#{@watch_retry_max_times} times yet still failing. Restarting."
+          log.error(message, e)
+          raise Fluent::UnrecoverableError, message
+        end
+      end
+    end
+
     def start_pod_watch
-      begin
-        resource_version = @client.get_pods.resourceVersion
-        watcher = @client.watch_pods(resource_version)
-      rescue Exception => e
-        message = "Exception encountered fetching metadata from Kubernetes API endpoint: #{e.message}"
-        message += " (#{e.response})" if e.respond_to?(:response)
-
-        raise Fluent::ConfigError, message
-      end
+      get_pods_and_start_watcher
+    rescue StandardError => e
+      message = 'start_pod_watch: Exception encountered setting up pod watch ' \
+                "from Kubernetes API #{@apiVersion} endpoint " \
+                "#{@kubernetes_url}: #{e.message}"
+      message += " (#{e.response})" if e.respond_to?(:response)
+      log.debug(message)
+
+      raise Fluent::ConfigError, message
+    end
+
+    # List all pods, record the resourceVersion and return a watcher starting
+    # from that resourceVersion.
+    def get_pods_and_start_watcher
+      options = {
+        resource_version: '0' # Fetch from API server cache instead of etcd quorum read
+      }
+      if ENV['K8S_NODE_NAME']
+        options[:field_selector] = 'spec.nodeName=' + ENV['K8S_NODE_NAME']
+      end
+      if @last_seen_resource_version
+        options[:resource_version] = @last_seen_resource_version
+      else
+        pods = @client.get_pods(options)
+        pods[:items].each do |pod|
+          cache_key = pod[:metadata][:uid]
+          @cache[cache_key] = parse_pod_metadata(pod)
+          @stats.bump(:pod_cache_host_updates)
+        end
+
+        # continue watching from most recent resourceVersion
+        options[:resource_version] = pods[:metadata][:resourceVersion]
+      end
+
+      watcher = @client.watch_pods(options)
+      reset_pod_watch_retry_stats
+      watcher
+    end
+
+    # Reset pod watch retry count and backoff interval as there is a
+    # successful watch notice.
+    def reset_pod_watch_retry_stats
+      Thread.current[:pod_watch_retry_count] = 0
+      Thread.current[:pod_watch_retry_backoff_interval] = @watch_retry_interval
+    end
 
+    # Process a watcher notice and potentially raise an exception.
+    def process_pod_watcher_notices(watcher)
       watcher.each do |notice|
-        case notice.type
-        when 'MODIFIED'
-          cache_key = notice.object['metadata']['uid']
-          cached = @cache[cache_key]
-          if cached
-            @cache[cache_key] = parse_pod_metadata(notice.object)
-            @stats.bump(:pod_cache_watch_updates)
-          elsif ENV['K8S_NODE_NAME'] == notice.object['spec']['nodeName'] then
-            @cache[cache_key] = parse_pod_metadata(notice.object)
-            @stats.bump(:pod_cache_host_updates)
-          else
-            @stats.bump(:pod_cache_watch_misses)
-          end
-        when 'DELETED'
-          # ignore and let age out for cases where pods
-          # deleted but still processing logs
-          @stats.bump(:pod_cache_watch_delete_ignored)
-        else
-          # Don't pay attention to creations, since the created pod may not
-          # end up on this node.
-          @stats.bump(:pod_cache_watch_ignored)
+        # store version we processed to not reprocess it ... do not unset when there is no version in response
+        version = ( # TODO: replace with &.dig once we are on ruby 2.5+
+          notice[:object] && notice[:object][:metadata] && notice[:object][:metadata][:resourceVersion]
+        )
+        @last_seen_resource_version = version if version
+
+        case notice[:type]
+        when 'MODIFIED'
+          reset_pod_watch_retry_stats
+          cache_key = notice.dig(:object, :metadata, :uid)
+          cached = @cache[cache_key]
+          if cached
+            @cache[cache_key] = parse_pod_metadata(notice[:object])
+            @stats.bump(:pod_cache_watch_updates)
+          elsif ENV['K8S_NODE_NAME'] == notice[:object][:spec][:nodeName]
+            @cache[cache_key] = parse_pod_metadata(notice[:object])
+            @stats.bump(:pod_cache_host_updates)
+          else
+            @stats.bump(:pod_cache_watch_misses)
+          end
+        when 'DELETED'
+          reset_pod_watch_retry_stats
+          # ignore and let age out for cases where pods
+          # deleted but still processing logs
+          @stats.bump(:pod_cache_watch_delete_ignored)
+        when 'ERROR'
+          if notice[:object] && notice[:object][:code] == 410
+            @last_seen_resource_version = nil # requested resourceVersion was too old, need to reset
+            @stats.bump(:pod_watch_gone_notices)
+            raise GoneError
+          else
+            @stats.bump(:pod_watch_error_type_notices)
+            message = notice[:object][:message] if notice[:object] && notice[:object][:message]
+            raise "Error while watching pods: #{message}"
+          end
+        else
+          reset_pod_watch_retry_stats
+          # Don't pay attention to creations, since the created pod may not
+          # end up on this node.
+          @stats.bump(:pod_cache_watch_ignored)
         end
       end
     end
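The 410 handling can be exercised without a cluster: anything that responds to #each can stand in for the kubeclient watch stream. A minimal sketch (the probe class and fake notice are hypothetical; it assumes the gem and its lru_redux dependency are installed):

    require 'fluent/plugin/kubernetes_metadata_stats'
    require 'fluent/plugin/kubernetes_metadata_watch_pods'

    # Fakes the state the real filter sets up in #configure.
    class PodWatchProbe
      include KubernetesMetadata::WatchPods

      def initialize
        @stats = KubernetesMetadata::Stats.new
        @cache = {}
        @watch_retry_interval = 1
      end
    end

    # An ERROR notice carrying code 410 means the requested resourceVersion
    # has been compacted away; the watch must restart from a fresh list.
    notices = [{ type: 'ERROR', object: { code: 410 } }]

    begin
      PodWatchProbe.new.process_pod_watcher_notices(notices)
    rescue KubernetesMetadata::Common::GoneError
      # set_up_pod_thread catches this, leaves @last_seen_resource_version
      # cleared, and relists instead of crashing Fluentd.
    end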