kube-platform 3.3.1.gk.0
- checksums.yaml +7 -0
- data/Gemfile +31 -0
- data/README.md +192 -0
- data/bin/kube-platform +37 -0
- data/lib/kube-platform/application.rb +203 -0
- data/lib/kube-platform/cli.rb +114 -0
- data/lib/kube-platform/client.rb +217 -0
- data/lib/kube-platform/cluster.rb +224 -0
- data/lib/kube-platform/cluster_definition.rb +115 -0
- data/lib/kube-platform/configuration.rb +145 -0
- data/lib/kube-platform/exceptions.rb +9 -0
- data/lib/kube-platform/handlers/dockerhub_secret_copy.rb +52 -0
- data/lib/kube-platform/handlers/ebs_from_snapshot.rb +108 -0
- data/lib/kube-platform/handlers/handler.rb +36 -0
- data/lib/kube-platform/handlers/recreate_resource.rb +11 -0
- data/lib/kube-platform/handlers/secret_copy.rb +43 -0
- data/lib/kube-platform/handlers/wait_for_job_completion.rb +69 -0
- data/lib/kube-platform/handlers/wait_for_termination.rb +47 -0
- data/lib/kube-platform/health_check.rb +19 -0
- data/lib/kube-platform/health_checks/pods_ready.rb +188 -0
- data/lib/kube-platform/health_checks/r53_records.rb +82 -0
- data/lib/kube-platform/helpers/retry.rb +20 -0
- data/lib/kube-platform/images/descriptor.rb +49 -0
- data/lib/kube-platform/images/docker_hub_image.rb +49 -0
- data/lib/kube-platform/images/dockerhub_image_factory.rb +64 -0
- data/lib/kube-platform/images/kubernetes_docker_hub_secret_provider.rb +44 -0
- data/lib/kube-platform/images/repository.rb +77 -0
- data/lib/kube-platform/images/tag_associator.rb +80 -0
- data/lib/kube-platform/images/tagged_dockerhub_image.rb +36 -0
- data/lib/kube-platform/logger.rb +32 -0
- data/lib/kube-platform/manifest.rb +61 -0
- data/lib/kube-platform/pre_checks/r53_records.rb +66 -0
- data/lib/kube-platform/pre_checks/valid_platform_dependencies.rb +52 -0
- data/lib/kube-platform/pre_checks.rb +19 -0
- data/lib/kube-platform/resource.rb +152 -0
- data/lib/kube-platform/resource_repository.rb +73 -0
- data/lib/kube-platform/thor/descriptor_to_option_adapter.rb +33 -0
- data/lib/kube-platform/update_checker.rb +39 -0
- data/lib/kube-platform/version.rb +5 -0
- data/lib/kube-platform.rb +40 -0
- metadata +179 -0
+++ data/lib/kube-platform/handlers/dockerhub_secret_copy.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require_relative "secret_copy"
+
+module KubePlatform
+  module Handlers
+    class DockerhubSecretCopy < SecretCopy
+
+      def post_create(client)
+        update_service_account(client)
+      end
+
+      def post_update(client)
+        update_service_account(client)
+      end
+
+      private
+
+      def update_service_account(client)
+        wait_for_service_account(client)
+        api_client(client).patch_service_account(service_account_name, dockerhub_secret, namespace)
+      end
+
+      def api_client(client)
+        client.client_for_api
+      end
+
+      def dockerhub_secret
+        { imagePullSecrets: [{ name: secret_name }] }
+      end
+
+      def wait_for_service_account(client)
+        api_client(client).get_service_account(service_account_name, namespace)
+      rescue Kubeclient::ResourceNotFoundError
+        sleep(service_account_retry_delay)
+        retry # TODO: are infinite retries ok?
+      end
+
+      def service_account_name
+        config[:service_account_name]
+      end
+
+      def service_account_retry_delay
+        config[:service_account_retry_delay]
+      end
+
+      def namespace
+        resource.namespace
+      end
+    end
+  end
+end
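For orientation, the service-account patch this handler issues is an ordinary Kubeclient strategic-merge call. A minimal sketch, assuming a core-v1 Kubeclient client; the cluster URL, secret name, service account, and namespace below are illustrative, not taken from this release:

    require "kubeclient"

    client = Kubeclient::Client.new("https://cluster.example.com/api", "v1")
    patch  = { imagePullSecrets: [{ name: "dockerhub-credentials" }] } # hypothetical secret name
    # Attach the pull secret to the service account, as update_service_account does:
    client.patch_service_account("default", patch, "my-namespace")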
+++ data/lib/kube-platform/handlers/ebs_from_snapshot.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+require "aws-sdk-ec2"
+require_relative "handler"
+require_relative "../logger"
+
+module KubePlatform
+  module Handlers
+    class EbsFromSnapshot < Handler
+      include Logger
+
+      VOLUME_TYPE = "gp2"
+      WAIT_DELAY = 5.seconds
+
+      def pre_create(_client)
+        logger.info("Creating EBS volume from snapshot ID #{snapshot_id}")
+        volume_id = create_volume
+        update_resource(volume_id)
+        logger.info("Created EBS volume #{volume_id} for use with #{resource.kind} #{resource.name}")
+      end
+
+      def pre_delete(client)
+        if resource_exists?(client)
+          volume_id = volume_id_from_resource(client)
+          logger.info("Waiting for EBS volume #{volume_id} to delete")
+          wait_until_deleted(volume_id)
+        end
+      end
+
+      private
+
+      def resource_exists?(client)
+        retrieve_resource(client) != nil
+      end
+
+      def volume_id_from_resource(client)
+        unwrapped = retrieve_resource(client).unwrap
+        unwrapped.spec.awsElasticBlockStore.volumeID
+      end
+
+      def retrieve_resource(client)
+        @retrieve_resource ||= client.get(resource)
+      end
+
+      def availability_zone
+        config[:availability_zone]
+      end
+
+      def tags
+        config[:tags]
+      end
+
+      def create_volume
+        volume = ec2_client.create_volume(
+          availability_zone: availability_zone,
+          volume_type: VOLUME_TYPE,
+          snapshot_id: snapshot_id
+        )
+
+        id = volume.volume_id
+        wait_until_available(id)
+        id
+      end
+
+      def wait_until_available(volume_id)
+        ec2_client.wait_until(:volume_available, { volume_ids: [volume_id] }, delay: WAIT_DELAY)
+      end
+
+      def wait_until_deleted(volume_id)
+        ec2_client.wait_until(:volume_deleted, { volume_ids: [volume_id] }, delay: WAIT_DELAY)
+      end
+
+      def update_resource(volume_id)
+        unwrapped = resource.unwrap
+        unwrapped.spec.awsElasticBlockStore.volumeID = volume_id
+      end
+
+      def snapshot_id
+        @snapshot_id ||= snapshot_search(tags)
+      end
+
+      def ec2_client
+        @ec2_client ||= Aws::EC2::Client.new
+      end
+
+      def owner_id
+        @owner_id ||= Aws::STS::Client.new.get_caller_identity[:account]
+      end
+
+      def snapshot_search(tags)
+        snapshots = ec2_client.describe_snapshots(owner_ids: [owner_id], filters: build_filters(tags)).snapshots
+        snapshots.min { |a, b| b.start_time <=> a.start_time }.snapshot_id
+      end
+
+      def build_filters(tags)
+        tag_hash_to_filter_format(tags) + [filter_format("status", "completed")]
+      end
+
+      def tag_hash_to_filter_format(tags)
+        tags.map { |tag_name, value| filter_format("tag:#{tag_name}", value) }
+      end
+
+      def filter_format(name, value)
+        { name: name, values: Array(value).map(&:to_s) }
+      end
+    end
+  end
+end
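To make the filter helpers concrete: given a tag hash from config, build_filters produces the EC2 describe_snapshots filter shapes shown below. The tag names and values here are invented for illustration; the logic is copied from the helpers above:

    tags = { Name: "platform-db", Environment: ["staging", "qa"] }

    filters = tags.map { |tag_name, value| { name: "tag:#{tag_name}", values: Array(value).map(&:to_s) } }
    filters << { name: "status", values: ["completed"] }
    # => [{ name: "tag:Name", values: ["platform-db"] },
    #     { name: "tag:Environment", values: ["staging", "qa"] },
    #     { name: "status", values: ["completed"] }]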
+++ data/lib/kube-platform/handlers/handler.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module KubePlatform
+  module Handlers
+    class Handler
+      attr_reader :config
+      attr_accessor :resource # TODO: don't expose this mutator. Pass the resource into the constructor.
+
+      def initialize(config)
+        @config = config
+      end
+
+      def pre_create(_client)
+      end
+
+      def post_create(_client)
+      end
+
+      def pre_update(_client)
+      end
+
+      def post_update(_client)
+      end
+
+      def pre_delete(_client)
+      end
+
+      def post_delete(_client)
+      end
+
+      def ==(other)
+        self.class == other.class && config == other.config
+      end
+    end
+  end
+end
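The base class is a template: subclasses override whichever lifecycle hooks they need and receive the client at each step. A hypothetical subclass for illustration, assuming the gem is loaded (LogOnly is invented, not part of this release):

    module KubePlatform
      module Handlers
        class LogOnly < Handler
          def pre_create(_client)
            puts "about to create #{resource.kind} #{resource.name}"
          end

          def post_delete(_client)
            puts "finished deleting #{resource.kind} #{resource.name}"
          end
        end
      end
    end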
+++ data/lib/kube-platform/handlers/secret_copy.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+module KubePlatform
+  module Handlers
+    class SecretCopy < Handler
+      def pre_create(client)
+        copy_secret(client)
+      end
+
+      def pre_update(client)
+        copy_secret(client)
+      end
+
+      private
+
+      def copy_secret(client)
+        unwrapped = resource.unwrap
+        unwrapped.data = existing_secret(client)
+      end
+
+      def api_client(client)
+        client.client_for_api
+      end
+
+      def existing_secret(client)
+        secret_resource = api_client(client).get_secret(secret_name, source_namespace)
+        extract_secret(secret_resource)
+      end
+
+      def extract_secret(secret)
+        secret.data.to_hash
+      end
+
+      def secret_name
+        resource.name
+      end
+
+      def source_namespace
+        config[:source_namespace]
+      end
+    end
+  end
+end
+++ data/lib/kube-platform/handlers/wait_for_job_completion.rb
@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+
+require_relative "../logger"
+
+module KubePlatform
+  module Handlers
+    class WaitForJobCompletion < Handler
+      include Logger
+
+      DEFAULT_CONFIG = { polling_interval: 5, timeout: 300 }.freeze
+
+      def initialize(config)
+        super(config.apply_defaults(DEFAULT_CONFIG))
+      end
+
+      def post_create(client)
+        logger.info("Waiting for #{resource.kind} #{resource.name} to complete")
+        raise_if_job_does_not_complete!(client)
+      end
+
+      def timeout
+        config[:timeout]
+      end
+
+      def polling_interval
+        config[:polling_interval]
+      end
+
+      private
+
+      def raise_if_job_does_not_complete!(client)
+        attempts, interval = calculate_attempts_and_interval
+
+        job_complete_within_allotted_time?(client, attempts, interval) or
+          raise WaitTimeoutException, "#{resource.kind} #{resource.name} did not complete within #{timeout} seconds"
+      end
+
+      def calculate_attempts_and_interval
+        if polling_interval <= 0
+          logger.warn("Polling interval is set to #{polling_interval}. Will retry once after #{timeout} seconds.")
+          attempts = 2
+          interval = timeout
+        else
+          interval = polling_interval
+          attempts = (timeout / interval).ceil + 1
+        end
+
+        [attempts, interval]
+      end
+
+      def job_complete_within_allotted_time?(client, attempts, interval)
+        Helpers::Retry.with_retries(attempts, interval) do
+          if job_complete?(client) # TODO: Move this logging into the caller?
+            logger.info("#{resource.kind} #{resource.name} is complete")
+            true
+          else
+            logger.debug("#{resource.kind} #{resource.name} is not ready. Sleeping #{interval} seconds.")
+            false
+          end
+        end
+      end
+
+      def job_complete?(client)
+        job = client.get(resource)
+        job.unwrap.spec.completions == job.unwrap&.status&.succeeded
+      end
+    end
+  end
+end
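A worked example of the retry budget under DEFAULT_CONFIG, derived directly from calculate_attempts_and_interval above:

    timeout  = 300
    interval = 5
    attempts = (timeout / interval).ceil + 1
    # => 61 attempts: an initial check plus 60 polls with 5-second sleeps,
    #    which covers roughly the full 300-second timeout.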
+++ data/lib/kube-platform/handlers/wait_for_termination.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+require_relative "../logger"
+require_relative "../helpers/retry"
+
+module KubePlatform
+  module Handlers
+    class WaitForTermination < Handler
+      include Logger
+
+      DEFAULT_CONFIG = { polling_interval: 5, timeout: 300 }.freeze
+
+      def initialize(config)
+        super(config.apply_defaults(DEFAULT_CONFIG))
+      end
+
+      def polling_interval
+        config[:polling_interval]
+      end
+
+      def timeout
+        config[:timeout]
+      end
+
+      def post_delete(client)
+        resource_terminated?(client) or
+          raise WaitTimeoutException, "Timeout of #{timeout} seconds reached while waiting for #{resource.kind} #{resource.name} to terminate"
+
+        logger.debug("#{resource.kind} #{resource.name} has terminated")
+      end
+
+      private
+
+      def resource_terminated?(client)
+        attempts = timeout / polling_interval
+        Helpers::Retry.with_retries(attempts, polling_interval) do
+          if client.exist?(resource)
+            logger.debug("#{resource.kind} #{resource.name} exists. Sleeping for #{polling_interval} seconds")
+            false
+          else
+            true
+          end
+        end
+      end
+    end
+  end
+end
+++ data/lib/kube-platform/health_check.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module KubePlatform
+  class HealthCheck
+    attr_reader :name, :config
+
+    def initialize(name, config)
+      @name = name
+      @config = config
+    end
+
+    class << self
+      def load(class_name:, name:, config:)
+        klass = "KubePlatform::HealthChecks::#{class_name}".constantize
+        klass.new(name, config)
+      end
+    end
+  end
+end
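HealthCheck.load is a small factory: it constantizes the given class name under KubePlatform::HealthChecks and instantiates it. A sketch of the call, with illustrative values; note that the subclasses shipped here expect a config object that responds to apply_defaults (such as the gem's own configuration type), not a bare Hash:

    check = KubePlatform::HealthCheck.load(
      class_name: "PodsReady",
      name:       "pods_ready",
      config:     config # must respond to apply_defaults and [] lookups
    )
    check.class # => KubePlatform::HealthChecks::PodsReady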
+++ data/lib/kube-platform/health_checks/pods_ready.rb
@@ -0,0 +1,188 @@
+# frozen_string_literal: true
+
+require_relative '../health_check'
+require_relative '../logger'
+
+module KubePlatform
+  module HealthChecks
+    class PodsReady < HealthCheck
+      NotReadyContainerException = Class.new(KubePlatformException)
+
+      include Logger
+
+      DEFAULT_CONFIG = { attempts: 80, interval: 30 }.freeze
+
+      def initialize(name, config)
+        super(name, config.apply_defaults(DEFAULT_CONFIG))
+      end
+
+      # on success, returns true
+      # on failure, raises NotReadyContainerException
+      def run(client, _cluster_definition)
+        logger.info("Waiting for pods to become ready. This may take a while.")
+        list_of_unready_pods = wait_for_pods_to_become_ready(client)
+
+        if list_of_unready_pods.empty?
+          logger.info("All pods are ready")
+          true
+        else
+          log_unhealthy(:error, list_of_unready_pods)
+
+          independent_unhealthy_pods = independently_failing_pods(list_of_unready_pods)
+          not_ready_container_info = not_ready_containers(independent_unhealthy_pods)
+          not_ready_container_info.any? or raise ArgumentError, "not_ready_container_pairs is empty? #{independent_unhealthy_pods.inspect}"
+
+          raise NotReadyContainerException, not_ready_container_detail(not_ready_container_info, client)
+        end
+      end
+
+      private
+
+      def independently_failing_pods(unready_pods)
+        unready_pod_labels = Set.new(unready_pods.map { |pod| pod.metadata.labels.app || pod.metadata.labels.send(:"job-name") })
+
+        unready_pods.reject do |pod|
+          if (dependencies_csv = pod.metadata&.annotations&.platform_startup_dependencies)
+            dependencies_csv.split(/, */).any? { |pod_label| unready_pod_labels.include?(pod_label) }
+          end
+        end
+      end
+
+      # returns a hash of not ready pods
+      # key is the pod name
+      # value is a hash containing array of not ready containers and pod details
+      #
+      # Ex: { "pod-123" => { not_ready_containers: ["nginx"], pod_details: PodStruct } }
+      def not_ready_containers(unready_pods)
+        unready_pods.each_with_object({}) do |pod, result|
+          not_ready_containers = pod.status.containerStatuses&.reject(&:ready) || []
+          result[pod.metadata.name] = {
+            not_ready_containers: not_ready_containers.map(&:name),
+            details: pod
+          }
+        end
+      end
+
+      # returns detailed text describing the root cause of failure for the given array of not_ready_container pairs
+      # suitable for framing in an exception body
+      def not_ready_container_detail(not_ready_containers, client)
+        not_ready_containers.map do |pod_name, pod|
+          not_ready_container_details_for_pod(pod_name, pod[:not_ready_containers], pod[:details], client)
+        end.flatten.join("\n").sub(/\s+\z/m, "")
+      end
+
+      def not_ready_container_details_for_pod(pod_name, not_ready_containers, pod_details, client)
+        if not_ready_containers.any?
+          not_ready_containers.map do |container_name|
+            not_ready_container_details_from_log(pod_name, container_name, pod_details, client)
+          end
+        else
+          "Pod #{pod_name} not ready but no containerStatus available"
+        end
+      end
+
+      def not_ready_container_details_from_log(pod_name, container_name, pod_details, client)
+        summary = ["Pod #{pod_name} container #{container_name} not ready:"]
+
+        log_lines = client.client_for_api.get_pod_log(pod_name, config.cluster_name, container: container_name).to_s.split("\n")
+
+        summary << if (exit_reason_line = last_log_occurrence(log_lines, '"exit_reason":'))
+                     exit_reason_yaml_from_log_line(exit_reason_line) || "JSON parse error or no 'app' key: #{exit_reason_line.inspect}"
+                   else
+                     "last 50 log lines:\n#{last_n_log_lines(log_lines, 50).join("\n")}"
+                   end
+
+        summary.join("\n")
+      rescue Kubeclient::HttpError => ex
+        ex.message.include?("ContainerCreating") or raise
+        summary << pod_details.status.containerStatuses.map { |status| status.to_hash.stringify_keys }.to_yaml
+        summary.join("\n")
+      end
+
+      def exit_reason_yaml_from_log_line(exit_reason_line)
+        if (exit_reason_hash = JSON.parse(exit_reason_line) rescue nil)
+          if (exit_reason_hash_app = exit_reason_hash['app'])
+            exit_reason_hash_app_simplified = exit_reason_hash_app.except("host", "pid", "tid", "logfile", "fiber", "exit_code", "timestamp", "progname", "log_tags")
+            exit_reason_hash_app_simplified.to_yaml
+          end
+        end
+      end
+
+      def last_log_occurrence(lines, pattern)
+        last_line = nil
+        lines.each do |line|
+          if line[pattern]
+            last_line = line
+          end
+        end
+        last_line
+      end
+
+      def last_n_log_lines(lines, n)
+        last_line_count = [lines.size, n].min
+        lines[-last_line_count..-1]
+      end
+
+      # returns the array of 0 or more unready pods
+      def wait_for_pods_to_become_ready(client)
+        unready = []
+        Helpers::Retry.with_retries(attempts, retry_interval) do
+          unready = unready_pods(client)
+          unready.empty? or begin
+            log_status_update(unready)
+            false
+          end
+        end
+
+        unready
+      end
+
+      def unready_pods(client)
+        client.client_for_api("v1").get_pods(namespace: namespace).reject { |pod| evicted?(pod) || pod_ready?(pod) }
+      end
+
+      def pod_ready?(pod)
+        if job?(pod)
+          pod.status.phase == "Succeeded"
+        else
+          pod.status&.containerStatuses&.all?(&:ready)
+        end
+      end
+
+      def evicted?(pod)
+        pod.status.phase == "Failed" && pod.status.reason == "Evicted"
+      end
+
+      def job?(pod)
+        pod.metadata.ownerReferences&.any? { |owner| owner.kind == "Job" }
+      end
+
+      def log_status_update(unready)
+        optional_detail =
+          if unready.size <= 5
+            ": " + unready.map do |pod|
+              pod.metadata.name.split('-', 2).first
+            end.join(", ")
+          end
+        logger.info("#{unready.size} pod#{unready.size > 1 ? 's are' : ' is'} not in a ready state#{optional_detail}")
+        log_unhealthy(:debug, unready)
+      end
+
+      def log_unhealthy(level, pods)
+        pods.each { |pod| logger.send(level, "Pod #{pod.metadata.name} is not in a healthy state") }
+      end
+
+      def namespace
+        config[:cluster_name]
+      end
+
+      def attempts
+        config[:attempts]
+      end
+
+      def retry_interval
+        config[:interval]
+      end
+    end
+  end
+end
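The platform_startup_dependencies annotation is what keeps cascading failures out of the report: a pod whose declared dependency is itself unready is excluded, so only the root causes are raised. A self-contained toy of that filter, using OpenStruct stand-ins for pods (all names invented; the job-name label fallback is omitted for brevity):

    require "ostruct"
    require "set"

    db  = OpenStruct.new(metadata: OpenStruct.new(labels: OpenStruct.new(app: "db"), annotations: nil))
    web = OpenStruct.new(metadata: OpenStruct.new(labels: OpenStruct.new(app: "web"),
                                                  annotations: OpenStruct.new(platform_startup_dependencies: "db, cache")))

    unready = [db, web]
    labels  = Set.new(unready.map { |pod| pod.metadata.labels.app })
    independent = unready.reject do |pod|
      if (csv = pod.metadata&.annotations&.platform_startup_dependencies)
        csv.split(/, */).any? { |label| labels.include?(label) }
      end
    end
    independent.map { |pod| pod.metadata.labels.app } # => ["db"] — web waits on db, so only db is reported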
+++ data/lib/kube-platform/health_checks/r53_records.rb
@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+require "aws-sdk-route53"
+require_relative "../logger"
+require_relative "../helpers/retry"
+
+module KubePlatform
+  module HealthChecks
+    class R53Records < HealthCheck
+      include Logger
+
+      DEFAULT_CONFIG = { attempts: 20, interval: 30 }.freeze
+
+      def initialize(name, config)
+        super(name, config.apply_defaults(DEFAULT_CONFIG))
+      end
+
+      def run(_client, _cluster_definition)
+        logger.info("Checking for Route53 DNS records")
+
+        missing = missing_records
+        if missing.empty?
+          logger.info("All Route53 records exist")
+          true
+        else
+          missing.each { |record| logger.error("Route53 record #{record} does not exist") }
+          false
+        end
+      end
+
+      private
+
+      def missing_records
+        missing_records = fully_qualified_names
+        Helpers::Retry.with_retries(attempts, retry_interval) do
+          missing_records.reject! { |record| record_exists?(record) }
+          missing_records.empty? and break
+        end
+
+        missing_records
+      end
+
+      def fully_qualified_names
+        @fully_qualified_names ||= r53_records.map { |record| record.end_with?(".") ? record : "#{record}." }
+      end
+
+      def record_exists?(record)
+        response = r53_client.list_resource_record_sets(
+          hosted_zone_id: r53_zone_id,
+          start_record_name: record,
+          start_record_type: "A",
+          max_items: 1
+        )
+        response.resource_record_sets.first&.name == record
+      end
+
+      def r53_client
+        @r53_client ||= Aws::Route53::Client.new(region: region)
+      end
+
+      def region
+        config[:region]
+      end
+
+      def r53_zone_id
+        config[:r53_zone_id]
+      end
+
+      def r53_records
+        @r53_records ||= config[:r53_records]
+      end
+
+      def attempts
+        config[:attempts]
+      end
+
+      def retry_interval
+        config[:interval]
+      end
+    end
+  end
+end
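One subtlety worth noting: Route53 returns fully qualified names, so the check appends a trailing dot before comparing. Derived directly from fully_qualified_names above (record names hypothetical):

    records = ["app.example.com", "api.example.com."]
    records.map { |r| r.end_with?(".") ? r : "#{r}." }
    # => ["app.example.com.", "api.example.com."]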
+++ data/lib/kube-platform/helpers/retry.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+module KubePlatform
+  module Helpers
+    module Retry
+      class << self
+        def with_retries(max_attempts, retry_interval)
+          try = 0
+          loop do
+            success = yield and break success
+
+            (try += 1) >= max_attempts and break false
+
+            sleep(retry_interval)
+          end
+        end
+      end
+    end
+  end
+end
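with_retries yields up to max_attempts times, sleeping retry_interval seconds between falsy results; it returns the block's first truthy value, or false once attempts are exhausted. A standalone usage sketch, assuming the gem is loaded (the file path is illustrative):

    result = KubePlatform::Helpers::Retry.with_retries(3, 1) do
      File.exist?("/tmp/ready") # a truthy return breaks the loop and becomes the result
    end
    result # => true as soon as a try succeeds; false after 3 failed tries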