kube-platform 3.3.1.gk.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +31 -0
- data/README.md +192 -0
- data/bin/kube-platform +37 -0
- data/lib/kube-platform/application.rb +203 -0
- data/lib/kube-platform/cli.rb +114 -0
- data/lib/kube-platform/client.rb +217 -0
- data/lib/kube-platform/cluster.rb +224 -0
- data/lib/kube-platform/cluster_definition.rb +115 -0
- data/lib/kube-platform/configuration.rb +145 -0
- data/lib/kube-platform/exceptions.rb +9 -0
- data/lib/kube-platform/handlers/dockerhub_secret_copy.rb +52 -0
- data/lib/kube-platform/handlers/ebs_from_snapshot.rb +108 -0
- data/lib/kube-platform/handlers/handler.rb +36 -0
- data/lib/kube-platform/handlers/recreate_resource.rb +11 -0
- data/lib/kube-platform/handlers/secret_copy.rb +43 -0
- data/lib/kube-platform/handlers/wait_for_job_completion.rb +69 -0
- data/lib/kube-platform/handlers/wait_for_termination.rb +47 -0
- data/lib/kube-platform/health_check.rb +19 -0
- data/lib/kube-platform/health_checks/pods_ready.rb +188 -0
- data/lib/kube-platform/health_checks/r53_records.rb +82 -0
- data/lib/kube-platform/helpers/retry.rb +20 -0
- data/lib/kube-platform/images/descriptor.rb +49 -0
- data/lib/kube-platform/images/docker_hub_image.rb +49 -0
- data/lib/kube-platform/images/dockerhub_image_factory.rb +64 -0
- data/lib/kube-platform/images/kubernetes_docker_hub_secret_provider.rb +44 -0
- data/lib/kube-platform/images/repository.rb +77 -0
- data/lib/kube-platform/images/tag_associator.rb +80 -0
- data/lib/kube-platform/images/tagged_dockerhub_image.rb +36 -0
- data/lib/kube-platform/logger.rb +32 -0
- data/lib/kube-platform/manifest.rb +61 -0
- data/lib/kube-platform/pre_checks/r53_records.rb +66 -0
- data/lib/kube-platform/pre_checks/valid_platform_dependencies.rb +52 -0
- data/lib/kube-platform/pre_checks.rb +19 -0
- data/lib/kube-platform/resource.rb +152 -0
- data/lib/kube-platform/resource_repository.rb +73 -0
- data/lib/kube-platform/thor/descriptor_to_option_adapter.rb +33 -0
- data/lib/kube-platform/update_checker.rb +39 -0
- data/lib/kube-platform/version.rb +5 -0
- data/lib/kube-platform.rb +40 -0
- metadata +179 -0
data/lib/kube-platform/handlers/dockerhub_secret_copy.rb
@@ -0,0 +1,52 @@

```ruby
# frozen_string_literal: true

require_relative "secret_copy"

module KubePlatform
  module Handlers
    class DockerhubSecretCopy < SecretCopy

      def post_create(client)
        update_service_account(client)
      end

      def post_update(client)
        update_service_account(client)
      end

      private

      def update_service_account(client)
        wait_for_service_account(client)
        api_client(client).patch_service_account(service_account_name, dockerhub_secret, namespace)
      end

      def api_client(client)
        client.client_for_api
      end

      def dockerhub_secret
        { imagePullSecrets: [{ name: secret_name }] }
      end

      def wait_for_service_account(client)
        api_client(client).get_service_account(service_account_name, namespace)
      rescue Kubeclient::ResourceNotFoundError
        sleep(service_account_retry_delay)
        retry # TODO: are infinite retries ok?
      end

      def service_account_name
        config[:service_account_name]
      end

      def service_account_retry_delay
        config[:service_account_retry_delay]
      end

      def namespace
        resource.namespace
      end
    end
  end
end
```
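A hypothetical illustration of the service-account patch built above: `secret_name` comes from the copied secret resource (via `SecretCopy`), and `service_account_name` from the handler config. Names below are invented for the example.

```ruby
# With a copied secret named "dockerhub-pull" and service_account_name "default",
# the handler attaches the image pull secret to the service account in the
# resource's namespace once that service account exists.
patch = { imagePullSecrets: [{ name: "dockerhub-pull" }] }
# api_client(client).patch_service_account("default", patch, "my-namespace")
```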
data/lib/kube-platform/handlers/ebs_from_snapshot.rb
@@ -0,0 +1,108 @@

```ruby
# frozen_string_literal: true

require "aws-sdk-ec2"
require_relative "handler"
require_relative "../logger"

module KubePlatform
  module Handlers
    class EbsFromSnapshot < Handler
      include Logger

      VOLUME_TYPE = "gp2"
      WAIT_DELAY = 5.seconds

      def pre_create(_client)
        logger.info("Creating EBS volume from snapshot ID #{snapshot_id}")
        volume_id = create_volume
        update_resource(volume_id)
        logger.info("Created EBS volume #{volume_id} for use with #{resource.kind} #{resource.name}")
      end

      def pre_delete(client)
        if resource_exists?(client)
          volume_id = volume_id_from_resource(client)
          logger.info("Waiting for EBS volume #{volume_id} to delete")
          wait_until_deleted(volume_id)
        end
      end

      private

      def resource_exists?(client)
        retrieve_resource(client) != nil
      end

      def volume_id_from_resource(client)
        unwrapped = retrieve_resource(client).unwrap
        unwrapped.spec.awsElasticBlockStore.volumeID
      end

      def retrieve_resource(client)
        @retrieve_resource ||= client.get(resource)
      end

      def availability_zone
        config[:availability_zone]
      end

      def tags
        config[:tags]
      end

      def create_volume
        volume = ec2_client.create_volume(
          availability_zone: availability_zone,
          volume_type: VOLUME_TYPE,
          snapshot_id: snapshot_id
        )

        id = volume.volume_id
        wait_until_available(id)
        id
      end

      def wait_until_available(volume_id)
        ec2_client.wait_until(:volume_available, { volume_ids: [volume_id] }, delay: WAIT_DELAY)
      end

      def wait_until_deleted(volume_id)
        ec2_client.wait_until(:volume_deleted, { volume_ids: [volume_id] }, delay: WAIT_DELAY)
      end

      def update_resource(volume_id)
        unwrapped = resource.unwrap
        unwrapped.spec.awsElasticBlockStore.volumeID = volume_id
      end

      def snapshot_id
        @snapshot_id ||= snapshot_search(tags)
      end

      def ec2_client
        @ec2_client ||= Aws::EC2::Client.new
      end

      def owner_id
        @owner_id ||= Aws::STS::Client.new.get_caller_identity[:account]
      end

      def snapshot_search(tags)
        snapshots = ec2_client.describe_snapshots(owner_ids: [owner_id], filters: build_filters(tags)).snapshots
        snapshots.min { |a, b| b.start_time <=> a.start_time }.snapshot_id
      end

      def build_filters(tags)
        tag_hash_to_filter_format(tags) + [filter_format("status", "completed")]
      end

      def tag_hash_to_filter_format(tags)
        tags.map { |tag_name, value| filter_format("tag:#{tag_name}", value) }
      end

      def filter_format(name, value)
        { name: name, values: Array(value).map(&:to_s) }
      end
    end
  end
end
```
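A worked example of the snapshot filters built by `build_filters` above, assuming a hypothetical `tags` config of `{ Name: "platform-db", environment: "staging" }` (the tag names and values are invented; only the filter shape comes from the code):

```ruby
# Each config tag becomes a "tag:<name>" filter, plus a fixed status filter.
filters = [
  { name: "tag:Name",        values: ["platform-db"] },
  { name: "tag:environment", values: ["staging"] },
  { name: "status",          values: ["completed"] }
]
# ec2_client.describe_snapshots(owner_ids: [owner_id], filters: filters)
# The newest completed snapshot's ID then seeds the new gp2 volume.
```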
data/lib/kube-platform/handlers/handler.rb
@@ -0,0 +1,36 @@

```ruby
# frozen_string_literal: true

module KubePlatform
  module Handlers
    class Handler
      attr_reader :config
      attr_accessor :resource # TODO: don't expose this mutator. Pass the resource into the constructor.

      def initialize(config)
        @config = config
      end

      def pre_create(_client)
      end

      def post_create(_client)
      end

      def pre_update(_client)
      end

      def post_update(_client)
      end

      def pre_delete(_client)
      end

      def post_delete(_client)
      end

      def ==(other)
        self.class == other.class && config == other.config
      end
    end
  end
end
```
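A minimal sketch of a custom handler built on the base class above, assuming the conventions shown: `config` arrives via the constructor, `resource` is assigned by the framework before the lifecycle hooks run, and a subclass overrides only the hooks it needs. The handler name is hypothetical and not part of the gem.

```ruby
module KubePlatform
  module Handlers
    # Hypothetical example handler: logs after a resource is created.
    class AnnounceCreation < Handler
      def post_create(_client)
        puts "created #{resource.kind} #{resource.name} with #{config.inspect}"
      end
    end
  end
end
```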
data/lib/kube-platform/handlers/secret_copy.rb
@@ -0,0 +1,43 @@

```ruby
# frozen_string_literal: true

module KubePlatform
  module Handlers
    class SecretCopy < Handler
      def pre_create(client)
        copy_secret(client)
      end

      def pre_update(client)
        copy_secret(client)
      end

      private

      def copy_secret(client)
        unwrapped = resource.unwrap
        unwrapped.data = existing_secret(client)
      end

      def api_client(client)
        client.client_for_api
      end

      def existing_secret(client)
        secret_resource = api_client(client).get_secret(secret_name, source_namespace)
        extract_secret(secret_resource)
      end

      def extract_secret(secret)
        secret.data.to_hash
      end

      def secret_name
        resource.name
      end

      def source_namespace
        config[:source_namespace]
      end
    end
  end
end
```
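A hypothetical handler configuration for `SecretCopy`: the secret named after the managed resource is read from `source_namespace` and its data is copied onto the resource before it is created or updated in its own namespace. The namespace value below is invented.

```ruby
config = { source_namespace: "platform-secrets" }
```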
data/lib/kube-platform/handlers/wait_for_job_completion.rb
@@ -0,0 +1,69 @@

```ruby
# frozen_string_literal: true

require_relative "../logger"

module KubePlatform
  module Handlers
    class WaitForJobCompletion < Handler
      include Logger

      DEFAULT_CONFIG = { polling_interval: 5, timeout: 300 }.freeze

      def initialize(config)
        super(config.apply_defaults(DEFAULT_CONFIG))
      end

      def post_create(client)
        logger.info("Waiting for #{resource.kind} #{resource.name} to complete")
        raise_if_job_does_not_complete!(client)
      end

      def timeout
        config[:timeout]
      end

      def polling_interval
        config[:polling_interval]
      end

      private

      def raise_if_job_does_not_complete!(client)
        attempts, interval = calculate_attempts_and_interval

        job_complete_within_allotted_time?(client, attempts, interval) or
          raise WaitTimeoutException, "#{resource.kind} #{resource.name} did not complete within #{timeout} seconds"
      end

      def calculate_attempts_and_interval
        if polling_interval <= 0
          logger.warn("Polling interval is set to #{polling_interval}. Will retry once after #{timeout} seconds.")
          attempts = 2
          interval = timeout
        else
          interval = polling_interval
          attempts = (timeout / interval).ceil + 1
        end

        [attempts, interval]
      end

      def job_complete_within_allotted_time?(client, attempts, interval)
        Helpers::Retry.with_retries(attempts, interval) do
          if job_complete?(client) # TODO: Move this logging into the caller?
            logger.info("#{resource.kind} #{resource.name} is complete")
            true
          else
            logger.debug("#{resource.kind} #{resource.name} is not ready. Sleeping #{interval} seconds.")
            false
          end
        end
      end

      def job_complete?(client)
        job = client.get(resource)
        job.unwrap.spec.completions == job.unwrap&.status&.succeeded
      end
    end
  end
end
```
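A worked example of `calculate_attempts_and_interval` with the `DEFAULT_CONFIG` shown above: the job state is checked once up front and then once per polling interval until the timeout window is covered, after which `WaitTimeoutException` is raised.

```ruby
timeout = 300
polling_interval = 5
attempts = (timeout / polling_interval).ceil + 1 # => 61 checks, one every 5 seconds
```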
data/lib/kube-platform/handlers/wait_for_termination.rb
@@ -0,0 +1,47 @@

```ruby
# frozen_string_literal: true

require_relative "../logger"
require_relative "../helpers/retry"

module KubePlatform
  module Handlers
    class WaitForTermination < Handler
      include Logger

      DEFAULT_CONFIG = { polling_interval: 5, timeout: 300 }.freeze

      def initialize(config)
        super(config.apply_defaults(DEFAULT_CONFIG))
      end

      def polling_interval
        config[:polling_interval]
      end

      def timeout
        config[:timeout]
      end

      def post_delete(client)
        resource_terminated?(client) or
          raise WaitTimeoutException, "Timeout of #{timeout} seconds reached while waiting for #{resource.kind} #{resource.name} to terminate"

        logger.debug("#{resource.kind} #{resource.name} has terminated")
      end

      private

      def resource_terminated?(client)
        attempts = timeout / polling_interval
        Helpers::Retry.with_retries(attempts, polling_interval) do
          if client.exist?(resource)
            logger.debug("#{resource.kind} #{resource.name} exists. Sleeping for #{polling_interval} seconds")
            false
          else
            true
          end
        end
      end
    end
  end
end
```
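A worked example of the attempt calculation above with the default config; note that, unlike `WaitForJobCompletion`, this handler uses plain integer division:

```ruby
timeout = 300
polling_interval = 5
attempts = timeout / polling_interval # => 60 existence checks, roughly one every 5 seconds
```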
data/lib/kube-platform/health_check.rb
@@ -0,0 +1,19 @@

```ruby
# frozen_string_literal: true

module KubePlatform
  class HealthCheck
    attr_reader :name, :config

    def initialize(name, config)
      @name = name
      @config = config
    end

    class << self
      def load(class_name:, name:, config:)
        klass = "KubePlatform::HealthChecks::#{class_name}".constantize
        klass.new(name, config)
      end
    end
  end
end
```
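A minimal sketch of `HealthCheck.load`, assuming a cluster definition supplies the class name and config. `class_name` is constantized under `KubePlatform::HealthChecks`, so `"PodsReady"` resolves to `KubePlatform::HealthChecks::PodsReady`; `String#constantize` is assumed to come from ActiveSupport. The config is shown as a plain hash for illustration; the gem's own configuration object (which responds to `apply_defaults`) would normally be passed instead.

```ruby
check = KubePlatform::HealthCheck.load(
  class_name: "PodsReady",
  name: "pods_ready",
  config: { cluster_name: "staging", attempts: 80, interval: 30 } # hypothetical values
)
# check.run(client, cluster_definition)  # client and cluster_definition supplied by the caller
```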
data/lib/kube-platform/health_checks/pods_ready.rb
@@ -0,0 +1,188 @@

```ruby
# frozen_string_literal: true

require_relative '../health_check'
require_relative '../logger'

module KubePlatform
  module HealthChecks
    class PodsReady < HealthCheck
      NotReadyContainerException = Class.new(KubePlatformException)

      include Logger

      DEFAULT_CONFIG = { attempts: 80, interval: 30 }.freeze

      def initialize(name, config)
        super(name, config.apply_defaults(DEFAULT_CONFIG))
      end

      # on success, returns true
      # on failure, raises NotReadyContainerException
      def run(client, _cluster_definition)
        logger.info("Waiting for pods to become ready. This may take a while.")
        list_of_unready_pods = wait_for_pods_to_become_ready(client)

        if list_of_unready_pods.empty?
          logger.info("All pods are ready")
          true
        else
          log_unhealthy(:error, list_of_unready_pods)

          independent_unhealthy_pods = independently_failing_pods(list_of_unready_pods)
          not_ready_container_info = not_ready_containers(independent_unhealthy_pods)
          not_ready_container_info.any? or raise ArgumentError, "not_ready_container_pairs is empty? #{independent_unhealthy_pods.inspect}"

          raise NotReadyContainerException, not_ready_container_detail(not_ready_container_info, client)
        end
      end

      private

      def independently_failing_pods(unready_pods)
        unready_pod_labels = Set.new(unready_pods.map { |pod| pod.metadata.labels.app || pod.metadata.labels.send(:"job-name") })

        unready_pods.reject do |pod|
          if (dependencies_csv = pod.metadata&.annotations&.platform_startup_dependencies)
            dependencies_csv.split(/, */).any? { |pod_label| unready_pod_labels.include?(pod_label) }
          end
        end
      end

      # returns a hash of not ready pods
      # key is the pod name
      # value is a hash containing array of not ready containers and pod details
      #
      # Ex: { "pod-123" => { not_ready_containers: ["nginx"], pod_details: PodStruct } }
      def not_ready_containers(unready_pods)
        unready_pods.each_with_object({}) do |pod, result|
          not_ready_containers = pod.status.containerStatuses&.reject(&:ready) || []
          result[pod.metadata.name] = {
            not_ready_containers: not_ready_containers.map(&:name),
            details: pod
          }
        end
      end

      # returns detailed text describing the root cause of failure for the given array of not_ready_container pairs
      # suitable for framing in an exception body
      def not_ready_container_detail(not_ready_containers, client)
        not_ready_containers.map do |pod_name, pod|
          not_ready_container_details_for_pod(pod_name, pod[:not_ready_containers], pod[:details], client)
        end.flatten.join("\n").sub(/\s+\z/m, "")
      end

      def not_ready_container_details_for_pod(pod_name, not_ready_containers, pod_details, client)
        if not_ready_containers.any?
          not_ready_containers.map do |container_name|
            not_ready_container_details_from_log(pod_name, container_name, pod_details, client)
          end
        else
          "Pod #{pod_name} not ready but no containerStatus available"
        end
      end

      def not_ready_container_details_from_log(pod_name, container_name, pod_details, client)
        summary = ["Pod #{pod_name} container #{container_name} not ready:"]

        log_lines = client.client_for_api.get_pod_log(pod_name, config.cluster_name, container: container_name).to_s.split("\n")

        summary << if (exit_reason_line = last_log_occurrence(log_lines, '"exit_reason":'))
                     exit_reason_yaml_from_log_line(exit_reason_line) || "JSON parse error or no 'app' key: #{exit_reason_line.inspect}"
                   else
                     "last 50 log lines:\n#{last_n_log_lines(log_lines, 50).join("\n")}"
                   end

        summary.join("\n")
      rescue Kubeclient::HttpError => ex
        ex.message.include?("ContainerCreating") or raise
        summary << pod_details.status.containerStatuses.map { |status| status.to_hash.stringify_keys }.to_yaml
        summary.join("\n")
      end

      def exit_reason_yaml_from_log_line(exit_reason_line)
        if (exit_reason_hash = JSON.parse(exit_reason_line) rescue nil)
          if (exit_reason_hash_app = exit_reason_hash['app'])
            exit_reason_hash_app_simplified = exit_reason_hash_app.except("host", "pid", "tid", "logfile", "fiber", "exit_code", "timestamp", "progname", "log_tags")
            exit_reason_hash_app_simplified.to_yaml
          end
        end
      end

      def last_log_occurrence(lines, pattern)
        last_line = nil
        lines.each do |line|
          if line[pattern]
            last_line = line
          end
        end
        last_line
      end

      def last_n_log_lines(lines, n)
        last_line_count = [lines.size, n].min
        lines[-last_line_count..-1]
      end

      # returns the array of 0 or more unready pods
      def wait_for_pods_to_become_ready(client)
        unready = []
        Helpers::Retry.with_retries(attempts, retry_interval) do
          unready = unready_pods(client)
          unready.empty? or begin
            log_status_update(unready)
            false
          end
        end

        unready
      end

      def unready_pods(client)
        client.client_for_api("v1").get_pods(namespace: namespace).reject { |pod| evicted?(pod) || pod_ready?(pod) }
      end

      def pod_ready?(pod)
        if job?(pod)
          pod.status.phase == "Succeeded"
        else
          pod.status&.containerStatuses&.all?(&:ready)
        end
      end

      def evicted?(pod)
        pod.status.phase == "Failed" && pod.status.reason == "Evicted"
      end

      def job?(pod)
        pod.metadata.ownerReferences&.any? { |owner| owner.kind == "Job" }
      end

      def log_status_update(unready)
        optional_detail =
          if unready.size <= 5
            ": " + unready.map do |pod|
              pod.metadata.name.split('-', 2).first
            end.join(", ")
          end
        logger.info("#{unready.size} pod#{unready.size > 1 ? 's are' : ' is'} not in a ready state#{optional_detail}")
        log_unhealthy(:debug, unready)
      end

      def log_unhealthy(level, pods)
        pods.each { |pod| logger.send(level, "Pod #{pod.metadata.name} is not in a healthy state") }
      end

      def namespace
        config[:cluster_name]
      end

      def attempts
        config[:attempts]
      end

      def retry_interval
        config[:interval]
      end
    end
  end
end
```
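A hypothetical illustration of the startup-dependency filtering performed by `independently_failing_pods` above. The annotation key and the `app`/`job-name` labels come from the code; the pod names and values are invented.

```ruby
# A pod annotated with platform_startup_dependencies is dropped from the
# failure report while any of its listed dependencies still has an unready pod.
web_pod_metadata = {
  labels: { app: "web" },
  annotations: { platform_startup_dependencies: "db, redis" }
}
# If a pod labelled app=db is also unready, "web" is treated as a downstream
# casualty and only "db" is reported as independently failing.
```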
data/lib/kube-platform/health_checks/r53_records.rb
@@ -0,0 +1,82 @@

```ruby
# frozen_string_literal: true

require "aws-sdk-route53"
require_relative "../logger"
require_relative "../helpers/retry"

module KubePlatform
  module HealthChecks
    class R53Records < HealthCheck
      include Logger

      DEFAULT_CONFIG = { attempts: 20, interval: 30 }.freeze

      def initialize(name, config)
        super(name, config.apply_defaults(DEFAULT_CONFIG))
      end

      def run(_client, _cluster_definition)
        logger.info("Checking for Route53 DNS records")

        missing = missing_records
        if missing.empty?
          logger.info("All Route53 records exist")
          true
        else
          missing.each { |record| logger.error("Route53 record #{record} does not exist") }
          false
        end
      end

      private

      def missing_records
        missing_records = fully_qualified_names
        Helpers::Retry.with_retries(attempts, retry_interval) do
          missing_records.reject! { |record| record_exists?(record) }
          missing_records.empty? and break
        end

        missing_records
      end

      def fully_qualified_names
        @fully_qualified_names ||= r53_records.map { |record| record.end_with?(".") ? record : "#{record}." }
      end

      def record_exists?(record)
        response = r53_client.list_resource_record_sets(
          hosted_zone_id: r53_zone_id,
          start_record_name: record,
          start_record_type: "A",
          max_items: 1
        )
        response.resource_record_sets.first&.name == record
      end

      def r53_client
        @r53_client ||= Aws::Route53::Client.new(region: region)
      end

      def region
        config[:region]
      end

      def r53_zone_id
        config[:r53_zone_id]
      end

      def r53_records
        @r53_records ||= config[:r53_records]
      end

      def attempts
        config[:attempts]
      end

      def retry_interval
        config[:interval]
      end
    end
  end
end
```
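A hypothetical configuration for the `R53Records` check above, shown as a plain hash for illustration (the real config object also responds to `apply_defaults`). Record names without a trailing dot are fully qualified before lookup, so `"app.example.com"` is queried as `"app.example.com."`; all values below are invented.

```ruby
config = {
  region: "us-east-1",
  r53_zone_id: "Z0000000EXAMPLE",
  r53_records: ["app.example.com", "api.example.com."],
  attempts: 20,
  interval: 30
}
```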
data/lib/kube-platform/helpers/retry.rb
@@ -0,0 +1,20 @@

```ruby
# frozen_string_literal: true

module KubePlatform
  module Helpers
    module Retry
      class << self
        def with_retries(max_attempts, retry_interval)
          try = 0
          loop do
            success = yield and break success

            (try += 1) >= max_attempts and break false

            sleep(retry_interval)
          end
        end
      end
    end
  end
end
```
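A minimal sketch of how `Helpers::Retry.with_retries` is used by the handlers and health checks above: the block is evaluated up to `max_attempts` times with a `retry_interval`-second sleep between attempts, and the first truthy block result is returned, otherwise `false`. The readiness probe below is a hypothetical stand-in.

```ruby
ready = KubePlatform::Helpers::Retry.with_retries(3, 1) do
  File.exist?("/tmp/ready.flag") # hypothetical readiness condition
end
puts ready ? "ready" : "timed out"
```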