paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,4679 @@
|
|
|
1
|
+
# Copyright 2015-2018 Yelp Inc.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
import base64
|
|
14
|
+
import functools
|
|
15
|
+
import hashlib
|
|
16
|
+
import itertools
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import math
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
from datetime import timezone
|
|
24
|
+
from enum import Enum
|
|
25
|
+
from functools import lru_cache
|
|
26
|
+
from inspect import currentframe
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
from typing import cast
|
|
30
|
+
from typing import Collection
|
|
31
|
+
from typing import Container
|
|
32
|
+
from typing import Dict
|
|
33
|
+
from typing import Iterable
|
|
34
|
+
from typing import List
|
|
35
|
+
from typing import Literal
|
|
36
|
+
from typing import Mapping
|
|
37
|
+
from typing import MutableMapping
|
|
38
|
+
from typing import NamedTuple
|
|
39
|
+
from typing import Optional
|
|
40
|
+
from typing import Sequence
|
|
41
|
+
from typing import Set
|
|
42
|
+
from typing import Tuple
|
|
43
|
+
from typing import Union
|
|
44
|
+
|
|
45
|
+
import a_sync
|
|
46
|
+
import requests
|
|
47
|
+
import service_configuration_lib
|
|
48
|
+
from humanfriendly import parse_size
|
|
49
|
+
from kubernetes import client as kube_client
|
|
50
|
+
from kubernetes import config as kube_config
|
|
51
|
+
from kubernetes.client import CoreV1Event
|
|
52
|
+
from kubernetes.client import models
|
|
53
|
+
from kubernetes.client import V1Affinity
|
|
54
|
+
from kubernetes.client import V1AWSElasticBlockStoreVolumeSource
|
|
55
|
+
from kubernetes.client import V1Capabilities
|
|
56
|
+
from kubernetes.client import V1ConfigMap
|
|
57
|
+
from kubernetes.client import V1Container
|
|
58
|
+
from kubernetes.client import V1ContainerPort
|
|
59
|
+
from kubernetes.client import V1ContainerStatus
|
|
60
|
+
from kubernetes.client import V1ControllerRevision
|
|
61
|
+
from kubernetes.client import V1CustomResourceDefinition
|
|
62
|
+
from kubernetes.client import V1CustomResourceDefinitionList
|
|
63
|
+
from kubernetes.client import V1DeleteOptions
|
|
64
|
+
from kubernetes.client import V1Deployment
|
|
65
|
+
from kubernetes.client import V1DeploymentSpec
|
|
66
|
+
from kubernetes.client import V1DeploymentStrategy
|
|
67
|
+
from kubernetes.client import V1EnvVar
|
|
68
|
+
from kubernetes.client import V1EnvVarSource
|
|
69
|
+
from kubernetes.client import V1ExecAction
|
|
70
|
+
from kubernetes.client import V1HostPathVolumeSource
|
|
71
|
+
from kubernetes.client import V1HTTPGetAction
|
|
72
|
+
from kubernetes.client import V1Job
|
|
73
|
+
from kubernetes.client import V1JobSpec
|
|
74
|
+
from kubernetes.client import V1KeyToPath
|
|
75
|
+
from kubernetes.client import V1LabelSelector
|
|
76
|
+
from kubernetes.client import V1Lifecycle
|
|
77
|
+
from kubernetes.client import V1LifecycleHandler
|
|
78
|
+
from kubernetes.client import V1LimitRange
|
|
79
|
+
from kubernetes.client import V1LimitRangeItem
|
|
80
|
+
from kubernetes.client import V1LimitRangeSpec
|
|
81
|
+
from kubernetes.client import V1Namespace
|
|
82
|
+
from kubernetes.client import V1Node
|
|
83
|
+
from kubernetes.client import V1NodeAffinity
|
|
84
|
+
from kubernetes.client import V1NodeSelector
|
|
85
|
+
from kubernetes.client import V1NodeSelectorRequirement
|
|
86
|
+
from kubernetes.client import V1NodeSelectorTerm
|
|
87
|
+
from kubernetes.client import V1ObjectFieldSelector
|
|
88
|
+
from kubernetes.client import V1ObjectMeta
|
|
89
|
+
from kubernetes.client import V1PersistentVolumeClaim
|
|
90
|
+
from kubernetes.client import V1PersistentVolumeClaimSpec
|
|
91
|
+
from kubernetes.client import V1Pod
|
|
92
|
+
from kubernetes.client import V1PodAffinityTerm
|
|
93
|
+
from kubernetes.client import V1PodAntiAffinity
|
|
94
|
+
from kubernetes.client import V1PodCondition
|
|
95
|
+
from kubernetes.client import V1PodDisruptionBudget
|
|
96
|
+
from kubernetes.client import V1PodDisruptionBudgetSpec
|
|
97
|
+
from kubernetes.client import V1PodSecurityContext
|
|
98
|
+
from kubernetes.client import V1PodSpec
|
|
99
|
+
from kubernetes.client import V1PodTemplateSpec
|
|
100
|
+
from kubernetes.client import V1PreferredSchedulingTerm
|
|
101
|
+
from kubernetes.client import V1Probe
|
|
102
|
+
from kubernetes.client import V1ProjectedVolumeSource
|
|
103
|
+
from kubernetes.client import V1ReplicaSet
|
|
104
|
+
from kubernetes.client import V1ResourceRequirements
|
|
105
|
+
from kubernetes.client import V1RoleBinding
|
|
106
|
+
from kubernetes.client import V1RoleRef
|
|
107
|
+
from kubernetes.client import V1RollingUpdateDeployment
|
|
108
|
+
from kubernetes.client import V1Secret
|
|
109
|
+
from kubernetes.client import V1SecretKeySelector
|
|
110
|
+
from kubernetes.client import V1SecretVolumeSource
|
|
111
|
+
from kubernetes.client import V1SecurityContext
|
|
112
|
+
from kubernetes.client import V1ServiceAccount
|
|
113
|
+
from kubernetes.client import V1ServiceAccountTokenProjection
|
|
114
|
+
from kubernetes.client import V1StatefulSet
|
|
115
|
+
from kubernetes.client import V1StatefulSetSpec
|
|
116
|
+
from kubernetes.client import V1Subject
|
|
117
|
+
from kubernetes.client import V1TCPSocketAction
|
|
118
|
+
from kubernetes.client import V1TopologySpreadConstraint
|
|
119
|
+
from kubernetes.client import V1Volume
|
|
120
|
+
from kubernetes.client import V1VolumeMount
|
|
121
|
+
from kubernetes.client import V1VolumeProjection
|
|
122
|
+
from kubernetes.client import V1WeightedPodAffinityTerm
|
|
123
|
+
from kubernetes.client import V2CrossVersionObjectReference
|
|
124
|
+
from kubernetes.client import V2HorizontalPodAutoscaler
|
|
125
|
+
from kubernetes.client import V2HorizontalPodAutoscalerCondition
|
|
126
|
+
from kubernetes.client import V2HorizontalPodAutoscalerSpec
|
|
127
|
+
from kubernetes.client import V2MetricIdentifier
|
|
128
|
+
from kubernetes.client import V2MetricSpec
|
|
129
|
+
from kubernetes.client import V2MetricTarget
|
|
130
|
+
from kubernetes.client import V2ObjectMetricSource
|
|
131
|
+
from kubernetes.client import V2ResourceMetricSource
|
|
132
|
+
from kubernetes.client.models import V2HorizontalPodAutoscalerStatus
|
|
133
|
+
from kubernetes.client.rest import ApiException
|
|
134
|
+
from mypy_extensions import TypedDict
|
|
135
|
+
from service_configuration_lib import read_soa_metadata
|
|
136
|
+
|
|
137
|
+
from paasta_tools import __version__
|
|
138
|
+
from paasta_tools.async_utils import async_timeout
|
|
139
|
+
from paasta_tools.autoscaling.utils import AutoscalingParamsDict
|
|
140
|
+
from paasta_tools.autoscaling.utils import MetricsProviderDict
|
|
141
|
+
from paasta_tools.long_running_service_tools import host_passes_blacklist
|
|
142
|
+
from paasta_tools.long_running_service_tools import host_passes_whitelist
|
|
143
|
+
from paasta_tools.long_running_service_tools import InvalidHealthcheckMode
|
|
144
|
+
from paasta_tools.long_running_service_tools import load_service_namespace_config
|
|
145
|
+
from paasta_tools.long_running_service_tools import LongRunningServiceConfig
|
|
146
|
+
from paasta_tools.long_running_service_tools import LongRunningServiceConfigDict
|
|
147
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_ACTIVE_REQUESTS
|
|
148
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_CPU
|
|
149
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_GUNICORN
|
|
150
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
|
|
151
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
|
|
152
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
|
|
153
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
|
|
154
|
+
from paasta_tools.long_running_service_tools import ServiceNamespaceConfig
|
|
155
|
+
from paasta_tools.secret_tools import get_secret_name_from_ref
|
|
156
|
+
from paasta_tools.secret_tools import is_secret_ref
|
|
157
|
+
from paasta_tools.secret_tools import is_shared_secret
|
|
158
|
+
from paasta_tools.secret_tools import SHARED_SECRET_SERVICE
|
|
159
|
+
from paasta_tools.utils import AwsEbsVolume
|
|
160
|
+
from paasta_tools.utils import BranchDictV2
|
|
161
|
+
from paasta_tools.utils import CAPS_DROP
|
|
162
|
+
from paasta_tools.utils import decompose_job_id
|
|
163
|
+
from paasta_tools.utils import deep_merge_dictionaries
|
|
164
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
165
|
+
from paasta_tools.utils import DeployBlacklist
|
|
166
|
+
from paasta_tools.utils import DeploymentVersion
|
|
167
|
+
from paasta_tools.utils import DeployWhitelist
|
|
168
|
+
from paasta_tools.utils import DockerVolume
|
|
169
|
+
from paasta_tools.utils import get_config_hash
|
|
170
|
+
from paasta_tools.utils import get_git_sha_from_dockerurl
|
|
171
|
+
from paasta_tools.utils import KubeContainerResourceRequest
|
|
172
|
+
from paasta_tools.utils import load_service_instance_config
|
|
173
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
174
|
+
from paasta_tools.utils import load_v2_deployments_json
|
|
175
|
+
from paasta_tools.utils import PaastaColors
|
|
176
|
+
from paasta_tools.utils import PaastaNotConfiguredError
|
|
177
|
+
from paasta_tools.utils import PersistentVolume
|
|
178
|
+
from paasta_tools.utils import ProjectedSAVolume
|
|
179
|
+
from paasta_tools.utils import SecretVolume
|
|
180
|
+
from paasta_tools.utils import SystemPaastaConfig
|
|
181
|
+
from paasta_tools.utils import time_cache
|
|
182
|
+
from paasta_tools.utils import TopologySpreadConstraintDict
|
|
183
|
+
from paasta_tools.utils import VolumeWithMode
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
log = logging.getLogger(__name__)
|
|
187
|
+
|
|
188
|
+
KUBE_CONFIG_PATH = "/etc/kubernetes/admin.conf"
|
|
189
|
+
KUBE_CONFIG_USER_PATH = "/etc/kubernetes/paasta.conf"
|
|
190
|
+
YELP_ATTRIBUTE_PREFIX = "yelp.com/"
|
|
191
|
+
PAASTA_ATTRIBUTE_PREFIX = "paasta.yelp.com/"
|
|
192
|
+
KUBE_DEPLOY_STATEGY_MAP = {
|
|
193
|
+
"crossover": "RollingUpdate",
|
|
194
|
+
"downthenup": "Recreate",
|
|
195
|
+
"brutal": "RollingUpdate",
|
|
196
|
+
}
|
|
197
|
+
HACHECK_POD_NAME = "hacheck"
|
|
198
|
+
GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter"
|
|
199
|
+
SIDECAR_CONTAINER_NAMES = [
|
|
200
|
+
HACHECK_POD_NAME,
|
|
201
|
+
GUNICORN_EXPORTER_POD_NAME,
|
|
202
|
+
]
|
|
203
|
+
KUBERNETES_NAMESPACE = "paasta"
|
|
204
|
+
PAASTA_WORKLOAD_OWNER = "compute_infra_platform_experience"
|
|
205
|
+
MAX_EVENTS_TO_RETRIEVE = 200
|
|
206
|
+
DISCOVERY_ATTRIBUTES = {
|
|
207
|
+
"region",
|
|
208
|
+
"superregion",
|
|
209
|
+
"ecosystem",
|
|
210
|
+
"habitat",
|
|
211
|
+
"pool",
|
|
212
|
+
"hostname",
|
|
213
|
+
"owner",
|
|
214
|
+
}
|
|
215
|
+
ZONE_LABELS = (
|
|
216
|
+
"topology.kubernetes.io/zone",
|
|
217
|
+
"yelp.com/habitat",
|
|
218
|
+
"yelp.com/eni_config",
|
|
219
|
+
"karpenter.sh/nodepool",
|
|
220
|
+
"topology.ebs.csi.aws.com/zone",
|
|
221
|
+
)
|
|
222
|
+
JOB_TYPE_LABEL_NAME = "job_type"
|
|
223
|
+
|
|
224
|
+
GPU_RESOURCE_NAME = "nvidia.com/gpu"
|
|
225
|
+
DEFAULT_STORAGE_CLASS_NAME = "ebs"
|
|
226
|
+
|
|
227
|
+
DEFAULT_SIDECAR_REQUEST: KubeContainerResourceRequest = {
|
|
228
|
+
"cpu": 0.1,
|
|
229
|
+
"memory": "1024Mi",
|
|
230
|
+
"ephemeral-storage": "256Mi",
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
DEFAULT_PROJECTED_SA_EXPIRATION_SECONDS = 3600
|
|
234
|
+
PROJECTED_SA_TOKEN_PATH = "token"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# conditions is None when creating a new HPA, but the client raises an error in that case.
|
|
238
|
+
# For detail, https://github.com/kubernetes-client/python/issues/553
|
|
239
|
+
# This hack should be removed when the issue got fixed.
|
|
240
|
+
# This is no better way to work around rn.
|
|
241
|
+
class MonkeyPatchAutoScalingConditions(V2HorizontalPodAutoscalerStatus):
|
|
242
|
+
@property
|
|
243
|
+
def conditions(self) -> Sequence[V2HorizontalPodAutoscalerCondition]:
|
|
244
|
+
return super().conditions()
|
|
245
|
+
|
|
246
|
+
@conditions.setter
|
|
247
|
+
def conditions(
|
|
248
|
+
self, conditions: Optional[Sequence[V2HorizontalPodAutoscalerCondition]]
|
|
249
|
+
) -> None:
|
|
250
|
+
self._conditions = list() if conditions is None else conditions
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
models.V2HorizontalPodAutoscalerStatus = MonkeyPatchAutoScalingConditions
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class KubeKind(NamedTuple):
|
|
257
|
+
singular: str
|
|
258
|
+
plural: str
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class KubeDeployment(NamedTuple):
|
|
262
|
+
service: str
|
|
263
|
+
instance: str
|
|
264
|
+
git_sha: str
|
|
265
|
+
image_version: Optional[str]
|
|
266
|
+
config_sha: str
|
|
267
|
+
namespace: str
|
|
268
|
+
replicas: Optional[int]
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class KubeCustomResource(NamedTuple):
|
|
272
|
+
service: str
|
|
273
|
+
instance: str
|
|
274
|
+
config_sha: str
|
|
275
|
+
git_sha: str
|
|
276
|
+
kind: str
|
|
277
|
+
namespace: str
|
|
278
|
+
name: str
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class KubeContainerResources(NamedTuple):
|
|
282
|
+
cpus: float
|
|
283
|
+
mem: float # mb
|
|
284
|
+
disk: float # mb
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class KubernetesServiceRegistration(NamedTuple):
|
|
288
|
+
name: str
|
|
289
|
+
instance: str
|
|
290
|
+
port: int
|
|
291
|
+
pod_ip: str
|
|
292
|
+
registrations: Sequence[str]
|
|
293
|
+
weight: int
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class CustomResourceDefinition(NamedTuple):
|
|
297
|
+
file_prefix: str
|
|
298
|
+
version: str
|
|
299
|
+
kube_kind: KubeKind
|
|
300
|
+
group: str
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class KubeLifecycleDict(TypedDict, total=False):
|
|
304
|
+
termination_grace_period_seconds: int
|
|
305
|
+
pre_stop_command: Union[str, List[str]]
|
|
306
|
+
pre_stop_drain_seconds: int
|
|
307
|
+
pre_stop_wait_for_connections_to_complete: bool
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class KubeAffinityCondition(TypedDict, total=False):
|
|
311
|
+
service: str
|
|
312
|
+
instance: str
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
class KubeWeightedAffinityCondition(KubeAffinityCondition):
|
|
316
|
+
weight: int
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class DatastoreCredentialsConfig(TypedDict, total=False):
|
|
320
|
+
mysql: List[str]
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _set_disrupted_pods(self: Any, disrupted_pods: Mapping[str, datetime]) -> None:
|
|
324
|
+
"""Private function used to patch the setter for V1PodDisruptionBudgetStatus.
|
|
325
|
+
Can be removed once https://github.com/kubernetes-client/python/issues/466 is resolved
|
|
326
|
+
"""
|
|
327
|
+
self._disrupted_pods = disrupted_pods
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
SidecarResourceRequirements = TypedDict(
|
|
331
|
+
"SidecarResourceRequirements",
|
|
332
|
+
{
|
|
333
|
+
"requests": KubeContainerResourceRequest,
|
|
334
|
+
"limits": KubeContainerResourceRequest,
|
|
335
|
+
},
|
|
336
|
+
total=False,
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
KubePodAnnotations = TypedDict(
|
|
341
|
+
"KubePodAnnotations",
|
|
342
|
+
{
|
|
343
|
+
"autoscaling": str,
|
|
344
|
+
"iam.amazonaws.com/role": str,
|
|
345
|
+
"paasta.yelp.com/prometheus_path": str,
|
|
346
|
+
"paasta.yelp.com/prometheus_port": str,
|
|
347
|
+
"paasta.yelp.com/routable_ip": str,
|
|
348
|
+
"smartstack_registrations": str,
|
|
349
|
+
},
|
|
350
|
+
total=False,
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
KubePodLabels = TypedDict(
|
|
354
|
+
"KubePodLabels",
|
|
355
|
+
{
|
|
356
|
+
# NOTE: we can't use the paasta_prefixed() helper here
|
|
357
|
+
# since mypy expects TypedDict keys to be string literals
|
|
358
|
+
"paasta.yelp.com/deploy_group": str,
|
|
359
|
+
"paasta.yelp.com/git_sha": str,
|
|
360
|
+
"paasta.yelp.com/image_version": str,
|
|
361
|
+
"paasta.yelp.com/instance": str,
|
|
362
|
+
"paasta.yelp.com/prometheus_shard": str,
|
|
363
|
+
"paasta.yelp.com/scrape_piscina_prometheus": str,
|
|
364
|
+
"paasta.yelp.com/scrape_gunicorn_prometheus": str,
|
|
365
|
+
"paasta.yelp.com/service": str,
|
|
366
|
+
"paasta.yelp.com/autoscaled": str,
|
|
367
|
+
"yelp.com/paasta_git_sha": str,
|
|
368
|
+
"yelp.com/paasta_instance": str,
|
|
369
|
+
"yelp.com/paasta_service": str,
|
|
370
|
+
"sidecar.istio.io/inject": str,
|
|
371
|
+
"paasta.yelp.com/cluster": str,
|
|
372
|
+
"paasta.yelp.com/pool": str,
|
|
373
|
+
"paasta.yelp.com/weight": str,
|
|
374
|
+
"yelp.com/owner": str,
|
|
375
|
+
"paasta.yelp.com/managed": str,
|
|
376
|
+
"elbv2.k8s.aws/pod-readiness-gate-inject": str,
|
|
377
|
+
},
|
|
378
|
+
total=False,
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class CryptoKeyConfig(TypedDict):
|
|
383
|
+
encrypt: List[str]
|
|
384
|
+
decrypt: List[str]
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
class NodeSelectorInNotIn(TypedDict):
|
|
388
|
+
operator: Literal["In", "NotIn"]
|
|
389
|
+
values: List[str]
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
class NodeSelectorExistsDoesNotExist(TypedDict):
|
|
393
|
+
operator: Literal["Exists", "DoesNotExist"]
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class NodeSelectorGtLt(TypedDict):
|
|
397
|
+
operator: Literal["Gt", "Lt"]
|
|
398
|
+
value: int
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
NodeSelectorOperator = Union[
|
|
402
|
+
NodeSelectorInNotIn,
|
|
403
|
+
NodeSelectorExistsDoesNotExist,
|
|
404
|
+
NodeSelectorGtLt,
|
|
405
|
+
]
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
NodeSelectorConfig = Union[
|
|
409
|
+
str,
|
|
410
|
+
List[str],
|
|
411
|
+
List[NodeSelectorOperator],
|
|
412
|
+
]
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class NodeSelectorsPreferredConfigDict(TypedDict):
|
|
416
|
+
weight: int
|
|
417
|
+
preferences: Dict[str, NodeSelectorConfig]
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
class KubernetesDeploymentConfigDict(LongRunningServiceConfigDict, total=False):
|
|
421
|
+
bounce_method: str
|
|
422
|
+
bounce_health_params: Dict[str, Any]
|
|
423
|
+
service_account_name: str
|
|
424
|
+
node_selectors: Dict[str, NodeSelectorConfig]
|
|
425
|
+
node_selectors_preferred: List[NodeSelectorsPreferredConfigDict]
|
|
426
|
+
sidecar_resource_requirements: Dict[str, SidecarResourceRequirements]
|
|
427
|
+
lifecycle: KubeLifecycleDict
|
|
428
|
+
anti_affinity: Union[KubeAffinityCondition, List[KubeAffinityCondition]]
|
|
429
|
+
anti_affinity_preferred: Union[
|
|
430
|
+
KubeWeightedAffinityCondition, List[KubeWeightedAffinityCondition]
|
|
431
|
+
]
|
|
432
|
+
prometheus_shard: str
|
|
433
|
+
prometheus_path: str
|
|
434
|
+
prometheus_port: int
|
|
435
|
+
routable_ip: bool
|
|
436
|
+
pod_management_policy: str
|
|
437
|
+
is_istio_sidecar_injection_enabled: bool
|
|
438
|
+
boto_keys: List[str]
|
|
439
|
+
crypto_keys: CryptoKeyConfig
|
|
440
|
+
datastore_credentials: DatastoreCredentialsConfig
|
|
441
|
+
topology_spread_constraints: List[TopologySpreadConstraintDict]
|
|
442
|
+
enable_aws_lb_readiness_gate: bool
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def load_kubernetes_service_config_no_cache(
|
|
446
|
+
service: str,
|
|
447
|
+
instance: str,
|
|
448
|
+
cluster: str,
|
|
449
|
+
load_deployments: bool = True,
|
|
450
|
+
soa_dir: str = DEFAULT_SOA_DIR,
|
|
451
|
+
) -> "KubernetesDeploymentConfig":
|
|
452
|
+
"""Read a service instance's configuration for kubernetes.
|
|
453
|
+
|
|
454
|
+
If a branch isn't specified for a config, the 'branch' key defaults to
|
|
455
|
+
paasta-${cluster}.${instance}.
|
|
456
|
+
|
|
457
|
+
:param name: The service name
|
|
458
|
+
:param instance: The instance of the service to retrieve
|
|
459
|
+
:param cluster: The cluster to read the configuration for
|
|
460
|
+
:param load_deployments: A boolean indicating if the corresponding deployments.json for this service
|
|
461
|
+
should also be loaded
|
|
462
|
+
:param soa_dir: The SOA configuration directory to read from
|
|
463
|
+
:returns: A dictionary of whatever was in the config for the service instance"""
|
|
464
|
+
general_config = service_configuration_lib.read_service_configuration(
|
|
465
|
+
service, soa_dir=soa_dir
|
|
466
|
+
)
|
|
467
|
+
instance_config = load_service_instance_config(
|
|
468
|
+
service, instance, "kubernetes", cluster, soa_dir=soa_dir
|
|
469
|
+
)
|
|
470
|
+
general_config = deep_merge_dictionaries(
|
|
471
|
+
overrides=instance_config, defaults=general_config
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
branch_dict: Optional[BranchDictV2] = None
|
|
475
|
+
if load_deployments:
|
|
476
|
+
deployments_json = load_v2_deployments_json(service, soa_dir=soa_dir)
|
|
477
|
+
temp_instance_config = KubernetesDeploymentConfig(
|
|
478
|
+
service=service,
|
|
479
|
+
cluster=cluster,
|
|
480
|
+
instance=instance,
|
|
481
|
+
config_dict=general_config,
|
|
482
|
+
branch_dict=None,
|
|
483
|
+
soa_dir=soa_dir,
|
|
484
|
+
)
|
|
485
|
+
branch = temp_instance_config.get_branch()
|
|
486
|
+
deploy_group = temp_instance_config.get_deploy_group()
|
|
487
|
+
branch_dict = deployments_json.get_branch_dict(service, branch, deploy_group)
|
|
488
|
+
|
|
489
|
+
return KubernetesDeploymentConfig(
|
|
490
|
+
service=service,
|
|
491
|
+
cluster=cluster,
|
|
492
|
+
instance=instance,
|
|
493
|
+
config_dict=general_config,
|
|
494
|
+
branch_dict=branch_dict,
|
|
495
|
+
soa_dir=soa_dir,
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
@time_cache(ttl=5)
|
|
500
|
+
def load_kubernetes_service_config(
|
|
501
|
+
service: str,
|
|
502
|
+
instance: str,
|
|
503
|
+
cluster: str,
|
|
504
|
+
load_deployments: bool = True,
|
|
505
|
+
soa_dir: str = DEFAULT_SOA_DIR,
|
|
506
|
+
) -> "KubernetesDeploymentConfig":
|
|
507
|
+
"""Read a service instance's configuration for kubernetes.
|
|
508
|
+
|
|
509
|
+
If a branch isn't specified for a config, the 'branch' key defaults to
|
|
510
|
+
paasta-${cluster}.${instance}.
|
|
511
|
+
|
|
512
|
+
:param name: The service name
|
|
513
|
+
:param instance: The instance of the service to retrieve
|
|
514
|
+
:param cluster: The cluster to read the configuration for
|
|
515
|
+
:param load_deployments: A boolean indicating if the corresponding deployments.json for this service
|
|
516
|
+
should also be loaded
|
|
517
|
+
:param soa_dir: The SOA configuration directory to read from
|
|
518
|
+
:returns: A dictionary of whatever was in the config for the service instance"""
|
|
519
|
+
return load_kubernetes_service_config_no_cache(
|
|
520
|
+
service=service,
|
|
521
|
+
instance=instance,
|
|
522
|
+
cluster=cluster,
|
|
523
|
+
load_deployments=load_deployments,
|
|
524
|
+
soa_dir=soa_dir,
|
|
525
|
+
)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def limit_size_with_hash(name: str, limit: int = 63, suffix: int = 4) -> str:
|
|
529
|
+
"""Returns `name` unchanged if it's length does not exceed the `limit`.
|
|
530
|
+
Otherwise, returns truncated `name` with it's hash of size `suffix`
|
|
531
|
+
appended.
|
|
532
|
+
|
|
533
|
+
base32 encoding is chosen as it satisfies the common requirement in
|
|
534
|
+
various k8s names to be alphanumeric.
|
|
535
|
+
"""
|
|
536
|
+
if len(name) > limit:
|
|
537
|
+
digest = hashlib.md5(name.encode()).digest()
|
|
538
|
+
hashed = base64.b32encode(digest).decode().replace("=", "").lower()
|
|
539
|
+
return f"{name[:(limit-suffix-1)]}-{hashed[:suffix]}"
|
|
540
|
+
else:
|
|
541
|
+
return name
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def get_vault_key_secret_name(vault_key: str) -> str:
|
|
545
|
+
"""
|
|
546
|
+
Vault path may contain `/` slashes which is invalid as secret name
|
|
547
|
+
V1Secret's data key must match regexp [a-zA-Z0-9._-],
|
|
548
|
+
which is enforced with schema https://github.com/Yelp/paasta/blob/master/paasta_tools/cli/schemas/adhoc_schema.json#L80
|
|
549
|
+
Source: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Secret.md
|
|
550
|
+
"""
|
|
551
|
+
return vault_key.replace("/", "-")
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
class InvalidKubernetesConfig(Exception):
|
|
555
|
+
def __init__(self, exception: Exception, service: str, instance: str) -> None:
|
|
556
|
+
super().__init__(
|
|
557
|
+
f"Couldn't generate config for kubernetes service: {service}.{instance}: {exception}"
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
class KubeClient:
|
|
562
|
+
@functools.lru_cache() # type: ignore
|
|
563
|
+
def __new__(
|
|
564
|
+
cls,
|
|
565
|
+
component: Optional[str] = None,
|
|
566
|
+
config_file: Optional[str] = None,
|
|
567
|
+
context: Optional[str] = None,
|
|
568
|
+
) -> "KubeClient":
|
|
569
|
+
"""By @lru_cache'ing this function, repeated instantiations of KubeClient with the same arguments will return the
|
|
570
|
+
exact same object. This makes it possible to effectively cache function calls that take a KubeClient as an
|
|
571
|
+
argument."""
|
|
572
|
+
return super().__new__(cls)
|
|
573
|
+
|
|
574
|
+
@functools.lru_cache() # type: ignore
|
|
575
|
+
def __init__(
|
|
576
|
+
self,
|
|
577
|
+
component: Optional[str] = None,
|
|
578
|
+
config_file: Optional[str] = None,
|
|
579
|
+
context: Optional[str] = None,
|
|
580
|
+
) -> None:
|
|
581
|
+
if not config_file:
|
|
582
|
+
config_file = os.environ.get("KUBECONFIG", KUBE_CONFIG_PATH)
|
|
583
|
+
if not context:
|
|
584
|
+
context = os.environ.get("KUBECONTEXT")
|
|
585
|
+
kube_config.load_kube_config(
|
|
586
|
+
config_file=config_file,
|
|
587
|
+
context=context,
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
models.V1PodDisruptionBudgetStatus.disrupted_pods = property(
|
|
591
|
+
fget=lambda *args, **kwargs: models.V1PodDisruptionBudgetStatus.disrupted_pods(
|
|
592
|
+
*args, **kwargs
|
|
593
|
+
),
|
|
594
|
+
fset=_set_disrupted_pods,
|
|
595
|
+
)
|
|
596
|
+
if not component:
|
|
597
|
+
# If we don't get an explicit component set via constructor,
|
|
598
|
+
# try to find it by looking back in the stack, and getting `__file__` from
|
|
599
|
+
# the context calling this constructor
|
|
600
|
+
# Normally, `__module__` would make more sense, but since we have a lot of
|
|
601
|
+
# single scripts we directly call, that would be set to `__main__` most of the time.
|
|
602
|
+
current = currentframe()
|
|
603
|
+
parent = current.f_back
|
|
604
|
+
component = parent.f_globals.get("__file__", "unknown")
|
|
605
|
+
|
|
606
|
+
self.api_client = kube_client.ApiClient()
|
|
607
|
+
self.api_client.user_agent = f"paasta/{component}/v{__version__}"
|
|
608
|
+
|
|
609
|
+
self.deployments = kube_client.AppsV1Api(self.api_client)
|
|
610
|
+
self.core = kube_client.CoreV1Api(self.api_client)
|
|
611
|
+
self.policy = kube_client.PolicyV1Api(self.api_client)
|
|
612
|
+
self.apiextensions = kube_client.ApiextensionsV1Api(self.api_client)
|
|
613
|
+
self.batches = kube_client.BatchV1Api(self.api_client)
|
|
614
|
+
|
|
615
|
+
self.custom = kube_client.CustomObjectsApi(self.api_client)
|
|
616
|
+
self.autoscaling = kube_client.AutoscalingV2Api(self.api_client)
|
|
617
|
+
self.rbac = kube_client.RbacAuthorizationV1Api(self.api_client)
|
|
618
|
+
|
|
619
|
+
self.request = self.api_client.request
|
|
620
|
+
# This function is used by the k8s client to serialize OpenAPI objects
|
|
621
|
+
# into JSON before posting to the api. The JSON output can be used
|
|
622
|
+
# in place of OpenAPI objects in client function calls. This allows us
|
|
623
|
+
# to monkey-patch the JSON data with configs the api supports, but the
|
|
624
|
+
# Python client lib may not yet.
|
|
625
|
+
self.jsonify = self.api_client.sanitize_for_serialization
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def allowlist_denylist_to_requirements(
|
|
629
|
+
allowlist: DeployWhitelist, denylist: DeployBlacklist
|
|
630
|
+
) -> List[Tuple[str, str, List[str]]]:
|
|
631
|
+
"""Converts deploy_whitelist and deploy_blacklist to a list of
|
|
632
|
+
requirements, which can be converted to node affinities.
|
|
633
|
+
"""
|
|
634
|
+
requirements = []
|
|
635
|
+
# convert whitelist into a node selector req
|
|
636
|
+
if allowlist:
|
|
637
|
+
location_type, alloweds = allowlist
|
|
638
|
+
requirements.append((to_node_label(location_type), "In", alloweds))
|
|
639
|
+
# convert blacklist into multiple node selector reqs
|
|
640
|
+
if denylist:
|
|
641
|
+
# not going to prune for duplicates, or group blacklist items for
|
|
642
|
+
# same location_type. makes testing easier and k8s can handle it.
|
|
643
|
+
for location_type, not_allowed in denylist:
|
|
644
|
+
requirements.append((to_node_label(location_type), "NotIn", [not_allowed]))
|
|
645
|
+
return requirements
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def raw_selectors_to_requirements(
|
|
649
|
+
raw_selectors: Mapping[str, NodeSelectorConfig]
|
|
650
|
+
) -> List[Tuple[str, str, List[str]]]:
|
|
651
|
+
"""Converts certain node_selectors into requirements, which can be
|
|
652
|
+
converted to node affinities.
|
|
653
|
+
"""
|
|
654
|
+
requirements: List[Tuple[str, str, List[str]]] = []
|
|
655
|
+
|
|
656
|
+
for label, configs in raw_selectors.items():
|
|
657
|
+
operator_configs: List[NodeSelectorOperator] = []
|
|
658
|
+
|
|
659
|
+
if type(configs) is not list or len(configs) == 0:
|
|
660
|
+
continue
|
|
661
|
+
elif type(configs[0]) is str:
|
|
662
|
+
# specifying an array/list of strings for a label is shorthand
|
|
663
|
+
# for the "In" operator
|
|
664
|
+
operator_configs = [
|
|
665
|
+
NodeSelectorInNotIn(
|
|
666
|
+
{"operator": "In", "values": cast(List[str], configs)}
|
|
667
|
+
)
|
|
668
|
+
]
|
|
669
|
+
else:
|
|
670
|
+
# configs should already be a List[NodeSelectorOperator]
|
|
671
|
+
operator_configs = cast(List[NodeSelectorOperator], configs)
|
|
672
|
+
|
|
673
|
+
label = to_node_label(label)
|
|
674
|
+
for config in operator_configs:
|
|
675
|
+
if config["operator"] in {"In", "NotIn"}:
|
|
676
|
+
config = cast(NodeSelectorInNotIn, config)
|
|
677
|
+
values = config["values"]
|
|
678
|
+
elif config["operator"] in {"Exists", "DoesNotExist"}:
|
|
679
|
+
config = cast(NodeSelectorExistsDoesNotExist, config)
|
|
680
|
+
values = []
|
|
681
|
+
elif config["operator"] in {"Gt", "Lt"}:
|
|
682
|
+
config = cast(NodeSelectorGtLt, config)
|
|
683
|
+
# config["value"] is validated by jsonschema to be an int. but,
|
|
684
|
+
# k8s expects singleton list of the int represented as a str
|
|
685
|
+
# for these operators.
|
|
686
|
+
values = [str(config["value"])]
|
|
687
|
+
else:
|
|
688
|
+
raise ValueError(
|
|
689
|
+
f"Unknown k8s node affinity operator: {config['operator']}"
|
|
690
|
+
)
|
|
691
|
+
requirements.append((label, config["operator"], values))
|
|
692
|
+
|
|
693
|
+
return requirements
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def registration_label(namespace: str) -> str:
|
|
697
|
+
"""Returns namespace prefixed with registrations.{paasta_prefix}/, with
|
|
698
|
+
name part of label key truncated to 63 characters with hash as suffix
|
|
699
|
+
if needed.
|
|
700
|
+
"""
|
|
701
|
+
limited_namespace = limit_size_with_hash(namespace, limit=63, suffix=4)
|
|
702
|
+
return f"registrations.{PAASTA_ATTRIBUTE_PREFIX}{limited_namespace}"
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def contains_zone_label(node_selectors: Dict[str, NodeSelectorConfig]) -> bool:
|
|
706
|
+
return any(k in node_selectors for k in ZONE_LABELS)
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
class KubernetesDeploymentConfig(LongRunningServiceConfig):
|
|
710
|
+
config_dict: KubernetesDeploymentConfigDict
|
|
711
|
+
|
|
712
|
+
config_filename_prefix = "kubernetes"
|
|
713
|
+
|
|
714
|
+
def __init__(
|
|
715
|
+
self,
|
|
716
|
+
service: str,
|
|
717
|
+
cluster: str,
|
|
718
|
+
instance: str,
|
|
719
|
+
config_dict: KubernetesDeploymentConfigDict,
|
|
720
|
+
branch_dict: Optional[BranchDictV2],
|
|
721
|
+
soa_dir: str = DEFAULT_SOA_DIR,
|
|
722
|
+
) -> None:
|
|
723
|
+
super().__init__(
|
|
724
|
+
cluster=cluster,
|
|
725
|
+
instance=instance,
|
|
726
|
+
service=service,
|
|
727
|
+
config_dict=config_dict,
|
|
728
|
+
branch_dict=branch_dict,
|
|
729
|
+
soa_dir=soa_dir,
|
|
730
|
+
)
|
|
731
|
+
|
|
732
|
+
def copy(self) -> "KubernetesDeploymentConfig":
|
|
733
|
+
return self.__class__(
|
|
734
|
+
service=self.service,
|
|
735
|
+
instance=self.instance,
|
|
736
|
+
cluster=self.cluster,
|
|
737
|
+
config_dict=self.config_dict.copy(),
|
|
738
|
+
branch_dict=(
|
|
739
|
+
self.branch_dict.copy() if self.branch_dict is not None else None
|
|
740
|
+
),
|
|
741
|
+
soa_dir=self.soa_dir,
|
|
742
|
+
)
|
|
743
|
+
|
|
744
|
+
def get_kubernetes_namespace(self) -> str:
|
|
745
|
+
return self.get_namespace()
|
|
746
|
+
|
|
747
|
+
def get_cmd(self) -> Optional[List[str]]:
|
|
748
|
+
cmd = super(LongRunningServiceConfig, self).get_cmd()
|
|
749
|
+
if cmd:
|
|
750
|
+
if isinstance(cmd, str):
|
|
751
|
+
return ["sh", "-c", cmd]
|
|
752
|
+
elif isinstance(cmd, list):
|
|
753
|
+
return cmd
|
|
754
|
+
else:
|
|
755
|
+
raise ValueError("cmd should be str or list")
|
|
756
|
+
else:
|
|
757
|
+
return None
|
|
758
|
+
|
|
759
|
+
def get_bounce_method(self) -> str:
|
|
760
|
+
"""Get the bounce method specified in the service's kubernetes configuration."""
|
|
761
|
+
# map existing bounce methods to k8s equivalents.
|
|
762
|
+
# but if there's an EBS volume we must downthenup to free up the volume.
|
|
763
|
+
# in the future we may support stateful sets to dynamically create the volumes
|
|
764
|
+
bounce_method = self.config_dict.get("bounce_method", "crossover")
|
|
765
|
+
if self.get_aws_ebs_volumes() and not bounce_method == "downthenup":
|
|
766
|
+
raise Exception(
|
|
767
|
+
"If service instance defines an EBS volume it must use a downthenup bounce_method"
|
|
768
|
+
)
|
|
769
|
+
return bounce_method
|
|
770
|
+
|
|
771
|
+
# TODO: move the default scaling policy to system paasta configs
|
|
772
|
+
def get_autoscaling_scaling_policy(
|
|
773
|
+
self,
|
|
774
|
+
max_replicas: int,
|
|
775
|
+
autoscaling_params: AutoscalingParamsDict,
|
|
776
|
+
) -> Dict:
|
|
777
|
+
"""Returns the k8s HPA scaling policy in raw JSON. Requires k8s v1.18
|
|
778
|
+
to work.
|
|
779
|
+
"""
|
|
780
|
+
# The HPA scaling algorithm is as follows. Every sync period (default:
|
|
781
|
+
# 15 seconds), the HPA will:
|
|
782
|
+
# 1. determine what the desired capacity is from metrics
|
|
783
|
+
# 2. apply min/max replica scaling limits
|
|
784
|
+
# 3. rate-limit the scaling magnitude (e.g. scale down by no more than
|
|
785
|
+
# 30% of current replicas)
|
|
786
|
+
# 4. constrain the scaling magnitude by the period seconds (e.g. scale
|
|
787
|
+
# down by no more than 30% of current replicas per 60 seconds)
|
|
788
|
+
# 5. record the desired capacity, then pick the highest capacity from
|
|
789
|
+
# the stabilization window (default: last 300 seconds) as the final
|
|
790
|
+
# desired capacity.
|
|
791
|
+
# - the idea is to stabilize scaling against (heavily) fluctuating
|
|
792
|
+
# metrics
|
|
793
|
+
policy = {
|
|
794
|
+
"scaleDown": {
|
|
795
|
+
"stabilizationWindowSeconds": 300,
|
|
796
|
+
# the policy in a human-readable way: scale down every 60s by
|
|
797
|
+
# at most 30% of current replicas.
|
|
798
|
+
"selectPolicy": "Max",
|
|
799
|
+
"policies": [{"type": "Percent", "value": 30, "periodSeconds": 60}],
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
policy["scaleDown"].update(autoscaling_params.get("scaledown_policies", {}))
|
|
803
|
+
return policy
|
|
804
|
+
|
|
+    def namespace_external_metric_name(self, metric_name: str) -> str:
+        return f"{self.get_sanitised_deployment_name()}-{metric_name}"
+
+    def get_autoscaling_provider_spec(
+        self, name: str, namespace: str, provider: MetricsProviderDict
+    ) -> Optional[V2MetricSpec]:
+        target = provider["setpoint"]
+        prometheus_hpa_metric_name = (
+            f"{self.namespace_external_metric_name(provider['type'])}-prom"
+        )
+
+        if provider["type"] == METRICS_PROVIDER_CPU:
+            return V2MetricSpec(
+                type="Resource",
+                resource=V2ResourceMetricSource(
+                    name="cpu",
+                    target=V2MetricTarget(
+                        type="Utilization",
+                        average_utilization=int(target * 100),
+                    ),
+                ),
+            )
+        elif provider["type"] in {
+            METRICS_PROVIDER_UWSGI,
+            METRICS_PROVIDER_PISCINA,
+            METRICS_PROVIDER_GUNICORN,
+            METRICS_PROVIDER_ACTIVE_REQUESTS,
+        }:
+            return V2MetricSpec(
+                type="Object",
+                object=V2ObjectMetricSource(
+                    metric=V2MetricIdentifier(name=prometheus_hpa_metric_name),
+                    described_object=V2CrossVersionObjectReference(
+                        api_version="apps/v1", kind="Deployment", name=name
+                    ),
+                    target=V2MetricTarget(
+                        type="Value",
+                        # we average the number of instances needed to handle the current (or
+                        # averaged) load instead of the load itself as this leads to more
+                        # stable behavior. we return the percentage by which we want to
+                        # scale, so the target in the HPA should always be 1.
+                        # PAASTA-16756 for details
+                        value=1,
+                    ),
+                ),
+            )
+        elif provider["type"] == METRICS_PROVIDER_PROMQL:
+            return V2MetricSpec(
+                type="Object",
+                object=V2ObjectMetricSource(
+                    metric=V2MetricIdentifier(name=prometheus_hpa_metric_name),
+                    described_object=V2CrossVersionObjectReference(
+                        api_version="apps/v1", kind="Deployment", name=name
+                    ),
+                    target=V2MetricTarget(
+                        # Use the setpoint specified by the user.
+                        type="Value",
+                        value=target,
+                    ),
+                ),
+            )
+        elif provider["type"] == METRICS_PROVIDER_UWSGI_V2:
+            return V2MetricSpec(
+                type="Object",
+                object=V2ObjectMetricSource(
+                    metric=V2MetricIdentifier(name=prometheus_hpa_metric_name),
+                    described_object=V2CrossVersionObjectReference(
+                        api_version="apps/v1", kind="Deployment", name=name
+                    ),
+                    target=V2MetricTarget(
+                        type="AverageValue",
+                        average_value=target,
+                    ),
+                ),
+            )
+
+        log.error(
+            f"Unknown metrics_provider specified: {provider['type']} for\
+            {name}/name in namespace{namespace}"
+        )
+        return None
+
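As a reference for the branches above: the CPU provider turns a fractional setpoint into the integer utilization percentage the HPA expects, while the uwsgi/piscina/gunicorn/active-requests providers pin the HPA target to 1 because their Prometheus-side rules already report the ratio of needed to current instances. A small sketch with an assumed setpoint:

```python
# Illustrative only; the setpoint value is assumed.
setpoint = 0.8

# CPU provider: target type "Utilization", expressed as an integer percent.
average_utilization = int(setpoint * 100)  # -> 80

# uwsgi/piscina/gunicorn/active-requests providers: the external metric already
# encodes the scaling ratio, so the Object metric target is the constant 1.
object_metric_target_value = 1
```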
+    def get_autoscaling_metric_spec(
+        self,
+        name: str,
+        cluster: str,
+        kube_client: KubeClient,
+        namespace: str,
+    ) -> Optional[V2HorizontalPodAutoscaler]:
+        # Returns None if an HPA should not be attached based on the config,
+        # or the config is invalid.
+
+        if self.get_desired_state() == "stop":
+            return None
+
+        if not self.is_autoscaling_enabled():
+            return None
+
+        autoscaling_params = self.get_autoscaling_params()
+        if autoscaling_params["metrics_providers"][0]["decision_policy"] == "bespoke":
+            return None
+
+        min_replicas = self.get_min_instances()
+        max_replicas = self.get_max_instances()
+        if min_replicas == 0 or max_replicas == 0:
+            log.error(
+                f"Invalid value for min or max_instances on {name}: {min_replicas}, {max_replicas}"
+            )
+            return None
+
+        metrics = []
+        for provider in autoscaling_params["metrics_providers"]:
+            spec = self.get_autoscaling_provider_spec(name, namespace, provider)
+            if spec is not None:
+                metrics.append(spec)
+        scaling_policy = self.get_autoscaling_scaling_policy(
+            max_replicas,
+            autoscaling_params,
+        )
+
+        labels = {
+            paasta_prefixed("service"): self.service,
+            paasta_prefixed("instance"): self.instance,
+            paasta_prefixed("pool"): self.get_pool(),
+            paasta_prefixed("managed"): "true",
+        }
+
+        hpa = V2HorizontalPodAutoscaler(
+            kind="HorizontalPodAutoscaler",
+            metadata=V1ObjectMeta(
+                name=name, namespace=namespace, annotations=dict(), labels=labels
+            ),
+            spec=V2HorizontalPodAutoscalerSpec(
+                behavior=scaling_policy,
+                max_replicas=max_replicas,
+                min_replicas=min_replicas,
+                metrics=metrics,
+                scale_target_ref=V2CrossVersionObjectReference(
+                    api_version="apps/v1", kind="Deployment", name=name
+                ),
+            ),
+        )
+
+        return hpa
+
+    def get_deployment_strategy_config(self) -> V1DeploymentStrategy:
+        # get soa defined bounce_method
+        bounce_method = self.get_bounce_method()
+        # get k8s equivalent
+        strategy_type = KUBE_DEPLOY_STATEGY_MAP[bounce_method]
+
+        if strategy_type == "RollingUpdate":
+            max_surge = "100%"
+            if bounce_method == "crossover":
+                max_unavailable = "{}%".format(
+                    int((1 - self.get_bounce_margin_factor()) * 100)
+                )
+            elif bounce_method == "brutal":
+                # `brutal` bounce method means a bounce margin factor of 0, do not call get_bounce_margin_factor
+                max_unavailable = "100%"
+            else:
+                raise Exception("Unknown bounce method for RollingUpdate.")
+            rolling_update = V1RollingUpdateDeployment
+
+            # this translates bounce_margin to k8s speak maxUnavailable
+            # for now we keep max_surge 100% but we could customise later
+            rolling_update = V1RollingUpdateDeployment(
+                max_surge=max_surge, max_unavailable=max_unavailable
+            )
+        else:
+            rolling_update = None
+
+        return V1DeploymentStrategy(type=strategy_type, rolling_update=rolling_update)
+
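For the crossover branch above, `maxUnavailable` is just the complement of the bounce margin factor expressed as a percentage. A quick worked example (the margin factor is a hypothetical soa-config value):

```python
# Worked example of the crossover maxUnavailable calculation above (input assumed).
bounce_margin_factor = 0.95
max_unavailable = "{}%".format(int((1 - bounce_margin_factor) * 100))
print(max_unavailable)  # "5%": at most 5% of replicas may be unavailable during the bounce
```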
+    def get_sanitised_volume_name(self, volume_name: str, length_limit: int = 0) -> str:
+        """I know but we really aren't allowed many characters..."""
+        volume_name = volume_name.rstrip("/")
+        sanitised = volume_name.replace("/", "slash-").replace(".", "dot-")
+        sanitised_name = sanitise_kubernetes_name(sanitised)
+        if length_limit and len(sanitised_name) > length_limit:
+            sanitised_name = (
+                sanitised_name[0 : length_limit - 6]
+                + "--"
+                + hashlib.md5(sanitised_name.encode("ascii")).hexdigest()[:4]
+            )
+        return sanitised_name
+
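To make the truncation above concrete, here is a rough sketch of what happens to an over-long volume name; the input path is made up, and `sanitise_kubernetes_name` (defined elsewhere in this module) is treated as a pass-through for simplicity:

```python
# Rough sketch of get_sanitised_volume_name() for a long host path (input is hypothetical).
import hashlib

name = "host--/nail/srv/configs/some/deeply/nested/directory/tree".rstrip("/")
sanitised = name.replace("/", "slash-").replace(".", "dot-")
length_limit = 63
if length_limit and len(sanitised) > length_limit:
    sanitised = (
        sanitised[0 : length_limit - 6]
        + "--"
        + hashlib.md5(sanitised.encode("ascii")).hexdigest()[:4]
    )
# The result is at most 63 characters: a truncated prefix plus a short md5 suffix,
# so distinct long paths still map to distinct (and stable) volume names.
print(len(sanitised))
```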
+    def get_docker_volume_name(self, docker_volume: DockerVolume) -> str:
+        return self.get_sanitised_volume_name(
+            "host--{name}".format(name=docker_volume["hostPath"]), length_limit=63
+        )
+
+    def get_persistent_volume_name(self, docker_volume: PersistentVolume) -> str:
+        return self.get_sanitised_volume_name(
+            "pv--{name}".format(name=docker_volume["container_path"]), length_limit=253
+        )
+
+    def get_aws_ebs_volume_name(self, aws_ebs_volume: AwsEbsVolume) -> str:
+        return self.get_sanitised_volume_name(
+            "aws-ebs--{name}{partition}".format(
+                name=aws_ebs_volume["volume_id"],
+                partition=aws_ebs_volume.get("partition", ""),
+            )
+        )
+
+    def get_secret_volume_name(self, secret_volume: SecretVolume) -> str:
+        return self.get_sanitised_volume_name(
+            "secret--{name}".format(name=secret_volume["secret_name"]), length_limit=63
+        )
+
+    def get_projected_sa_volume_name(
+        self, projected_sa_volume: ProjectedSAVolume
+    ) -> str:
+        return self.get_sanitised_volume_name(
+            "projected-sa--{audience}".format(audience=projected_sa_volume["audience"]),
+            length_limit=63,
+        )
+
+    def get_boto_secret_volume_name(self, service_name: str) -> str:
+        return self.get_sanitised_volume_name(
+            f"secret-boto-key-{service_name}", length_limit=63
+        )
+
+    def get_crypto_secret_volume_name(self, service_name: str) -> str:
+        return self.get_sanitised_volume_name(
+            f"secret-crypto-key-{service_name}", length_limit=63
+        )
+
+    def read_only_mode(self, d: VolumeWithMode) -> bool:
+        return d.get("mode", "RO") == "RO"
+
+    def get_readiness_check_script(
+        self, system_paasta_config: SystemPaastaConfig
+    ) -> List[str]:
+        """Script to check if a service is up in smartstack / envoy"""
+        enable_envoy_check = self.get_enable_envoy_readiness_check(system_paasta_config)
+        enable_nerve_check = self.get_enable_nerve_readiness_check(system_paasta_config)
+        if enable_nerve_check and enable_envoy_check:
+            return system_paasta_config.get_envoy_nerve_readiness_check_script()
+        elif enable_envoy_check:
+            return system_paasta_config.get_envoy_readiness_check_script()
+        else:
+            return system_paasta_config.get_nerve_readiness_check_script()
+
+    def get_sidecar_containers(
+        self,
+        system_paasta_config: SystemPaastaConfig,
+        service_namespace_config: ServiceNamespaceConfig,
+        hacheck_sidecar_volumes: Sequence[DockerVolume],
+    ) -> Sequence[V1Container]:
+        hacheck_container = self.get_hacheck_sidecar_container(
+            system_paasta_config,
+            service_namespace_config,
+            hacheck_sidecar_volumes,
+        )
+        gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container(
+            system_paasta_config
+        )
+
+        sidecars = []
+        if hacheck_container:
+            sidecars.append(hacheck_container)
+        if gunicorn_exporter_container:
+            sidecars.append(gunicorn_exporter_container)
+        return sidecars
+
+    def get_readiness_check_prefix(
+        self,
+        system_paasta_config: SystemPaastaConfig,
+        initial_delay: float,
+        period_seconds: float,
+    ) -> List[str]:
+        return [
+            x.format(initial_delay=initial_delay, period_seconds=period_seconds)
+            for x in system_paasta_config.get_readiness_check_prefix_template()
+        ]
+
+    def get_hacheck_sidecar_container(
+        self,
+        system_paasta_config: SystemPaastaConfig,
+        service_namespace_config: ServiceNamespaceConfig,
+        hacheck_sidecar_volumes: Sequence[DockerVolume],
+    ) -> Optional[V1Container]:
+        registrations = " ".join(self.get_registrations())
+        # s_m_j currently asserts that services are healthy in smartstack before
+        # continuing a bounce. this readiness check lets us achieve the same thing
+        readiness_probe: Optional[V1Probe]
+        if service_namespace_config.is_in_smartstack() and (
+            self.get_enable_nerve_readiness_check(system_paasta_config)
+            or self.get_enable_envoy_readiness_check(system_paasta_config)
+        ):
+            initial_delay = self.get_healthcheck_grace_period_seconds()
+            # COMPINFRA-989, this used to be hardcoded to always be 10 seconds
+            # and to not cause rolling updates on everything at once this is a config option for now
+            if not system_paasta_config.get_hacheck_match_initial_delay():
+                initial_delay = 10
+            period_seconds = 10
+            readiness_probe = V1Probe(
+                _exec=V1ExecAction(
+                    command=self.get_readiness_check_prefix(
+                        system_paasta_config=system_paasta_config,
+                        initial_delay=initial_delay,
+                        period_seconds=period_seconds,
+                    )
+                    + self.get_readiness_check_script(system_paasta_config)
+                    + [str(self.get_container_port())]
+                    + self.get_registrations()
+                ),
+                initial_delay_seconds=initial_delay,
+                period_seconds=period_seconds,
+            )
+        else:
+            readiness_probe = None
+
+        hacheck_registrations_env = V1EnvVar(
+            name="MESH_REGISTRATIONS",
+            value=" ".join(self.get_registrations()),
+        )
+
+        if service_namespace_config.is_in_smartstack():
+            return V1Container(
+                image=system_paasta_config.get_hacheck_sidecar_image_url(),
+                lifecycle=V1Lifecycle(
+                    pre_stop=V1LifecycleHandler(
+                        _exec=V1ExecAction(
+                            command=[
+                                "/bin/sh",
+                                "-c",
+                                f"/usr/bin/hadown {registrations}; sleep {self.get_hacheck_prestop_sleep_seconds()}",
+                            ]
+                        )
+                    )
+                ),
+                resources=self.get_sidecar_resource_requirements(
+                    "hacheck",
+                    system_paasta_config,
+                ),
+                name=HACHECK_POD_NAME,
+                env=self.get_kubernetes_environment() + [hacheck_registrations_env],
+                ports=[V1ContainerPort(container_port=6666)],
+                readiness_probe=readiness_probe,
+                volume_mounts=self.get_volume_mounts(
+                    docker_volumes=hacheck_sidecar_volumes,
+                    aws_ebs_volumes=[],
+                    persistent_volumes=[],
+                    secret_volumes=[],
+                    projected_sa_volumes=[],
+                ),
+            )
+        return None
+
+    def get_gunicorn_exporter_sidecar_container(
+        self,
+        system_paasta_config: SystemPaastaConfig,
+    ) -> Optional[V1Container]:
+
+        if self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
+            return V1Container(
+                image=system_paasta_config.get_gunicorn_exporter_sidecar_image_url(),
+                resources=self.get_sidecar_resource_requirements(
+                    "gunicorn_exporter", system_paasta_config
+                ),
+                name=GUNICORN_EXPORTER_POD_NAME,
+                env=self.get_kubernetes_environment(),
+                ports=[V1ContainerPort(container_port=9117)],
+                lifecycle=V1Lifecycle(
+                    pre_stop=V1LifecycleHandler(
+                        _exec=V1ExecAction(
+                            command=[
+                                "/bin/sh",
+                                "-c",
+                                # we sleep for the same amount of time as we do after an hadown to ensure that we have accurate
+                                # metrics up until our Pod dies
+                                f"sleep {self.get_hacheck_prestop_sleep_seconds()}",
+                            ]
+                        )
+                    )
+                ),
+            )
+
+        return None
+
+    def get_env(
+        self, system_paasta_config: Optional["SystemPaastaConfig"] = None
+    ) -> Dict[str, str]:
+        env = super().get_env(system_paasta_config=system_paasta_config)
+        # see CLIENTOBS-64 and PAASTA-17558
+        # this is deliberately set here to make sure it is only available for
+        # k8s long-running services. putting this in `InstanceConfig.get_env` will
+        # make it available for all workloads, which will cause big bounces and
+        # continuous reconfiguring every time soa-configs is updated unless the
+        # env var is deliberately excluded from config hashing for those workloads
+        # as well.
+        env["PAASTA_SOA_CONFIGS_SHA"] = read_soa_metadata(soa_dir=self.soa_dir).get(
+            "git_sha", ""
+        )
+
+        # We drop PAASTA_CLUSTER here because it will be added via `get_kubernetes_environment()`
+        env.pop("PAASTA_CLUSTER", None)
+
+        return env
+
+    def get_env_vars_that_use_secrets(self) -> Tuple[Dict[str, str], Dict[str, str]]:
+        """Returns two dictionaries of environment variable name->value; the first is vars that use non-shared
+        secrets, and the second is vars that use shared secrets.
+
+        The values of the dictionaries are the secret refs as formatted in yelpsoa-configs, e.g. "SECRET(foo)"
+        or "SHARED_SECRET(bar)". These can be decoded with get_secret_name_from_ref.
+        """
+        secret_env_vars = {}
+        shared_secret_env_vars = {}
+        for k, v in self.get_env().items():
+            if is_secret_ref(v):
+                if is_shared_secret(v):
+                    shared_secret_env_vars[k] = v
+                else:
+                    secret_env_vars[k] = v
+        return secret_env_vars, shared_secret_env_vars
+
+    def get_container_env(self) -> Sequence[V1EnvVar]:
+        secret_env_vars, shared_secret_env_vars = self.get_env_vars_that_use_secrets()
+
+        user_env = [
+            V1EnvVar(name=name, value=value)
+            for name, value in self.get_env().items()
+            if name
+            not in list(secret_env_vars.keys()) + list(shared_secret_env_vars.keys())
+        ]
+        user_env += self.get_kubernetes_secret_env_vars(
+            secret_env_vars=secret_env_vars,
+            shared_secret_env_vars=shared_secret_env_vars,
+        )
+        return user_env + self.get_kubernetes_environment()  # type: ignore
+
+    def get_kubernetes_secret_env_vars(
+        self,
+        secret_env_vars: Mapping[str, str],
+        shared_secret_env_vars: Mapping[str, str],
+    ) -> Sequence[V1EnvVar]:
+        ret = []
+        for k, v in secret_env_vars.items():
+            secret = get_secret_name_from_ref(v)
+            ret.append(
+                V1EnvVar(
+                    name=k,
+                    value_from=V1EnvVarSource(
+                        secret_key_ref=V1SecretKeySelector(
+                            name=get_paasta_secret_name(
+                                self.get_namespace(), self.get_service(), secret
+                            ),
+                            key=secret,
+                            optional=False,
+                        )
+                    ),
+                )
+            )
+        for k, v in shared_secret_env_vars.items():
+            secret = get_secret_name_from_ref(v)
+            ret.append(
+                V1EnvVar(
+                    name=k,
+                    value_from=V1EnvVarSource(
+                        secret_key_ref=V1SecretKeySelector(
+                            name=get_paasta_secret_name(
+                                self.get_namespace(), SHARED_SECRET_SERVICE, secret
+                            ),
+                            key=secret,
+                            optional=False,
+                        )
+                    ),
+                )
+            )
+        return ret
+
+    def get_kubernetes_environment(self) -> List[V1EnvVar]:
+        kubernetes_env = [
+            V1EnvVar(
+                name="PAASTA_POD_IP",
+                value_from=V1EnvVarSource(
+                    field_ref=V1ObjectFieldSelector(field_path="status.podIP")
+                ),
+            ),
+            V1EnvVar(
+                # this is used by some functions of operator-sdk
+                # it uses this environment variable to get the pods
+                name="POD_NAME",
+                value_from=V1EnvVarSource(
+                    field_ref=V1ObjectFieldSelector(field_path="metadata.name")
+                ),
+            ),
+            V1EnvVar(
+                name="PAASTA_HOST",
+                value_from=V1EnvVarSource(
+                    field_ref=V1ObjectFieldSelector(field_path="spec.nodeName")
+                ),
+            ),
+            V1EnvVar(
+                name="PAASTA_CLUSTER",
+                value_from=V1EnvVarSource(
+                    field_ref=V1ObjectFieldSelector(
+                        field_path="metadata.labels['"
+                        + paasta_prefixed("cluster")
+                        + "']"
+                    )
+                ),
+            ),
+        ]
+        return kubernetes_env
+
+    def get_resource_requirements(self) -> V1ResourceRequirements:
+        limits = {
+            "cpu": self.get_cpus() + self.get_cpu_burst_add(),
+            "memory": f"{self.get_mem()}Mi",
+            "ephemeral-storage": f"{self.get_disk()}Mi",
+        }
+        requests = {
+            "cpu": self.get_cpus(),
+            "memory": f"{self.get_mem()}Mi",
+            "ephemeral-storage": f"{self.get_disk()}Mi",
+        }
+        if self.get_gpus():
+            limits[GPU_RESOURCE_NAME] = self.get_gpus()
+            requests[GPU_RESOURCE_NAME] = self.get_gpus()
+        return V1ResourceRequirements(limits=limits, requests=requests)
+
+    def get_sidecar_resource_requirements(
+        self,
+        sidecar_name: str,
+        system_paasta_config: SystemPaastaConfig,
+    ) -> V1ResourceRequirements:
+        """
+        Sidecar request/limits are set with varying levels of priority, with
+        elements further down the list taking precedence:
+        * hard-coded paasta default
+        * SystemPaastaConfig
+        * per-service soaconfig overrides
+
+        Additionally, for the time being we do not expose a way to set
+        limits separately from requests - these values will always mirror
+        each other
+
+        NOTE: changing any of these will cause a bounce of all services that
+        run the sidecars affected by the resource change
+        """
+        config = self.config_dict.get("sidecar_resource_requirements", {}).get(
+            sidecar_name, {}
+        )
+        sidecar_requirements_config = (
+            system_paasta_config.get_sidecar_requirements_config().get(
+                sidecar_name, DEFAULT_SIDECAR_REQUEST
+            )
+        )
+        requests: KubeContainerResourceRequest = {
+            "cpu": sidecar_requirements_config.get("cpu"),
+            "memory": sidecar_requirements_config.get("memory"),
+            "ephemeral-storage": sidecar_requirements_config.get("ephemeral-storage"),
+        }
+        requests.update(config.get("requests", {}))
+
+        limits: KubeContainerResourceRequest = {
+            "cpu": requests["cpu"],
+            "memory": requests["memory"],
+            "ephemeral-storage": requests["ephemeral-storage"],
+        }
+        limits.update(config.get("limits", {}))
+
+        return V1ResourceRequirements(
+            limits=limits,
+            requests=requests,
+        )
+
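The precedence described in the docstring above boils down to two dict updates: start from the system-wide (or hard-coded) defaults, then let the service's own `sidecar_resource_requirements` override requests, with limits mirroring requests unless explicitly overridden. A compact sketch with made-up values:

```python
# Sketch of the sidecar request/limit precedence above (all values are hypothetical).
system_defaults = {"cpu": 0.1, "memory": "1024Mi", "ephemeral-storage": "256Mi"}
soaconfig_override = {"requests": {"memory": "2048Mi"}}  # per-service override

requests = dict(system_defaults)
requests.update(soaconfig_override.get("requests", {}))

limits = dict(requests)  # limits mirror requests...
limits.update(soaconfig_override.get("limits", {}))  # ...unless overridden explicitly

# requests == {"cpu": 0.1, "memory": "2048Mi", "ephemeral-storage": "256Mi"}
# limits   == same as requests here, since no "limits" override was given
```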
+    def get_liveness_probe(
+        self, service_namespace_config: ServiceNamespaceConfig
+    ) -> Optional[V1Probe]:
+        mode = self.get_healthcheck_mode(service_namespace_config)
+        if mode is None:
+            return None
+
+        initial_delay_seconds = self.get_healthcheck_grace_period_seconds()
+        period_seconds = self.get_healthcheck_interval_seconds()
+        timeout_seconds = self.get_healthcheck_timeout_seconds()
+        failure_threshold = self.get_healthcheck_max_consecutive_failures()
+        probe = V1Probe(
+            failure_threshold=failure_threshold,
+            initial_delay_seconds=initial_delay_seconds,
+            period_seconds=period_seconds,
+            timeout_seconds=timeout_seconds,
+        )
+
+        if mode == "http" or mode == "https":
+            path = self.get_healthcheck_uri(service_namespace_config)
+            probe.http_get = V1HTTPGetAction(
+                path=path, port=self.get_container_port(), scheme=mode.upper()
+            )
+        elif mode == "tcp":
+            probe.tcp_socket = V1TCPSocketAction(port=self.get_container_port())
+        elif mode == "cmd":
+            probe._exec = V1ExecAction(
+                command=["/bin/sh", "-c", self.get_healthcheck_cmd()]
+            )
+        else:
+            raise InvalidHealthcheckMode(
+                "Unknown mode: %s. Only acceptable healthcheck modes are http/https/tcp/cmd"
+                % mode
+            )
+
+        return probe
+
+    def get_security_context(self) -> Optional[V1SecurityContext]:
+        cap_add = self.config_dict.get("cap_add", None)
+        context_kwargs = (
+            # passing parameter like this to avoid all services to bounce
+            # when this change is released
+            {"privileged": self.config_dict["privileged"]}
+            if "privileged" in self.config_dict
+            else {}
+        )
+        if cap_add is None:
+            return V1SecurityContext(
+                capabilities=V1Capabilities(drop=CAPS_DROP),
+                **context_kwargs,
+            )
+        else:
+            return V1SecurityContext(
+                # XXX: we should probably generally work in sets, but V1Capabilities is typed as accepting
+                # lists of string only
+                capabilities=V1Capabilities(
+                    add=cap_add,
+                    # NOTE: this is necessary as containerd differs in behavior from dockershim: in dockershim
+                    # dropped capabilities were overriden if the same capability was added - but in containerd
+                    # the dropped capabilities appear to have higher priority.
+                    # WARNING: this must be sorted - otherwise the order of the capabilities will be different
+                    # on every setup_kubernetes_job run and cause unnecessary redeployments
+                    drop=sorted(list(set(CAPS_DROP) - set(cap_add))),
+                ),
+                **context_kwargs,
+            )
+
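The `sorted()` in the dropped-capabilities list above is what keeps the generated spec deterministic: plain set difference has no stable ordering, and a reordered list would change the config hash and trigger pointless redeployments. A tiny illustration (the capability names are examples, not the real `CAPS_DROP` constant):

```python
# Why the drop list above is sorted (capability names here are illustrative).
CAPS_DROP = ["SETPCAP", "MKNOD", "AUDIT_WRITE", "CHOWN", "NET_RAW", "DAC_OVERRIDE"]
cap_add = ["NET_RAW"]

drop = sorted(set(CAPS_DROP) - set(cap_add))
# Always ['AUDIT_WRITE', 'CHOWN', 'DAC_OVERRIDE', 'MKNOD', 'SETPCAP'], run after run,
# so setup_kubernetes_job sees an unchanged security context.
```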
+    def get_kubernetes_containers(
+        self,
+        docker_volumes: Sequence[DockerVolume],
+        hacheck_sidecar_volumes: Sequence[DockerVolume],
+        system_paasta_config: SystemPaastaConfig,
+        aws_ebs_volumes: Sequence[AwsEbsVolume],
+        secret_volumes: Sequence[SecretVolume],
+        service_namespace_config: ServiceNamespaceConfig,
+        include_sidecars: bool = True,
+    ) -> Sequence[V1Container]:
+        ports = [self.get_container_port()]
+        # MONK-1130
+        # The prometheus_port is used for scraping metrics from the main
+        # container in the pod. Prometheus discovers ports using the kubernetes
+        # API and creates scrape targets for all the exported container ports.
+        # A better way of doing this would to export the prometheus port as pod
+        # annotations but this is not currently supported.
+        # https://github.com/prometheus/prometheus/issues/3756
+        prometheus_port = self.get_prometheus_port()
+        if prometheus_port and prometheus_port not in ports:
+            ports.append(prometheus_port)
+
+        service_container = V1Container(
+            image=self.get_docker_url(),
+            command=self.get_cmd(),
+            args=self.get_args(),
+            env=self.get_container_env(),
+            resources=self.get_resource_requirements(),
+            lifecycle=V1Lifecycle(
+                pre_stop=self.get_kubernetes_container_termination_action(
+                    service_namespace_config
+                )
+            ),
+            name=self.get_sanitised_instance_name(),
+            liveness_probe=self.get_liveness_probe(service_namespace_config),
+            readiness_probe=self.get_readiness_probe(service_namespace_config),
+            ports=[V1ContainerPort(container_port=port) for port in ports],
+            security_context=self.get_security_context(),
+            volume_mounts=self.get_volume_mounts(
+                docker_volumes=docker_volumes,
+                aws_ebs_volumes=aws_ebs_volumes,
+                persistent_volumes=self.get_persistent_volumes(),
+                secret_volumes=secret_volumes,
+                projected_sa_volumes=self.get_projected_sa_volumes(),
+            ),
+        )
+        containers = [service_container]
+        if include_sidecars:
+            containers += self.get_sidecar_containers(  # type: ignore
+                system_paasta_config=system_paasta_config,
+                service_namespace_config=service_namespace_config,
+                hacheck_sidecar_volumes=hacheck_sidecar_volumes,
+            )
+        return containers
+
+    def get_readiness_probe(
+        self, service_namespace_config: ServiceNamespaceConfig
+    ) -> Optional[V1Probe]:
+        if service_namespace_config.is_in_smartstack():
+            return None
+        else:
+            return self.get_liveness_probe(service_namespace_config)
+
+    def get_lifecycle_dict(self) -> KubeLifecycleDict:
+        return self.config_dict.get("lifecycle", KubeLifecycleDict({}))
+
+    def get_prestop_sleep_seconds(self, is_in_smartstack: bool) -> int:
+        if is_in_smartstack:
+            default = 30
+        else:
+            default = 0
+        return self.get_lifecycle_dict().get("pre_stop_drain_seconds", default)
+
+    def get_hacheck_prestop_sleep_seconds(self) -> int:
+        """The number of seconds to sleep between hadown and terminating the hacheck container. We want hacheck to be
+        up for slightly longer than the main container is, so we default to pre_stop_drain_seconds + 1.
+
+        It doesn't super matter if hacheck goes down before the main container -- if it's down, healthchecks will fail
+        and the service will be removed from smartstack, which is the same effect we get after running hadown.
+        """
+
+        # Everywhere this value is currently used (hacheck sidecar or gunicorn sidecar), we can pretty safely
+        # assume that the service is in smartstack.
+        return self.get_prestop_sleep_seconds(is_in_smartstack=True) + 1
+
+    def get_pre_stop_wait_for_connections_to_complete(
+        self, service_namespace_config: ServiceNamespaceConfig
+    ) -> bool:
+        return self.get_lifecycle_dict().get(
+            "pre_stop_wait_for_connections_to_complete",
+            service_namespace_config.is_in_smartstack()
+            and service_namespace_config.get_longest_timeout_ms() >= 20000,
+        )
+
+    def get_kubernetes_container_termination_action(
+        self,
+        service_namespace_config: ServiceNamespaceConfig,
+    ) -> V1LifecycleHandler:
+        command = self.get_lifecycle_dict().get("pre_stop_command", [])
+        # default pre stop hook for the container
+        if not command:
+            pre_stop_sleep_seconds = self.get_prestop_sleep_seconds(
+                service_namespace_config.is_in_smartstack()
+            )
+            if self.get_pre_stop_wait_for_connections_to_complete(
+                service_namespace_config
+            ):
+                # This pre-stop command:
+                # 1. Waits for pre_stop_sleep_seconds seconds (to give hadown time to take effect). This avoids a
+                #    potential race condition where step 2 detects no connections in flight and the pod is terminated
+                #    immediately, but because the pod is still listed in Envoy somewhere, it receives a new connection
+                #    just as the pod is terminated.
+                # 2. Every second, checks if there are any established connections to the pod. It exits when there are no
+                #    established connections.
+                # It exits when all connections are closed, which should mean the pod can be safely terminated.
+                # The first four fields of /proc/net/tcp are:
+                # 1. slot number (which is not relevant here, but it's a decimal number left-padded with whitespace)
+                # 2. local address:port (both in hex)
+                # 3. remote address:port (both in hex)
+                # 4. state (in hex)
+                # State 01 means ESTABLISHED.
+                hex_port = hex(self.get_container_port()).upper()[2:]
+                command = [
+                    "/bin/sh",
+                    "-c",
+                    f"sleep {pre_stop_sleep_seconds}; while grep '^ *[0-9]*: ........:{hex_port} ........:.... 01 ' /proc/net/tcp; do sleep 1; echo; done",
+                ]
+            else:
+                command = [
+                    "/bin/sh",
+                    "-c",
+                    f"sleep {pre_stop_sleep_seconds}",
+                ]
+
+        if isinstance(command, str):
+            command = [command]
+        return V1LifecycleHandler(_exec=V1ExecAction(command=command))
+
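The pre-stop hook above detects in-flight connections by grepping /proc/net/tcp for lines whose local port matches the container port rendered as uppercase hex and whose state field is 01 (ESTABLISHED). A short sketch of how that pattern is assembled (the port number is assumed):

```python
# How the /proc/net/tcp pattern above is built; the port is a hypothetical example.
container_port = 8888
hex_port = hex(container_port).upper()[2:]  # '0x22b8' -> '22B8'
pattern = f"^ *[0-9]*: ........:{hex_port} ........:.... 01 "
# The shell loop sleeps one second and re-checks while any line matches, i.e. while
# some local socket on port 8888 is still ESTABLISHED.
print(pattern)
```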
+    def get_pod_volumes(
+        self,
+        docker_volumes: Sequence[DockerVolume],
+        aws_ebs_volumes: Sequence[AwsEbsVolume],
+        secret_volumes: Sequence[SecretVolume],
+        projected_sa_volumes: Sequence[ProjectedSAVolume],
+    ) -> Sequence[V1Volume]:
+        pod_volumes = []
+        unique_docker_volumes = {
+            self.get_docker_volume_name(docker_volume): docker_volume
+            for docker_volume in docker_volumes
+        }
+        for name, docker_volume in unique_docker_volumes.items():
+            pod_volumes.append(
+                V1Volume(
+                    host_path=V1HostPathVolumeSource(path=docker_volume["hostPath"]),
+                    name=name,
+                )
+            )
+        unique_aws_ebs_volumes = {
+            self.get_aws_ebs_volume_name(aws_ebs_volume): aws_ebs_volume
+            for aws_ebs_volume in aws_ebs_volumes
+        }
+        for name, aws_ebs_volume in unique_aws_ebs_volumes.items():
+            pod_volumes.append(
+                V1Volume(
+                    aws_elastic_block_store=V1AWSElasticBlockStoreVolumeSource(
+                        volume_id=aws_ebs_volume["volume_id"],
+                        fs_type=aws_ebs_volume.get("fs_type"),
+                        partition=aws_ebs_volume.get("partition"),
+                        # k8s wants RW volume even if it's later mounted RO
+                        read_only=False,
+                    ),
+                    name=name,
+                )
+            )
+        for secret_volume in secret_volumes:
+            if "items" in secret_volume:
+                items = [
+                    V1KeyToPath(
+                        key=item["key"],
+                        mode=mode_to_int(item.get("mode")),
+                        path=item["path"],
+                    )
+                    for item in secret_volume["items"]
+                ]
+            else:
+                items = None
+            pod_volumes.append(
+                V1Volume(
+                    name=self.get_secret_volume_name(secret_volume),
+                    secret=V1SecretVolumeSource(
+                        secret_name=get_paasta_secret_name(
+                            self.get_namespace(),
+                            self.get_service(),
+                            secret_volume["secret_name"],
+                        ),
+                        default_mode=mode_to_int(secret_volume.get("default_mode")),
+                        items=items,
+                        optional=False,
+                    ),
+                )
+            )
+        for projected_volume in projected_sa_volumes:
+            pod_volumes.append(
+                V1Volume(
+                    name=self.get_projected_sa_volume_name(projected_volume),
+                    projected=V1ProjectedVolumeSource(
+                        sources=[
+                            V1VolumeProjection(
+                                service_account_token=V1ServiceAccountTokenProjection(
+                                    audience=projected_volume["audience"],
+                                    expiration_seconds=projected_volume.get(
+                                        "expiration_seconds",
+                                        DEFAULT_PROJECTED_SA_EXPIRATION_SECONDS,
+                                    ),
+                                    path=PROJECTED_SA_TOKEN_PATH,
+                                )
+                            )
+                        ],
+                    ),
+                ),
+            )
+
+        boto_volume = self.get_boto_volume()
+        if boto_volume:
+            pod_volumes.append(boto_volume)
+
+        crypto_volume = self.get_crypto_volume()
+        if crypto_volume:
+            pod_volumes.append(crypto_volume)
+
+        datastore_credentials_secrets_volume = (
+            self.get_datastore_credentials_secrets_volume()
+        )
+        if datastore_credentials_secrets_volume:
+            pod_volumes.append(datastore_credentials_secrets_volume)
+
+        return pod_volumes
+
+    def get_datastore_credentials(self) -> DatastoreCredentialsConfig:
+        datastore_credentials = self.config_dict.get("datastore_credentials", {})
+        return datastore_credentials
+
+    def get_datastore_credentials_secret_name(self) -> str:
+        return _get_secret_name(
+            self.get_namespace(),
+            "datastore-credentials",
+            self.get_service(),
+            self.get_instance(),
+        )
+
+    def get_datastore_secret_volume_name(self) -> str:
+        """
+        Volume names must abide to DNS mappings of 63 chars or less, so we limit it here and replace _ with --.
+        """
+        return self.get_sanitised_volume_name(
+            f"secret-datastore-creds-{self.get_sanitised_deployment_name()}",
+            length_limit=63,
+        )
+
+    def get_datastore_credentials_secrets_volume(self) -> V1Volume:
+        """
+        All credentials are stored in 1 Kubernetes Secret, which are mapped on an item->path
+        structure to /datastore/<datastore>/<credential>/<password file>.
+        """
+        datastore_credentials = self.get_datastore_credentials()
+        if not datastore_credentials:
+            return None
+
+        # Assume k8s secret exists if its configmap signature exists
+        secret_hash = self.get_datastore_credentials_secret_hash()
+        if not secret_hash:
+            log.warning(
+                f"Expected to find datastore_credentials secret signature {self.get_datastore_credentials_secret_name()} for {self.get_service()}.{self.get_instance()} on {self.get_namespace()}"
+            )
+            return None
+
+        secrets_with_custom_mountpaths = []
+
+        for datastore, credentials in datastore_credentials.items():
+            # mypy loses type hints on '.items' and throws false positives. unfortunately have to type: ignore
+            # https://github.com/python/mypy/issues/7178
+            for credential in credentials:  # type: ignore
+                secrets_with_custom_mountpaths.append(
+                    {
+                        "key": get_vault_key_secret_name(
+                            f"secrets/datastore/{datastore}/{credential}"
+                        ),
+                        "mode": mode_to_int("0444"),
+                        "path": f"{datastore}/{credential}/credentials",
+                    }
+                )
+
+        return V1Volume(
+            name=self.get_datastore_secret_volume_name(),
+            secret=V1SecretVolumeSource(
+                secret_name=self.get_datastore_credentials_secret_name(),
+                default_mode=mode_to_int("0444"),
+                items=secrets_with_custom_mountpaths,
+                optional=False,
+            ),
+        )
+
+    def get_boto_volume(self) -> Optional[V1Volume]:
+        required_boto_keys = self.config_dict.get("boto_keys", [])
+        service_name = self.get_sanitised_deployment_name()
+        if not required_boto_keys:
+            return None
+        items = []
+        for boto_key in required_boto_keys:
+            for filetype in ["sh", "yaml", "cfg", "json"]:
+                this_key = boto_key + "." + filetype
+                secret_name = this_key.replace(".", "-").replace("_", "--")
+                item = V1KeyToPath(
+                    key=secret_name,
+                    mode=mode_to_int("0444"),
+                    path=this_key,
+                )
+                items.append(item)
+        # Assume k8s secret exists if its configmap signature exists
+        secret_hash = self.get_boto_secret_hash()
+        if not secret_hash:
+            log.warning(
+                f"Expected to find boto_cfg secret signature {self.get_boto_secret_signature_name()} for {self.get_service()}.{self.get_instance()} on {self.get_namespace()}"
+            )
+            return None
+
+        volume = V1Volume(
+            name=self.get_boto_secret_volume_name(service_name),
+            secret=V1SecretVolumeSource(
+                secret_name=self.get_boto_secret_name(),
+                default_mode=mode_to_int("0444"),
+                items=items,
+            ),
+        )
+        return volume
+
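For each entry in `boto_keys`, the loop above expects four files per key and derives the Kubernetes secret data key from the filename by rewriting dots and underscores. A small sketch of that expansion (the key name is made up):

```python
# Sketch of the boto_keys expansion above; the key name is hypothetical.
boto_key = "my_service_batch"
for filetype in ["sh", "yaml", "cfg", "json"]:
    this_key = boto_key + "." + filetype                        # e.g. "my_service_batch.sh"
    secret_key = this_key.replace(".", "-").replace("_", "--")  # -> "my--service--batch-sh"
    # this_key becomes the mounted file name under /etc/boto_cfg, while secret_key is
    # the data key looked up in the synced Kubernetes secret.
    print(secret_key, "->", this_key)
```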
1779
|
+
def get_crypto_keys_from_config(self) -> List[str]:
|
|
1780
|
+
crypto_keys = self.config_dict.get("crypto_keys", {})
|
|
1781
|
+
return [
|
|
1782
|
+
*(f"public/{key}" for key in crypto_keys.get("encrypt", [])),
|
|
1783
|
+
*(f"private/{key}" for key in crypto_keys.get("decrypt", [])),
|
|
1784
|
+
]
|
|
1785
|
+
|
|
1786
|
+
def get_crypto_volume(self) -> Optional[V1Volume]:
|
|
1787
|
+
required_crypto_keys = self.get_crypto_keys_from_config()
|
|
1788
|
+
if not required_crypto_keys:
|
|
1789
|
+
return None
|
|
1790
|
+
|
|
1791
|
+
if not self.get_crypto_secret_hash():
|
|
1792
|
+
log.warning(
|
|
1793
|
+
f"Expected to find crypto_keys secret signature {self.get_crypto_secret_name()} {self.get_boto_secret_signature_name()} for {self.get_service()}.{self.get_instance()} on {self.get_namespace()}"
|
|
1794
|
+
)
|
|
1795
|
+
return None
|
|
1796
|
+
|
|
1797
|
+
return V1Volume(
|
|
1798
|
+
name=self.get_crypto_secret_volume_name(
|
|
1799
|
+
self.get_sanitised_deployment_name()
|
|
1800
|
+
),
|
|
1801
|
+
secret=V1SecretVolumeSource(
|
|
1802
|
+
secret_name=self.get_crypto_secret_name(),
|
|
1803
|
+
default_mode=mode_to_int("0444"),
|
|
1804
|
+
items=[
|
|
1805
|
+
V1KeyToPath(
|
|
1806
|
+
# key should exist in data section of k8s secret
|
|
1807
|
+
key=get_vault_key_secret_name(crypto_key),
|
|
1808
|
+
# path is equivalent to Vault key directory structure
|
|
1809
|
+
# e.g. private/foo will create /etc/crypto_keys/private/foo.json
|
|
1810
|
+
path=f"{crypto_key}.json",
|
|
1811
|
+
mode=mode_to_int("0444"),
|
|
1812
|
+
)
|
|
1813
|
+
for crypto_key in required_crypto_keys
|
|
1814
|
+
],
|
|
1815
|
+
optional=True,
|
|
1816
|
+
),
|
|
1817
|
+
)
|
|
1818
|
+
|
|
1819
|
+
def get_volume_mounts(
|
|
1820
|
+
self,
|
|
1821
|
+
docker_volumes: Sequence[DockerVolume],
|
|
1822
|
+
aws_ebs_volumes: Sequence[AwsEbsVolume],
|
|
1823
|
+
persistent_volumes: Sequence[PersistentVolume],
|
|
1824
|
+
secret_volumes: Sequence[SecretVolume],
|
|
1825
|
+
projected_sa_volumes: Sequence[ProjectedSAVolume],
|
|
1826
|
+
) -> Sequence[V1VolumeMount]:
|
|
1827
|
+
volume_mounts = (
|
|
1828
|
+
[
|
|
1829
|
+
V1VolumeMount(
|
|
1830
|
+
mount_path=docker_volume["containerPath"],
|
|
1831
|
+
name=self.get_docker_volume_name(docker_volume),
|
|
1832
|
+
read_only=self.read_only_mode(docker_volume),
|
|
1833
|
+
)
|
|
1834
|
+
for docker_volume in docker_volumes
|
|
1835
|
+
]
|
|
1836
|
+
+ [
|
|
1837
|
+
V1VolumeMount(
|
|
1838
|
+
mount_path=aws_ebs_volume["container_path"],
|
|
1839
|
+
name=self.get_aws_ebs_volume_name(aws_ebs_volume),
|
|
1840
|
+
read_only=self.read_only_mode(aws_ebs_volume),
|
|
1841
|
+
)
|
|
1842
|
+
for aws_ebs_volume in aws_ebs_volumes
|
|
1843
|
+
]
|
|
1844
|
+
+ [
|
|
1845
|
+
V1VolumeMount(
|
|
1846
|
+
mount_path=volume["container_path"],
|
|
1847
|
+
name=self.get_persistent_volume_name(volume),
|
|
1848
|
+
read_only=self.read_only_mode(volume),
|
|
1849
|
+
)
|
|
1850
|
+
for volume in persistent_volumes
|
|
1851
|
+
]
|
|
1852
|
+
+ [
|
|
1853
|
+
V1VolumeMount(
|
|
1854
|
+
mount_path=volume["container_path"],
|
|
1855
|
+
name=self.get_secret_volume_name(volume),
|
|
1856
|
+
read_only=True,
|
|
1857
|
+
)
|
|
1858
|
+
for volume in secret_volumes
|
|
1859
|
+
]
|
|
1860
|
+
+ [
|
|
1861
|
+
V1VolumeMount(
|
|
1862
|
+
mount_path=volume["container_path"],
|
|
1863
|
+
name=self.get_projected_sa_volume_name(volume),
|
|
1864
|
+
read_only=True,
|
|
1865
|
+
)
|
|
1866
|
+
for volume in projected_sa_volumes
|
|
1867
|
+
]
|
|
1868
|
+
)
|
|
1869
|
+
if self.config_dict.get("boto_keys", []):
|
|
1870
|
+
secret_hash = self.get_boto_secret_hash()
|
|
1871
|
+
service_name = self.get_sanitised_deployment_name()
|
|
1872
|
+
if secret_hash:
|
|
1873
|
+
mount = V1VolumeMount(
|
|
1874
|
+
mount_path="/etc/boto_cfg",
|
|
1875
|
+
name=self.get_boto_secret_volume_name(service_name),
|
|
1876
|
+
read_only=True,
|
|
1877
|
+
)
|
|
1878
|
+
for existing_mount in volume_mounts:
|
|
1879
|
+
if existing_mount.mount_path == "/etc/boto_cfg":
|
|
1880
|
+
volume_mounts.remove(existing_mount)
|
|
1881
|
+
break
|
|
1882
|
+
volume_mounts.append(mount)
|
|
1883
|
+
|
|
1884
|
+
if self.config_dict.get("crypto_keys", []):
|
|
1885
|
+
if self.get_crypto_secret_hash():
|
|
1886
|
+
mount = V1VolumeMount(
|
|
1887
|
+
mount_path="/etc/crypto_keys",
|
|
1888
|
+
name=self.get_crypto_secret_volume_name(
|
|
1889
|
+
self.get_sanitised_deployment_name()
|
|
1890
|
+
),
|
|
1891
|
+
read_only=True,
|
|
1892
|
+
)
|
|
1893
|
+
for existing_mount in volume_mounts:
|
|
1894
|
+
if existing_mount.mount_path == "/etc/crypto_keys":
|
|
1895
|
+
volume_mounts.remove(existing_mount)
|
|
1896
|
+
break
|
|
1897
|
+
volume_mounts.append(mount)
|
|
1898
|
+
|
|
1899
|
+
datastore_credentials = self.get_datastore_credentials()
|
|
1900
|
+
if datastore_credentials:
|
|
1901
|
+
if self.get_datastore_credentials_secret_hash():
|
|
1902
|
+
volume_mounts.append(
|
|
1903
|
+
V1VolumeMount(
|
|
1904
|
+
mount_path=f"/datastore",
|
|
1905
|
+
name=self.get_datastore_secret_volume_name(),
|
|
1906
|
+
read_only=True,
|
|
1907
|
+
)
|
|
1908
|
+
)
|
|
1909
|
+
|
|
1910
|
+
return volume_mounts
|
|
1911
|
+
|
|
1912
|
+
def get_boto_secret_name(self) -> str:
|
|
1913
|
+
"""
|
|
1914
|
+
Namespace is ignored so that there are no bounces with existing boto_keys secrets
|
|
1915
|
+
"""
|
|
1916
|
+
return limit_size_with_hash(
|
|
1917
|
+
f"paasta-boto-key-{self.get_sanitised_deployment_name()}"
|
|
1918
|
+
)
|
|
1919
|
+
|
|
1920
|
+
def get_crypto_secret_name(self) -> str:
|
|
1921
|
+
return _get_secret_name(
|
|
1922
|
+
self.get_namespace(), "crypto-key", self.get_service(), self.get_instance()
|
|
1923
|
+
)
|
|
1924
|
+
|
|
1925
|
+
def get_boto_secret_signature_name(self) -> str:
|
|
1926
|
+
"""
|
|
1927
|
+
Keep the following signature naming convention so that bounces do not happen because boto_keys configmap signatures already exist, see PAASTA-17910
|
|
1928
|
+
|
|
1929
|
+
Note: Since hashing is done only on a portion of secret, it may explode if service or instance names are too long
|
|
1930
|
+
"""
|
|
1931
|
+
secret_instance = limit_size_with_hash(
|
|
1932
|
+
f"paasta-boto-key-{self.get_sanitised_deployment_name()}"
|
|
1933
|
+
)
|
|
1934
|
+
return f"{self.get_namespace()}-secret-{self.get_sanitised_service_name()}-{secret_instance}-signature"
|
|
1935
|
+
|
|
1936
|
+
def get_crypto_secret_signature_name(self) -> str:
|
|
1937
|
+
return _get_secret_signature_name(
|
|
1938
|
+
self.get_namespace(), "crypto-key", self.get_service(), self.get_instance()
|
|
1939
|
+
)
|
|
1940
|
+
|
|
1941
|
+
def get_datastore_credentials_signature_name(self) -> str:
|
|
1942
|
+
"""
|
|
1943
|
+
All datastore credentials are stored in a single Kubernetes secret, so they share a name
|
|
1944
|
+
"""
|
|
1945
|
+
return _get_secret_signature_name(
|
|
1946
|
+
self.get_namespace(),
|
|
1947
|
+
"datastore-credentials",
|
|
1948
|
+
self.get_service(),
|
|
1949
|
+
# key is on instances, which get their own configurations
|
|
1950
|
+
key_name=self.get_instance(),
|
|
1951
|
+
)
|
|
1952
|
+
|
|
1953
|
+
def get_boto_secret_hash(self) -> Optional[str]:
|
|
1954
|
+
return get_secret_signature(
|
|
1955
|
+
kube_client=KubeClient(),
|
|
1956
|
+
signature_name=self.get_boto_secret_signature_name(),
|
|
1957
|
+
namespace=self.get_namespace(),
|
|
1958
|
+
)
|
|
1959
|
+
|
|
1960
|
+
def get_crypto_secret_hash(self) -> Optional[str]:
|
|
1961
|
+
return get_secret_signature(
|
|
1962
|
+
kube_client=KubeClient(),
|
|
1963
|
+
signature_name=self.get_crypto_secret_signature_name(),
|
|
1964
|
+
namespace=self.get_namespace(),
|
|
1965
|
+
)
|
|
1966
|
+
|
|
1967
|
+
def get_datastore_credentials_secret_hash(self) -> Optional[str]:
|
|
1968
|
+
return get_secret_signature(
|
|
1969
|
+
kube_client=KubeClient(),
|
|
1970
|
+
signature_name=self.get_datastore_credentials_signature_name(),
|
|
1971
|
+
namespace=self.get_namespace(),
|
|
1972
|
+
)
|
|
1973
|
+
|
|
1974
|
+
def get_sanitised_service_name(self) -> str:
|
|
1975
|
+
return sanitise_kubernetes_name(self.get_service())
|
|
1976
|
+
|
|
1977
|
+
def get_sanitised_instance_name(self) -> str:
|
|
1978
|
+
return sanitise_kubernetes_name(self.get_instance())
|
|
1979
|
+
|
|
1980
|
+
def get_autoscaled_instances(self) -> Optional[int]:
|
|
1981
|
+
try:
|
|
1982
|
+
if self.get_persistent_volumes():
|
|
1983
|
+
return (
|
|
1984
|
+
KubeClient()
|
|
1985
|
+
.deployments.read_namespaced_stateful_set(
|
|
1986
|
+
name=self.get_sanitised_deployment_name(),
|
|
1987
|
+
namespace=self.get_namespace(),
|
|
1988
|
+
)
|
|
1989
|
+
.spec.replicas
|
|
1990
|
+
)
|
|
1991
|
+
else:
|
|
1992
|
+
return (
|
|
1993
|
+
KubeClient()
|
|
1994
|
+
.deployments.read_namespaced_deployment(
|
|
1995
|
+
name=self.get_sanitised_deployment_name(),
|
|
1996
|
+
namespace=self.get_namespace(),
|
|
1997
|
+
)
|
|
1998
|
+
.spec.replicas
|
|
1999
|
+
)
|
|
2000
|
+
except ApiException as e:
|
|
2001
|
+
log.error(e)
|
|
2002
|
+
log.debug(
|
|
2003
|
+
"Error occured when trying to connect to Kubernetes API, \
|
|
2004
|
+
returning max_instances (%d)"
|
|
2005
|
+
% self.get_max_instances()
|
|
2006
|
+
)
|
|
2007
|
+
return None
|
|
2008
|
+
|
|
2009
|
+
def get_min_instances(self) -> Optional[int]:
|
|
2010
|
+
return self.config_dict.get(
|
|
2011
|
+
"min_instances",
|
|
2012
|
+
1,
|
|
2013
|
+
)
|
|
2014
|
+
|
|
2015
|
+
def get_max_instances(self) -> Optional[int]:
|
|
2016
|
+
return self.config_dict.get(
|
|
2017
|
+
"max_instances",
|
|
2018
|
+
None,
|
|
2019
|
+
)
|
|
2020
|
+
|
|
2021
|
+
def set_autoscaled_instances(
|
|
2022
|
+
self, instance_count: int, kube_client: KubeClient
|
|
2023
|
+
) -> None:
|
|
2024
|
+
"""Set the number of instances in the same way that the autoscaler does."""
|
|
2025
|
+
set_instances_for_kubernetes_service(
|
|
2026
|
+
kube_client=kube_client, service_config=self, instance_count=instance_count
|
|
2027
|
+
)
|
|
2028
|
+
|
|
2029
|
+
def get_desired_instances(self) -> int:
|
|
2030
|
+
"""For now if we have an EBS instance it means we can only have 1 instance
|
|
2031
|
+
since we can't attach to multiple instances. In the future we might support
|
|
2032
|
+
statefulsets which are clever enough to manage EBS for you"""
|
|
2033
|
+
instances = super().get_desired_instances()
|
|
2034
|
+
if self.get_aws_ebs_volumes() and instances not in [1, 0]:
|
|
2035
|
+
raise Exception(
|
|
2036
|
+
"Number of instances must be 1 or 0 if an EBS volume is defined."
|
|
2037
|
+
)
|
|
2038
|
+
return instances
|
|
2039
|
+
|
|
2040
|
+
def get_volume_claim_templates(self) -> Sequence[V1PersistentVolumeClaim]:
|
|
2041
|
+
return [
|
|
2042
|
+
V1PersistentVolumeClaim(
|
|
2043
|
+
metadata=V1ObjectMeta(name=self.get_persistent_volume_name(volume)),
|
|
2044
|
+
spec=V1PersistentVolumeClaimSpec(
|
|
2045
|
+
# must be ReadWriteOnce for EBS
|
|
2046
|
+
access_modes=["ReadWriteOnce"],
|
|
2047
|
+
storage_class_name=self.get_storage_class_name(volume),
|
|
2048
|
+
resources=V1ResourceRequirements(
|
|
2049
|
+
requests={"storage": f"{volume['size']}Gi"}
|
|
2050
|
+
),
|
|
2051
|
+
),
|
|
2052
|
+
)
|
|
2053
|
+
for volume in self.get_persistent_volumes()
|
|
2054
|
+
]
|
|
2055
|
+
|
|
2056
|
+
def get_storage_class_name(self, volume: PersistentVolume) -> str:
|
|
2057
|
+
try:
|
|
2058
|
+
system_paasta_config = load_system_paasta_config()
|
|
2059
|
+
supported_storage_classes = (
|
|
2060
|
+
system_paasta_config.get_supported_storage_classes()
|
|
2061
|
+
)
|
|
2062
|
+
except PaastaNotConfiguredError:
|
|
2063
|
+
log.warning("No PaaSTA configuration was found, returning default value")
|
|
2064
|
+
supported_storage_classes = []
|
|
2065
|
+
storage_class_name = volume.get("storage_class_name", "ebs")
|
|
2066
|
+
if storage_class_name not in supported_storage_classes:
|
|
2067
|
+
log.warning(f"storage class {storage_class_name} is not supported")
|
|
2068
|
+
storage_class_name = DEFAULT_STORAGE_CLASS_NAME
|
|
2069
|
+
return storage_class_name
|
|
2070
|
+
|
|
2071
|
+
def get_kubernetes_metadata(self, git_sha: str) -> V1ObjectMeta:
|
|
2072
|
+
return V1ObjectMeta(
|
|
2073
|
+
name=self.get_sanitised_deployment_name(),
|
|
2074
|
+
namespace=self.get_namespace(),
|
|
2075
|
+
labels={
|
|
2076
|
+
"yelp.com/owner": PAASTA_WORKLOAD_OWNER,
|
|
2077
|
+
"yelp.com/paasta_service": self.get_service(),
|
|
2078
|
+
"yelp.com/paasta_instance": self.get_instance(),
|
|
2079
|
+
"yelp.com/paasta_git_sha": git_sha,
|
|
2080
|
+
paasta_prefixed("service"): self.get_service(),
|
|
2081
|
+
paasta_prefixed("instance"): self.get_instance(),
|
|
2082
|
+
paasta_prefixed("git_sha"): git_sha,
|
|
2083
|
+
paasta_prefixed("cluster"): self.cluster,
|
|
2084
|
+
paasta_prefixed("autoscaled"): str(
|
|
2085
|
+
self.is_autoscaling_enabled()
|
|
2086
|
+
).lower(),
|
|
2087
|
+
paasta_prefixed("paasta.yelp.com/pool"): self.get_pool(),
|
|
2088
|
+
paasta_prefixed("managed"): "true",
|
|
2089
|
+
},
|
|
2090
|
+
)
|
|
2091
|
+
|
|
2092
|
+
def get_sanitised_deployment_name(self) -> str:
|
|
2093
|
+
return get_kubernetes_app_name(self.get_service(), self.get_instance())
|
|
2094
|
+
|
|
2095
|
+
def get_min_task_uptime(self) -> int:
|
|
2096
|
+
return self.config_dict.get("bounce_health_params", {}).get(
|
|
2097
|
+
"min_task_uptime", 0
|
|
2098
|
+
)
|
|
2099
|
+
|
|
2100
|
+
def get_enable_nerve_readiness_check(
|
|
2101
|
+
self, system_paasta_config: SystemPaastaConfig
|
|
2102
|
+
) -> bool:
|
|
2103
|
+
"""Enables a k8s readiness check on the Pod to ensure that all registrations
|
|
2104
|
+
are UP on the local synapse haproxy"""
|
|
2105
|
+
return self.config_dict.get("bounce_health_params", {}).get(
|
|
2106
|
+
"check_haproxy", system_paasta_config.get_enable_nerve_readiness_check()
|
|
2107
|
+
)
|
|
2108
|
+
|
|
2109
|
+
def get_enable_envoy_readiness_check(
|
|
2110
|
+
self, system_paasta_config: SystemPaastaConfig
|
|
2111
|
+
) -> bool:
|
|
2112
|
+
"""Enables a k8s readiness check on the Pod to ensure that all registrations
|
|
2113
|
+
are UP on the local Envoy"""
|
|
2114
|
+
return self.config_dict.get("bounce_health_params", {}).get(
|
|
2115
|
+
"check_envoy", system_paasta_config.get_enable_envoy_readiness_check()
|
|
2116
|
+
)
|
|
2117
|
+
|
|
2118
|
+
def get_namespace(self) -> str:
|
|
2119
|
+
"""Get namespace from config, default to 'paasta'"""
|
|
2120
|
+
return self.config_dict.get(
|
|
2121
|
+
"namespace", f"paastasvc-{self.get_sanitised_service_name()}"
|
|
2122
|
+
)
|
|
2123
|
+
|
|
2124
|
+
def get_pod_management_policy(self) -> str:
|
|
2125
|
+
"""Get sts pod_management_policy from config, default to 'OrderedReady'"""
|
|
2126
|
+
return self.config_dict.get("pod_management_policy", "OrderedReady")
|
|
2127
|
+
|
|
2128
|
+
def format_kubernetes_job(
|
|
2129
|
+
self,
|
|
2130
|
+
job_label: str,
|
|
2131
|
+
deadline_seconds: int = 3600,
|
|
2132
|
+
keep_routable_ip: bool = False,
|
|
2133
|
+
include_sidecars: bool = False,
|
|
2134
|
+
) -> V1Job:
|
|
2135
|
+
"""Create the config for launching the deployment as a Job
|
|
2136
|
+
|
|
2137
|
+
:param str job_label: value to set for the "job type" label
|
|
2138
|
+
:param int deadline_seconds: maximum allowed duration for the job
|
|
2139
|
+
:param bool keep_routable_ip: maintain routable IP annotation in pod template
|
|
2140
|
+
:param bool include_sidecars: do not discard sidecar containers when building pod spec
|
|
2141
|
+
:return: job object
|
|
2142
|
+
"""
|
|
2143
|
+
additional_labels = {paasta_prefixed(JOB_TYPE_LABEL_NAME): job_label}
|
|
2144
|
+
try:
|
|
2145
|
+
docker_url = self.get_docker_url()
|
|
2146
|
+
git_sha = get_git_sha_from_dockerurl(docker_url, long=True)
|
|
2147
|
+
system_paasta_config = load_system_paasta_config()
|
|
2148
|
+
image_version = self.get_image_version()
|
|
2149
|
+
if image_version is not None:
|
|
2150
|
+
additional_labels[paasta_prefixed("image_version")] = image_version
|
|
2151
|
+
pod_template = self.get_pod_template_spec(
|
|
2152
|
+
git_sha=git_sha,
|
|
2153
|
+
system_paasta_config=system_paasta_config,
|
|
2154
|
+
restart_on_failure=False,
|
|
2155
|
+
include_sidecars=include_sidecars,
|
|
2156
|
+
force_no_routable_ip=not keep_routable_ip,
|
|
2157
|
+
)
|
|
2158
|
+
pod_template.metadata.labels.update(additional_labels)
|
|
2159
|
+
complete_config = V1Job(
|
|
2160
|
+
api_version="batch/v1",
|
|
2161
|
+
kind="Job",
|
|
2162
|
+
metadata=self.get_kubernetes_metadata(git_sha),
|
|
2163
|
+
spec=V1JobSpec(
|
|
2164
|
+
active_deadline_seconds=deadline_seconds,
|
|
2165
|
+
ttl_seconds_after_finished=0, # remove job resource after completion
|
|
2166
|
+
template=pod_template,
|
|
2167
|
+
),
|
|
2168
|
+
)
|
|
2169
|
+
complete_config.metadata.labels.update(additional_labels)
|
|
2170
|
+
except Exception as e:
|
|
2171
|
+
raise InvalidKubernetesConfig(e, self.get_service(), self.get_instance())
|
|
2172
|
+
log.debug(
|
|
2173
|
+
f"Complete configuration for job instance is: {complete_config}",
|
|
2174
|
+
)
|
|
2175
|
+
return complete_config
|
|
2176
|
+
|
|
2177
|
+
    def format_kubernetes_app(self) -> Union[V1Deployment, V1StatefulSet]:
        """Create the configuration that will be passed to the Kubernetes REST API."""

        try:
            system_paasta_config = load_system_paasta_config()
            docker_url = self.get_docker_url()
            git_sha = get_git_sha_from_dockerurl(docker_url, long=True)
            complete_config: Union[V1StatefulSet, V1Deployment]
            if self.get_persistent_volumes():
                complete_config = V1StatefulSet(
                    api_version="apps/v1",
                    kind="StatefulSet",
                    metadata=self.get_kubernetes_metadata(git_sha),
                    spec=V1StatefulSetSpec(
                        service_name=self.get_sanitised_deployment_name(),
                        volume_claim_templates=self.get_volume_claim_templates(),
                        replicas=self.get_desired_instances(),
                        revision_history_limit=0,
                        selector=V1LabelSelector(
                            match_labels={
                                "paasta.yelp.com/service": self.get_service(),
                                "paasta.yelp.com/instance": self.get_instance(),
                            }
                        ),
                        template=self.get_pod_template_spec(
                            git_sha=git_sha, system_paasta_config=system_paasta_config
                        ),
                        pod_management_policy=self.get_pod_management_policy(),
                    ),
                )
            else:
                complete_config = V1Deployment(
                    api_version="apps/v1",
                    kind="Deployment",
                    metadata=self.get_kubernetes_metadata(git_sha),
                    spec=V1DeploymentSpec(
                        replicas=self.get_desired_instances(),
                        min_ready_seconds=self.get_min_task_uptime(),
                        selector=V1LabelSelector(
                            match_labels={
                                "paasta.yelp.com/service": self.get_service(),
                                "paasta.yelp.com/instance": self.get_instance(),
                            }
                        ),
                        revision_history_limit=0,
                        template=self.get_pod_template_spec(
                            git_sha=git_sha, system_paasta_config=system_paasta_config
                        ),
                        strategy=self.get_deployment_strategy_config(),
                    ),
                )

            prometheus_shard = self.get_prometheus_shard()
            if prometheus_shard:
                complete_config.metadata.labels[
                    "paasta.yelp.com/prometheus_shard"
                ] = prometheus_shard

            image_version = self.get_image_version()
            if image_version is not None:
                complete_config.metadata.labels[
                    "paasta.yelp.com/image_version"
                ] = image_version

            # DO NOT ADD LABELS AFTER THIS LINE
            config_hash = get_config_hash(
                self.sanitize_for_config_hash(complete_config),
                force_bounce=self.get_force_bounce(),
            )
            complete_config.metadata.labels["yelp.com/paasta_config_sha"] = config_hash
            complete_config.metadata.labels["paasta.yelp.com/config_sha"] = config_hash

            complete_config.spec.template.metadata.labels[
                "yelp.com/paasta_config_sha"
            ] = config_hash
            complete_config.spec.template.metadata.labels[
                "paasta.yelp.com/config_sha"
            ] = config_hash
        except Exception as e:
            raise InvalidKubernetesConfig(e, self.get_service(), self.get_instance())
        log.debug("Complete configuration for instance is: %s", complete_config)
        return complete_config

    def get_kubernetes_service_account_name(self) -> Optional[str]:
        return self.config_dict.get("service_account_name", None)

    def is_istio_sidecar_injection_enabled(self) -> bool:
        return self.config_dict.get("is_istio_sidecar_injection_enabled", False)

    def has_routable_ip(
        self,
        service_namespace_config: ServiceNamespaceConfig,
        system_paasta_config: SystemPaastaConfig,
    ) -> str:
        """Return whether the routable_ip label should be true or false.

        Services with a `prometheus_port` defined or that use certain sidecars must have a routable IP
        address to allow Prometheus shards to scrape metrics.
        """
        if (
            self.config_dict.get("routable_ip", False)
            or service_namespace_config.is_in_smartstack()
            or self.get_prometheus_port() is not None
            or self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI)
            or self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN)
        ):
            return "true"
        return "false"

    def should_enable_aws_lb_readiness_gate(self) -> bool:
        return self.config_dict.get("enable_aws_lb_readiness_gate", False)

    def get_pod_template_spec(
        self,
        git_sha: str,
        system_paasta_config: SystemPaastaConfig,
        restart_on_failure: bool = True,
        include_sidecars: bool = True,
        force_no_routable_ip: bool = False,
    ) -> V1PodTemplateSpec:
        service_namespace_config = load_service_namespace_config(
            service=self.service, namespace=self.get_nerve_namespace()
        )
        docker_volumes = self.get_volumes(
            system_volumes=system_paasta_config.get_volumes(),
        )

        hacheck_sidecar_volumes = system_paasta_config.get_hacheck_sidecar_volumes()
        has_routable_ip = (
            "false"
            if force_no_routable_ip
            else self.has_routable_ip(service_namespace_config, system_paasta_config)
        )
        annotations: KubePodAnnotations = {
            "smartstack_registrations": json.dumps(self.get_registrations()),
            "paasta.yelp.com/routable_ip": has_routable_ip,
        }

        # The HPAMetrics collector needs these annotations to tell it to pull
        # metrics from these pods
        # TODO: see if we can remove this as we're no longer using sfx data to scale
        if self.get_autoscaling_metrics_provider(METRICS_PROVIDER_UWSGI) is not None:
            annotations["autoscaling"] = METRICS_PROVIDER_UWSGI

        pod_spec_kwargs = {}
        pod_spec_kwargs.update(system_paasta_config.get_pod_defaults())
        pod_spec_kwargs.update(
            service_account_name=self.get_kubernetes_service_account_name(),
            containers=self.get_kubernetes_containers(
                docker_volumes=docker_volumes,
                hacheck_sidecar_volumes=hacheck_sidecar_volumes,
                aws_ebs_volumes=self.get_aws_ebs_volumes(),
                secret_volumes=self.get_secret_volumes(),
                system_paasta_config=system_paasta_config,
                service_namespace_config=service_namespace_config,
                include_sidecars=include_sidecars,
            ),
            share_process_namespace=True,
            node_selector=self.get_node_selector(),
            restart_policy="Always" if restart_on_failure else "Never",
            volumes=self.get_pod_volumes(
                docker_volumes=docker_volumes + hacheck_sidecar_volumes,
                aws_ebs_volumes=self.get_aws_ebs_volumes(),
                secret_volumes=self.get_secret_volumes(),
                projected_sa_volumes=self.get_projected_sa_volumes(),
            ),
        )
        # need to check if there are node selectors/affinities. if there are none
        # and we create an empty affinity object, k8s will deselect all nodes.
        node_affinity = self.get_node_affinity(
            system_paasta_config.get_pool_node_affinities()
        )
        if node_affinity is not None:
            pod_spec_kwargs["affinity"] = V1Affinity(node_affinity=node_affinity)

        pod_anti_affinity = self.get_pod_anti_affinity()
        if pod_anti_affinity is not None:
            affinity = pod_spec_kwargs.get("affinity", V1Affinity())
            affinity.pod_anti_affinity = pod_anti_affinity
            pod_spec_kwargs["affinity"] = affinity

        # PAASTA-17941: Allow configuring topology spread constraints per cluster
        pod_topology_spread_constraints = create_pod_topology_spread_constraints(
            service=self.get_service(),
            instance=self.get_instance(),
            topology_spread_constraints=self.get_topology_spread_constraints(
                system_paasta_config.get_topology_spread_constraints()
            ),
        )
        if pod_topology_spread_constraints:
            constraints = pod_spec_kwargs.get("topology_spread_constraints", [])
            constraints += pod_topology_spread_constraints
            pod_spec_kwargs["topology_spread_constraints"] = constraints

        termination_grace_period = self.get_termination_grace_period(
            service_namespace_config
        )
        if termination_grace_period is not None:
            pod_spec_kwargs[
                "termination_grace_period_seconds"
            ] = termination_grace_period

        fs_group = self.get_fs_group()

        if self.get_iam_role_provider() == "aws":
            annotations["iam.amazonaws.com/role"] = ""
            iam_role = self.get_iam_role()
            if iam_role:
                pod_spec_kwargs["service_account_name"] = get_service_account_name(
                    iam_role
                )
                if fs_group is None:
                    # We need some reasonable default for group id of a process
                    # running inside the container. Seems like most of such
                    # programs run as `nobody`, let's use that as a default.
                    #
                    # PAASTA-16919: This should be removed when
                    # https://github.com/aws/amazon-eks-pod-identity-webhook/issues/8
                    # is fixed.
                    fs_group = 65534
        else:
            annotations["iam.amazonaws.com/role"] = self.get_iam_role()

        if fs_group is not None:
            pod_spec_kwargs["security_context"] = V1PodSecurityContext(
                fs_group=fs_group
            )

        # prometheus_path is used to override the default scrape path in Prometheus
        prometheus_path = self.get_prometheus_path()
        if prometheus_path:
            annotations["paasta.yelp.com/prometheus_path"] = prometheus_path

        # prometheus_port is used to override the default scrape port in Prometheus
        prometheus_port = self.get_prometheus_port()
        if prometheus_port:
            annotations["paasta.yelp.com/prometheus_port"] = str(prometheus_port)

        # Default Pod labels
        labels: KubePodLabels = {
            "yelp.com/paasta_service": self.get_service(),
            "yelp.com/paasta_instance": self.get_instance(),
            "yelp.com/paasta_git_sha": git_sha,
            # NOTE: we can't use the paasta_prefixed() helper here
            # since mypy expects TypedDict keys to be string literals
            "paasta.yelp.com/service": self.get_service(),
            "paasta.yelp.com/instance": self.get_instance(),
            "paasta.yelp.com/git_sha": git_sha,
            "paasta.yelp.com/autoscaled": str(self.is_autoscaling_enabled()).lower(),
            "paasta.yelp.com/pool": self.get_pool(),
            "paasta.yelp.com/cluster": self.cluster,
            "yelp.com/owner": "compute_infra_platform_experience",
            "paasta.yelp.com/managed": "true",
        }
        if service_namespace_config.is_in_smartstack():
            labels["paasta.yelp.com/weight"] = str(self.get_weight())

        # Allow the Prometheus Operator's Pod Service Monitor for specified
        # shard to find this pod
        prometheus_shard = self.get_prometheus_shard()
        if prometheus_shard:
            labels["paasta.yelp.com/prometheus_shard"] = prometheus_shard

        image_version = self.get_image_version()
        if image_version is not None:
            labels["paasta.yelp.com/image_version"] = image_version

        if system_paasta_config.get_kubernetes_add_registration_labels():
            # Allow Kubernetes Services to easily find
            # pods belonging to a certain smartstack namespace
            for registration in self.get_registrations():
                labels[registration_label(registration)] = "true"  # type: ignore

        if self.is_istio_sidecar_injection_enabled():
            labels["sidecar.istio.io/inject"] = "true"

        # not all services use autoscaling, so we label those that do in order to have
        # prometheus selectively discover/scrape them
        if self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI):
            # UWSGI no longer needs a label to indicate it needs to be scraped as all pods are checked for the uwsgi stats port by our centralized uwsgi-exporter
            # But we do still need deploy_group for relabeling properly
            # this should probably eventually be made into a default label,
            # but for now we're fine with it being behind these feature toggles.
            # ideally, we'd also have the docker image here for ease-of-use
            # in Prometheus relabeling, but that information is over the
            # character limit for k8s labels (63 chars)
            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()

        elif self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
            labels["paasta.yelp.com/scrape_piscina_prometheus"] = "true"

        elif self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
            labels["paasta.yelp.com/scrape_gunicorn_prometheus"] = "true"

        # the default AWS LB Controller behavior is to enable this by-namespace
        # ...but that's kinda annoying to do in a toggleable way - so let's instead
        # toggle based on pod labels (which of course, will require changing the controller
        # settings :p)
        if self.should_enable_aws_lb_readiness_gate():
            labels["elbv2.k8s.aws/pod-readiness-gate-inject"] = "enabled"

        return V1PodTemplateSpec(
            metadata=V1ObjectMeta(
                labels=labels,
                annotations=annotations,
            ),
            spec=V1PodSpec(**pod_spec_kwargs),
        )

    def get_node_selector(self) -> Mapping[str, str]:
        """Converts simple node restrictions into node selectors. Unlike node
        affinities, selectors will show up in `kubectl describe`.
        """
        raw_selectors: Mapping[str, Any] = self.config_dict.get("node_selectors", {})
        node_selectors = {
            to_node_label(label): value
            for label, value in raw_selectors.items()
            if type(value) is str
        }
        node_selectors["yelp.com/pool"] = self.get_pool()
        return node_selectors

    def get_node_affinity(
        self, pool_node_affinities: Dict[str, Dict[str, List[str]]] = None
    ) -> Optional[V1NodeAffinity]:
        """Converts deploy_whitelist and deploy_blacklist into node affinities.

        note: At the time of writing, `kubectl describe` does not show affinities,
        only selectors. To see affinities, use `kubectl get pod -o json` instead.
        """
        requirements = allowlist_denylist_to_requirements(
            allowlist=self.get_deploy_whitelist(),
            denylist=self.get_deploy_blacklist(),
        )
        node_selectors = self.config_dict.get("node_selectors", {})
        requirements.extend(
            raw_selectors_to_requirements(
                raw_selectors=node_selectors,
            )
        )

        # PAASTA-18198: To improve AZ balance with Karpenter, we temporarily allow specifying zone affinities per pool
        if pool_node_affinities and self.get_pool() in pool_node_affinities:
            current_pool_node_affinities = pool_node_affinities[self.get_pool()]
            # If the service already has a node selector for a zone, we don't want to override it
            if current_pool_node_affinities and not contains_zone_label(node_selectors):
                requirements.extend(
                    raw_selectors_to_requirements(
                        raw_selectors=current_pool_node_affinities,
                    )
                )

        preferred_terms = []
        for node_selectors_prefered_config_dict in self.config_dict.get(
            "node_selectors_preferred", []
        ):
            preferred_terms.append(
                V1PreferredSchedulingTerm(
                    weight=node_selectors_prefered_config_dict["weight"],
                    preference=V1NodeSelectorTerm(
                        match_expressions=[
                            V1NodeSelectorRequirement(
                                key=key,
                                operator=op,
                                values=vs,
                            )
                            for key, op, vs in raw_selectors_to_requirements(
                                raw_selectors=node_selectors_prefered_config_dict[
                                    "preferences"
                                ]
                            )
                        ]
                    ),
                )
            )

        # package everything into a node affinity - lots of layers :P
        if len(requirements) == 0 and len(preferred_terms) == 0:
            return None

        required_term = (
            V1NodeSelectorTerm(
                match_expressions=[
                    V1NodeSelectorRequirement(
                        key=key,
                        operator=op,
                        values=vs,
                    )
                    for key, op, vs in requirements
                ]
            )
            if requirements
            else None
        )

        if not preferred_terms:
            preferred_terms = None

        return V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=(
                V1NodeSelector(node_selector_terms=[required_term])
                if required_term
                else None
            ),
            preferred_during_scheduling_ignored_during_execution=preferred_terms,
        )

    def get_pod_required_anti_affinity_terms(
        self,
    ) -> Optional[List[V1PodAffinityTerm]]:
        conditions = self.config_dict.get("anti_affinity", [])
        if not conditions:
            return None

        if not isinstance(conditions, list):
            conditions = [conditions]

        affinity_terms = []
        for condition in conditions:
            label_selector = self._kube_affinity_condition_to_label_selector(condition)
            if label_selector:
                affinity_terms.append(
                    V1PodAffinityTerm(
                        # Topology of a hostname means the pod of this service
                        # cannot be scheduled on host containing another pod
                        # matching the label_selector
                        topology_key="kubernetes.io/hostname",
                        label_selector=label_selector,
                    )
                )
        return affinity_terms

    def get_pod_preferred_anti_affinity_terms(
        self,
    ) -> Optional[List[V1WeightedPodAffinityTerm]]:
        conditions = self.config_dict.get("anti_affinity_preferred", [])
        if not conditions:
            return None

        if not isinstance(conditions, list):
            conditions = [conditions]

        affinity_terms = []
        for condition in conditions:
            label_selector = self._kube_affinity_condition_to_label_selector(condition)
            if label_selector:
                affinity_terms.append(
                    V1WeightedPodAffinityTerm(
                        # Topology of a hostname means the pod of this service
                        # cannot be scheduled on host containing another pod
                        # matching the label_selector
                        topology_key="kubernetes.io/hostname",
                        label_selector=label_selector,
                        weight=condition["weight"],
                    )
                )
        return affinity_terms

    def get_pod_anti_affinity(self) -> Optional[V1PodAntiAffinity]:
        """
        Converts the given anti-affinity on service and instance to pod
        affinities with the "paasta.yelp.com" prefixed label selector
        :return:
        """

        required_terms = self.get_pod_required_anti_affinity_terms()
        preferred_terms = self.get_pod_preferred_anti_affinity_terms()

        if required_terms is None and preferred_terms is None:
            return None

        return V1PodAntiAffinity(
            required_during_scheduling_ignored_during_execution=required_terms,
            preferred_during_scheduling_ignored_during_execution=preferred_terms,
        )

    def _kube_affinity_condition_to_label_selector(
        self, condition: KubeAffinityCondition
    ) -> Optional[V1LabelSelector]:
        """Converts the given condition to label selectors with paasta prefix"""
        labels = {}
        if "service" in condition:
            labels[PAASTA_ATTRIBUTE_PREFIX + "service"] = condition.get("service")
        if "instance" in condition:
            labels[PAASTA_ATTRIBUTE_PREFIX + "instance"] = condition.get("instance")
        return V1LabelSelector(match_labels=labels) if labels else None

    def sanitize_for_config_hash(
        self, config: Union[V1Deployment, V1StatefulSet]
    ) -> Mapping[str, Any]:
        """Removes some data from config to make it suitable for
        calculation of config hash.

        :param config: complete_config hash to sanitise
        :returns: sanitised copy of complete_config hash
        """
        ahash = config.to_dict()  # deep convert to dict
        ahash["paasta_secrets"] = get_kubernetes_secret_hashes(
            service=self.get_service(),
            environment_variables=self.get_env(),
            namespace=self.get_namespace(),
        )

        # remove data we don't want used to hash configs
        # replica count
        if ahash["spec"] is not None:
            ahash["spec"].pop("replicas", None)

        if ahash["metadata"] is not None:
            ahash["metadata"]["namespace"] = None

        # soa-configs SHA
        try:
            for container in ahash["spec"]["template"]["spec"]["containers"]:
                container["env"] = [
                    e
                    for e in container["env"]
                    if e.get("name", "") != "PAASTA_SOA_CONFIGS_SHA"
                ]
        except TypeError:  # any of the values can be None
            pass

        return ahash

    def get_termination_grace_period(
        self, service_namespace_config: ServiceNamespaceConfig
    ) -> Optional[int]:
        """Return the number of seconds that kubernetes should wait for pre-stop hooks to finish (or for the main
        process to exit after signaling) before forcefully terminating the pod.

        For smartstack services, defaults to a value long enough to allow the default pre-stop hook to finish.
        For non-smartstack services, defaults to None (kubernetes default of 30s).
        """

        if service_namespace_config.is_in_smartstack():
            default = self.get_hacheck_prestop_sleep_seconds() + 1
            if self.get_pre_stop_wait_for_connections_to_complete(
                service_namespace_config
            ):
                # If the max timeout is more than 30 minutes, cap it to 30 minutes.
                # Most services with ultra-long timeouts are probably able to handle SIGTERM gracefully anyway.
                default += int(
                    math.ceil(
                        min(
                            1800,
                            service_namespace_config.get_longest_timeout_ms() / 1000,
                        )
                    )
                )
        else:
            default = None

        return self.get_lifecycle_dict().get(
            "termination_grace_period_seconds", default
        )

    def get_prometheus_shard(self) -> Optional[str]:
        return self.config_dict.get("prometheus_shard")

    def get_prometheus_path(self) -> Optional[str]:
        return self.config_dict.get("prometheus_path")

    def get_prometheus_port(self) -> Optional[int]:
        return self.config_dict.get("prometheus_port")

    def get_topology_spread_constraints(
        self,
        default_pod_topology_spread_constraints: List[TopologySpreadConstraintDict],
    ) -> List[TopologySpreadConstraintDict]:
        return self.config_dict.get(
            "topology_spread_constraints", default_pod_topology_spread_constraints
        )

    def get_projected_sa_volumes(self) -> List[ProjectedSAVolume]:
        return add_volumes_for_authenticating_services(
            service_name=self.service,
            config_volumes=super().get_projected_sa_volumes(),
            soa_dir=self.soa_dir,
        )


def get_kubernetes_secret_hashes(
    environment_variables: Mapping[str, str], service: str, namespace: str
) -> Mapping[str, str]:
    hashes = {}
    to_get_hash = []
    for v in environment_variables.values():
        if is_secret_ref(v):
            to_get_hash.append(v)
    if to_get_hash:
        kube_client = KubeClient()
        for value in to_get_hash:
            hashes[value] = get_secret_signature(
                kube_client=kube_client,
                signature_name=get_paasta_secret_signature_name(
                    namespace,
                    SHARED_SECRET_SERVICE if is_shared_secret(value) else service,
                    get_secret_name_from_ref(value),
                ),
                namespace=namespace,
            )
    return hashes


def get_k8s_pods() -> Mapping[str, Any]:
    return requests.get("http://127.0.0.1:10255/pods").json()


def get_all_kubernetes_services_running_here() -> List[Tuple[str, str, int]]:
    """Returns all k8s paasta services, even if not in smartstack. Returns a service, instance, port
    tuple to match the return value of other similar functions"""
    services = []
    try:
        pods = get_k8s_pods()
    except requests.exceptions.ConnectionError:
        log.debug("Failed to connect to the kubelet when trying to get pods")
        return []
    for pod in pods["items"]:
        try:
            service = pod["metadata"]["labels"]["paasta.yelp.com/service"]
            instance = pod["metadata"]["labels"]["paasta.yelp.com/instance"]
            services.append((service, instance, 0))
        except KeyError:
            log.debug(f"Skipping listing what looks like a non-paasta pod: {pod}")
    return services


def get_kubernetes_services_running_here(
    exclude_terminating: bool = False,
) -> Sequence[KubernetesServiceRegistration]:
    services = []
    pods = get_k8s_pods()
    for pod in pods["items"]:
        if (
            pod["status"]["phase"] != "Running"
            or "smartstack_registrations" not in pod["metadata"].get("annotations", {})
            or (exclude_terminating and pod["metadata"].get("deletionTimestamp"))
        ):
            continue
        try:
            port = None
            for container in pod["spec"]["containers"]:
                if container["name"] != HACHECK_POD_NAME:
                    port = container["ports"][0]["containerPort"]
                    break

            try:
                weight = int(pod["metadata"]["labels"]["paasta.yelp.com/weight"])
            except (KeyError, ValueError):
                weight = 10

            services.append(
                KubernetesServiceRegistration(
                    name=pod["metadata"]["labels"]["paasta.yelp.com/service"],
                    instance=pod["metadata"]["labels"]["paasta.yelp.com/instance"],
                    port=port,
                    pod_ip=pod["status"]["podIP"],
                    registrations=json.loads(
                        pod["metadata"]["annotations"]["smartstack_registrations"]
                    ),
                    weight=weight,
                )
            )
        except KeyError as e:
            log.warning(
                f"Found running paasta pod but missing {e} key so not registering with nerve"
            )
    return services


def get_kubernetes_services_running_here_for_nerve(
    cluster: Optional[str], soa_dir: str
) -> List[Tuple[str, ServiceNamespaceConfig]]:
    try:
        system_paasta_config = load_system_paasta_config()
        if not cluster:
            cluster = system_paasta_config.get_cluster()
        # In the cases where there is *no* cluster or in the case
        # where there isn't a Paasta configuration file at *all*, then
        # there must be no kubernetes services running here, so we catch
        # these custom exceptions and return [].
        if not system_paasta_config.get_register_k8s_pods():
            return []
        exclude_terminating = (
            not system_paasta_config.get_nerve_register_k8s_terminating()
        )

    except PaastaNotConfiguredError:
        log.warning("No PaaSTA config so skipping registering k8s pods in nerve")
        return []
    kubernetes_services = get_kubernetes_services_running_here(
        exclude_terminating=exclude_terminating
    )
    nerve_list = []
    for kubernetes_service in kubernetes_services:
        try:
            for registration in kubernetes_service.registrations:
                reg_service, reg_namespace, _, __ = decompose_job_id(registration)
                try:
                    nerve_dict = load_service_namespace_config(
                        service=reg_service, namespace=reg_namespace, soa_dir=soa_dir
                    )
                except Exception as e:
                    log.warning(str(e))
                    log.warning(
                        f"Could not get smartstack config for {reg_service}.{reg_namespace}, skipping"
                    )
                    # but the show must go on!
                    continue
                if not nerve_dict.is_in_smartstack():
                    continue
                nerve_dict["port"] = kubernetes_service.port
                nerve_dict["service_ip"] = kubernetes_service.pod_ip
                if system_paasta_config.get_kubernetes_use_hacheck_sidecar():
                    nerve_dict["hacheck_ip"] = kubernetes_service.pod_ip
                else:
                    nerve_dict["extra_healthcheck_headers"] = {
                        "X-Nerve-Check-IP": kubernetes_service.pod_ip
                    }
                nerve_dict["weight"] = kubernetes_service.weight
                nerve_list.append((registration, nerve_dict))
        except KeyError:
            continue  # SOA configs got deleted for this app, it'll get cleaned up

    return nerve_list


def force_delete_pods(
    service: str,
    paasta_service: str,
    instance: str,
    namespace: str,
    kube_client: KubeClient,
) -> None:
    # Note that KubeClient.deployments.delete_namespaced_deployment must be called prior to this method.
    pods_to_delete = a_sync.block(
        pods_for_service_instance,
        paasta_service,
        instance,
        kube_client,
        namespace=namespace,
    )
    delete_options = V1DeleteOptions()
    for pod in pods_to_delete:
        kube_client.core.delete_namespaced_pod(
            pod.metadata.name, namespace, body=delete_options, grace_period_seconds=0
        )


@time_cache(ttl=60)
def get_all_namespaces(
    kube_client: KubeClient, label_selector: Optional[str] = None
) -> List[str]:
    namespaces = kube_client.core.list_namespace(label_selector=label_selector)
    return [item.metadata.name for item in namespaces.items]


def get_all_managed_namespaces(kube_client: KubeClient) -> List[str]:
    return get_all_namespaces(
        kube_client=kube_client, label_selector=f"{paasta_prefixed('managed')}=true"
    )


def get_matching_namespaces(
    all_namespaces: Iterable[str],
    namespace_prefix: Optional[str],
    additional_namespaces: Container[str],
) -> List[str]:
    return [
        n
        for n in all_namespaces
        if (namespace_prefix is not None and n.startswith(namespace_prefix))
        or n in additional_namespaces
    ]


@functools.lru_cache()
def ensure_namespace(kube_client: KubeClient, namespace: str) -> None:
    paasta_namespace = V1Namespace(
        metadata=V1ObjectMeta(
            name=namespace,
            labels={
                "name": namespace,
                paasta_prefixed("owner"): "compute_infra_platform_experience",
                paasta_prefixed("managed"): "true",
            },
        )
    )
    namespace_names = get_all_namespaces(kube_client)
    if namespace not in namespace_names:
        log.warning(f"Creating namespace: {namespace} as it does not exist")
        try:
            kube_client.core.create_namespace(body=paasta_namespace)
        except ApiException as e:
            if e.status == 409:
                log.warning(
                    "Got HTTP 409 when creating namespace; it must already exist. Continuing."
                )
            else:
                raise

    ensure_paasta_api_rolebinding(kube_client, namespace)
    ensure_paasta_namespace_limits(kube_client, namespace)


def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> None:
    rolebindings = get_all_role_bindings(kube_client, namespace=namespace)
    rolebinding_names = [item.metadata.name for item in rolebindings]
    if "paasta-api-server-per-namespace" not in rolebinding_names:
        log.warning(
            f"Creating rolebinding paasta-api-server-per-namespace on {namespace} namespace as it does not exist"
        )
        role_binding = V1RoleBinding(
            metadata=V1ObjectMeta(
                name="paasta-api-server-per-namespace",
                namespace=namespace,
            ),
            role_ref=V1RoleRef(
                api_group="rbac.authorization.k8s.io",
                kind="ClusterRole",
                name="paasta-api-server-per-namespace",
            ),
            subjects=[
                V1Subject(
                    kind="User",
                    name="yelp.com/paasta-api-server",
                ),
            ],
        )
        kube_client.rbac.create_namespaced_role_binding(
            namespace=namespace, body=role_binding
        )


def ensure_paasta_namespace_limits(kube_client: KubeClient, namespace: str) -> None:
    if not namespace.startswith("paastasvc-"):
        log.debug(
            f"Not creating LimitRange because {namespace} does not start with paastasvc-"
        )
        return

    limits = get_all_limit_ranges(kube_client, namespace=namespace)
    limits_names = {item.metadata.name for item in limits}
    if "limit-mem-cpu-disk-per-container" not in limits_names:
        log.warning(
            f"Creating limit: limit-mem-cpu-disk-per-container on {namespace} namespace as it does not exist"
        )
        limit = V1LimitRange(
            metadata=V1ObjectMeta(
                name="limit-mem-cpu-disk-per-container",
                namespace=namespace,
            ),
            spec=V1LimitRangeSpec(
                limits=[
                    V1LimitRangeItem(
                        type="Container",
                        default={
                            "cpu": "1",
                            "memory": "1024Mi",
                            "ephemeral-storage": "1Gi",
                        },
                        default_request={
                            "cpu": "1",
                            "memory": "1024Mi",
                            "ephemeral-storage": "1Gi",
                        },
                    )
                ]
            ),
        )
        kube_client.core.create_namespaced_limit_range(namespace=namespace, body=limit)


def list_deployments_in_all_namespaces(
    kube_client: KubeClient, label_selector: str
) -> List[KubeDeployment]:
    deployments = kube_client.deployments.list_deployment_for_all_namespaces(
        label_selector=label_selector
    )
    stateful_sets = kube_client.deployments.list_stateful_set_for_all_namespaces(
        label_selector=label_selector
    )
    return [
        KubeDeployment(
            service=item.metadata.labels["paasta.yelp.com/service"],
            instance=item.metadata.labels["paasta.yelp.com/instance"],
            git_sha=item.metadata.labels.get("paasta.yelp.com/git_sha", ""),
            image_version=item.metadata.labels.get(
                "paasta.yelp.com/image_version", None
            ),
            namespace=item.metadata.namespace,
            config_sha=item.metadata.labels.get("paasta.yelp.com/config_sha", ""),
            replicas=(
                item.spec.replicas
                if item.metadata.labels.get(paasta_prefixed("autoscaled"), "false")
                == "false"
                else None
            ),
        )
        for item in deployments.items + stateful_sets.items
    ]


def list_deployments(
    kube_client: KubeClient,
    *,
    namespace: str,
    label_selector: str = "",
) -> Sequence[KubeDeployment]:

    deployments = kube_client.deployments.list_namespaced_deployment(
        namespace=namespace, label_selector=label_selector
    )
    stateful_sets = kube_client.deployments.list_namespaced_stateful_set(
        namespace=namespace, label_selector=label_selector
    )
    return [
        KubeDeployment(
            service=item.metadata.labels["paasta.yelp.com/service"],
            instance=item.metadata.labels["paasta.yelp.com/instance"],
            git_sha=item.metadata.labels.get("paasta.yelp.com/git_sha", ""),
            image_version=item.metadata.labels.get(
                "paasta.yelp.com/image_version", None
            ),
            namespace=item.metadata.namespace,
            config_sha=item.metadata.labels["paasta.yelp.com/config_sha"],
            replicas=(
                item.spec.replicas
                if item.metadata.labels.get(paasta_prefixed("autoscaled"), "false")
                == "false"
                else None
            ),
        )
        for item in deployments.items + stateful_sets.items
    ]


def list_deployments_in_managed_namespaces(
    kube_client: KubeClient,
    label_selector: str,
) -> List[KubeDeployment]:
    ret: List[KubeDeployment] = []
    for namespace in get_all_managed_namespaces(kube_client):
        try:
            ret.extend(
                list_deployments(
                    kube_client=kube_client,
                    label_selector=label_selector,
                    namespace=namespace,
                )
            )
        except ApiException as exc:
            log.error(
                f"Error fetching deployments from namespace {namespace}: "
                f"status: {exc.status}, reason: {exc.reason}."
            )
    return ret


def recent_container_restart(
    restart_count: int,
    last_state: Optional[str],
    last_timestamp: Optional[int],
    time_window_s: int = 900,  # 15 mins
) -> bool:
    min_timestamp = datetime.now(timezone.utc).timestamp() - time_window_s
    return (
        restart_count > 0
        and last_state == "terminated"
        and last_timestamp is not None
        and last_timestamp > min_timestamp
    )


@async_timeout()
async def get_tail_lines_for_kubernetes_container(
    kube_client: KubeClient,
    pod: V1Pod,
    container: V1ContainerStatus,
    num_tail_lines: int,
    previous: bool = False,
) -> MutableMapping[str, Any]:
    tail_lines: MutableMapping[str, Any] = {
        "stdout": [],
        "stderr": [],
        "error_message": "",
    }

    if container.name != HACHECK_POD_NAME:
        error = ""
        if container.state.waiting:
            error = container.state.waiting.message or ""
        elif container.state.terminated:
            error = container.state.terminated.message or ""
        tail_lines["error_message"] = error

        try:
            if num_tail_lines > 0:
                log = kube_client.core.read_namespaced_pod_log(
                    name=pod.metadata.name,
                    namespace=pod.metadata.namespace,
                    container=container.name,
                    tail_lines=num_tail_lines,
                    previous=previous,
                )
                tail_lines["stdout"].extend(log.split("\n"))
        except ApiException as e:
            # there is a potential race condition in which a pod's containers
            # have not failed, but have when we get the container's logs. in this
            # case, use the error from the exception, though it is less accurate.
            if error == "":
                body = json.loads(e.body)
                error = body.get("message", "")
            tail_lines["error_message"] = f"couldn't read stdout/stderr: '{error}'"

    return tail_lines


async def get_pod_event_messages(
    kube_client: KubeClient, pod: V1Pod, max_age_in_seconds: Optional[int] = None
) -> List[Dict]:
    pod_events = await get_events_for_object(
        kube_client, pod, "Pod", max_age_in_seconds
    )
    pod_event_messages = []
    if pod_events:
        for event in pod_events:
            message = {
                "message": event.message,
                "timeStamp": str(event.last_timestamp),
            }
            pod_event_messages.append(message)
    return pod_event_messages


def format_pod_event_messages(
    pod_event_messages: List[Dict], pod_name: str
) -> List[str]:
    rows: List[str] = list()
    rows.append(PaastaColors.blue(f"  Pod Events for {pod_name}"))
    for message in pod_event_messages:
        if "error" in message:
            rows.append(PaastaColors.yellow(f'  Error: {message["error"]}'))
        else:
            timestamp = message.get("time_stamp", "unknown time")
            message_text = message.get("message", "")
            rows.append(f"  Event at {timestamp}: {message_text}")
    return rows


def format_tail_lines_for_kubernetes_pod(
    pod_containers: Sequence,
    pod_name: str,
) -> List[str]:
    errors: List[str] = []
    lines: List[str] = []
    tail_line_prefixes = (
        ("tail_lines", "current"),
        ("previous_tail_lines", "previous (pre-restart)"),
    )

    for container in pod_containers:
        for tail_line_key, stream_prefix in tail_line_prefixes:
            tail_lines = getattr(container, tail_line_key, None)
            if tail_lines is None:
                break
            if tail_lines.error_message:
                errors.append(PaastaColors.red(f"  {tail_lines.error_message}"))

            for stream_name in ("stdout", "stderr"):
                stream_lines = getattr(tail_lines, stream_name, [])
                if len(stream_lines) > 0:
                    lines.append(
                        PaastaColors.blue(
                            f"  {stream_prefix} {stream_name} tail for {container.name} "
                            f"in pod {pod_name}"
                        )
                    )
                    lines.extend(f"    {line}" for line in stream_lines)

    rows: List[str] = []
    if errors:
        rows.append(
            PaastaColors.blue(
                f"  errors for container {container.name} in pod {pod_name}"
            )
        )
        rows.extend(errors)
        rows.append("")

    rows.extend(lines)
    return rows


def create_custom_resource(
    kube_client: KubeClient,
    formatted_resource: Mapping[str, Any],
    version: str,
    kind: KubeKind,
    group: str,
) -> None:
    return kube_client.custom.create_namespaced_custom_object(
        group=group,
        version=version,
        namespace=f"paasta-{kind.plural}",
        plural=kind.plural,
        body=formatted_resource,
    )


def update_custom_resource(
    kube_client: KubeClient,
    formatted_resource: Mapping[str, Any],
    version: str,
    name: str,
    kind: KubeKind,
    group: str,
) -> None:
    co = kube_client.custom.get_namespaced_custom_object(
        name=name,
        group=group,
        version=version,
        namespace=f"paasta-{kind.plural}",
        plural=kind.plural,
    )
    formatted_resource["metadata"]["resourceVersion"] = co["metadata"][
        "resourceVersion"
    ]
    return kube_client.custom.replace_namespaced_custom_object(
        name=name,
        group=group,
        version=version,
        namespace=f"paasta-{kind.plural}",
        plural=kind.plural,
        body=formatted_resource,
    )


def list_custom_resources(
    kind: KubeKind,
    version: str,
    kube_client: KubeClient,
    group: str,
    label_selector: str = "",
) -> Sequence[KubeCustomResource]:
    crs = kube_client.custom.list_namespaced_custom_object(
        group=group,
        version=version,
        label_selector=label_selector,
        plural=kind.plural,
        namespace=f"paasta-{kind.plural}",
    )
    kube_custom_resources = []
    for cr in crs["items"]:
        try:
            kube_custom_resources.append(
                KubeCustomResource(
                    service=cr["metadata"]["labels"]["paasta.yelp.com/service"],
                    instance=cr["metadata"]["labels"]["paasta.yelp.com/instance"],
                    config_sha=cr["metadata"]["labels"]["paasta.yelp.com/config_sha"],
                    git_sha=cr["metadata"]["labels"].get("paasta.yelp.com/git_sha", ""),
                    kind=cr["kind"],
                    namespace=cr["metadata"]["namespace"],
                    name=cr["metadata"]["name"],
                )
            )
        except KeyError as e:
            log.debug(
                f"Ignoring custom resource that is missing paasta label {e}: {cr}"
            )
            continue
    return kube_custom_resources


def delete_custom_resource(
    kube_client: KubeClient,
    name: str,
    namespace: str,
    group: str,
    version: str,
    plural: str,
) -> None:
    return kube_client.custom.delete_namespaced_custom_object(
        name=name,
        namespace=namespace,
        group=group,
        version=version,
        plural=plural,
        body=V1DeleteOptions(),
    )


def max_unavailable(instance_count: int, bounce_margin_factor: float) -> int:
    if instance_count == 0:
        return 0
    else:
        return max(
            instance_count - int(math.ceil(instance_count * bounce_margin_factor)), 1
        )


def pod_disruption_budget_for_service_instance(
    service: str,
    instance: str,
    max_unavailable: Union[str, int],
    namespace: str,
) -> V1PodDisruptionBudget:
    return V1PodDisruptionBudget(
        metadata=V1ObjectMeta(
            name=get_kubernetes_app_name(service, instance),
            namespace=namespace,
        ),
        spec=V1PodDisruptionBudgetSpec(
            max_unavailable=max_unavailable,
            selector=V1LabelSelector(
                match_labels={
                    "paasta.yelp.com/service": service,
                    "paasta.yelp.com/instance": instance,
                }
            ),
        ),
    )


def create_pod_disruption_budget(
    kube_client: KubeClient,
    pod_disruption_budget: V1PodDisruptionBudget,
    namespace: str,
) -> None:
    return kube_client.policy.create_namespaced_pod_disruption_budget(
        namespace=namespace, body=pod_disruption_budget
    )


def set_instances_for_kubernetes_service(
    kube_client: KubeClient,
    service_config: KubernetesDeploymentConfig,
    instance_count: int,
) -> None:
    name = service_config.get_sanitised_deployment_name()
    formatted_application = service_config.format_kubernetes_app()
    formatted_application.spec.replicas = instance_count
    if service_config.get_persistent_volumes():
        kube_client.deployments.patch_namespaced_stateful_set_scale(
            name=name,
            namespace=service_config.get_namespace(),
            body=formatted_application,
        )
    else:
        kube_client.deployments.patch_namespaced_deployment_scale(
            name=name,
            namespace=service_config.get_namespace(),
            body=formatted_application,
        )


def get_annotations_for_kubernetes_service(
    kube_client: KubeClient, service_config: KubernetesDeploymentConfig
) -> Dict:
    name = service_config.get_sanitised_deployment_name()
    if service_config.get_persistent_volumes():
        k8s_service = kube_client.deployments.read_namespaced_stateful_set(
            name=name, namespace=service_config.get_namespace()
        )
    else:
        k8s_service = kube_client.deployments.read_namespaced_deployment(
            name=name, namespace=service_config.get_namespace()
        )
    return k8s_service.metadata.annotations if k8s_service.metadata.annotations else {}


def write_annotation_for_kubernetes_service(
    kube_client: KubeClient,
    service_config: KubernetesDeploymentConfig,
    formatted_application: Union[V1Deployment, V1StatefulSet],
    annotation: Dict,
) -> None:
    name = formatted_application.metadata.name
    formatted_application.metadata.annotations = annotation
    if service_config.get_persistent_volumes():
        kube_client.deployments.patch_namespaced_stateful_set(
            name=name,
            namespace=service_config.get_namespace(),
            body=formatted_application,
        )
    else:
        kube_client.deployments.patch_namespaced_deployment(
            name=name,
            namespace=service_config.get_namespace(),
            body=formatted_application,
        )


def list_all_paasta_deployments(kube_client: KubeClient) -> Sequence[KubeDeployment]:
    """Gets deployments in all namespaces by passing the service label selector"""
    label_selectors = "paasta.yelp.com/service"
    return list_deployments_in_all_namespaces(
        kube_client=kube_client, label_selector=label_selectors
    )


def list_all_deployments(
    kube_client: KubeClient, namespace: str
) -> Sequence[KubeDeployment]:
    return list_deployments(kube_client=kube_client, namespace=namespace)


def list_matching_deployments(
    service: str,
    instance: str,
    *,
    namespace: str,
    kube_client: KubeClient,
) -> Sequence[KubeDeployment]:
    return list_deployments(
        kube_client,
        label_selector=f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}",
        namespace=namespace,
    )


def list_matching_deployments_in_all_namespaces(
    service: str,
    instance: str,
    kube_client: KubeClient,
) -> List[KubeDeployment]:
    return list_deployments_in_all_namespaces(
        kube_client,
        f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}",
    )


@async_timeout()
async def replicasets_for_service_instance(
    service: str, instance: str, kube_client: KubeClient, namespace: str
) -> Sequence[V1ReplicaSet]:
    async_list_replica_set = a_sync.to_async(
        kube_client.deployments.list_namespaced_replica_set
    )
    response = await async_list_replica_set(
        label_selector=f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}",
        namespace=namespace,
    )
    return response.items


@async_timeout()
async def controller_revisions_for_service_instance(
    service: str, instance: str, kube_client: KubeClient, namespace: str
) -> Sequence[V1ControllerRevision]:
    async_list_controller_revisions = a_sync.to_async(
        kube_client.deployments.list_namespaced_controller_revision
    )
    response = await async_list_controller_revisions(
        label_selector=f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}",
        namespace=namespace,
    )
    return response.items


@async_timeout(15)
async def pods_for_service_instance(
    service: str, instance: str, kube_client: KubeClient, namespace: str
) -> Sequence[V1Pod]:
    async_list_pods = a_sync.to_async(kube_client.core.list_namespaced_pod)
    response = await async_list_pods(
        label_selector=f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}",
        namespace=namespace,
    )
    return response.items


def get_pods_by_node(kube_client: KubeClient, node: V1Node) -> Sequence[V1Pod]:
|
|
3551
|
+
return kube_client.core.list_pod_for_all_namespaces(
|
|
3552
|
+
field_selector=f"spec.nodeName={node.metadata.name}"
|
|
3553
|
+
).items
|
|
3554
|
+
|
|
3555
|
+
|
|
3556
|
+
def get_all_pods(kube_client: KubeClient, namespace: Optional[str]) -> List[V1Pod]:
|
|
3557
|
+
if namespace:
|
|
3558
|
+
return kube_client.core.list_namespaced_pod(namespace=namespace).items
|
|
3559
|
+
else:
|
|
3560
|
+
return kube_client.core.list_pod_for_all_namespaces().items
|
|
3561
|
+
|
|
3562
|
+
|
|
3563
|
+
@time_cache(ttl=300)
|
|
3564
|
+
def get_all_pods_cached(kube_client: KubeClient, namespace: str) -> Sequence[V1Pod]:
|
|
3565
|
+
pods: Sequence[V1Pod] = get_all_pods(kube_client, namespace)
|
|
3566
|
+
return pods
|
|
3567
|
+
|
|
3568
|
+
|
|
3569
|
+
def filter_pods_by_service_instance(
|
|
3570
|
+
pod_list: Sequence[V1Pod], service: str, instance: str
|
|
3571
|
+
) -> Sequence[V1Pod]:
|
|
3572
|
+
return [
|
|
3573
|
+
pod
|
|
3574
|
+
for pod in pod_list
|
|
3575
|
+
if pod.metadata.labels is not None
|
|
3576
|
+
and pod.metadata.labels.get("paasta.yelp.com/service", "") == service
|
|
3577
|
+
and pod.metadata.labels.get("paasta.yelp.com/instance", "") == instance
|
|
3578
|
+
]
|
|
3579
|
+
|
|
3580
|
+
|
|
3581
|
+
def group_pods_by_service_instance(
|
|
3582
|
+
pods: Sequence[V1Pod],
|
|
3583
|
+
) -> Dict[str, Dict[str, List[V1Pod]]]:
|
|
3584
|
+
pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]] = {}
|
|
3585
|
+
for pod in pods:
|
|
3586
|
+
if pod.metadata.labels is not None:
|
|
3587
|
+
service = pod.metadata.labels.get("paasta.yelp.com/service")
|
|
3588
|
+
instance = pod.metadata.labels.get("paasta.yelp.com/instance")
|
|
3589
|
+
|
|
3590
|
+
if service and instance:
|
|
3591
|
+
if service not in pods_by_service_instance:
|
|
3592
|
+
pods_by_service_instance[service] = {}
|
|
3593
|
+
if instance not in pods_by_service_instance[service]:
|
|
3594
|
+
pods_by_service_instance[service][instance] = []
|
|
3595
|
+
|
|
3596
|
+
pods_by_service_instance[service][instance].append(pod)
|
|
3597
|
+
|
|
3598
|
+
return pods_by_service_instance
|
|
3599
|
+
|
|
3600
|
+
|
|
3601
|
+
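# A minimal shape sketch (service/instance names are hypothetical): given pods labelled
# paasta.yelp.com/service=foo with instances "main" and "canary",
# group_pods_by_service_instance(pods) returns
#     {"foo": {"main": [<V1Pod>, ...], "canary": [<V1Pod>, ...]}}
# so callers can look up every pod for a service/instance pair directly.
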
def _is_it_ready(
    it: Union[V1Pod, V1Node],
) -> bool:
    ready_conditions = [
        cond.status == "True"
        for cond in it.status.conditions or []
        if cond.type == "Ready"
    ]
    return all(ready_conditions) if ready_conditions else False


is_pod_ready = _is_it_ready
is_node_ready = _is_it_ready


def is_pod_completed(pod: V1Pod) -> bool:
    condition = get_pod_condition(pod, "ContainersReady")
    return condition.reason == "PodCompleted" if condition else False


def is_pod_scheduled(pod: V1Pod) -> bool:
    scheduled_condition = get_pod_condition(pod, "PodScheduled")
    return scheduled_condition.status == "True" if scheduled_condition else False

def get_pod_condition(pod: V1Pod, condition: str) -> Optional[V1PodCondition]:
    conditions = [
        cond for cond in pod.status.conditions or [] if cond.type == condition
    ]
    if conditions:
        return conditions[0]
    return None

class PodStatus(Enum):
    PENDING = (1,)
    RUNNING = (2,)
    SUCCEEDED = (3,)
    FAILED = (4,)
    UNKNOWN = (5,)


_POD_STATUS_NAME_TO_STATUS = {s.name.upper(): s for s in PodStatus}


def get_pod_status(
    pod: V1Pod,
) -> PodStatus:
    # TODO: we probably also need to deduce extended statuses here, like
    # `CrashLoopBackOff`, `ContainerCreating` timeout, and etc.
    return _POD_STATUS_NAME_TO_STATUS[pod.status.phase.upper()]


def parse_container_resources(resources: Mapping[str, str]) -> KubeContainerResources:
    cpu_str = resources.get("cpu")
    if not cpu_str:
        cpus = None
    elif cpu_str[-1] == "m":
        cpus = float(cpu_str[:-1]) / 1000
    else:
        cpus = float(cpu_str)

    mem_str = resources.get("memory")
    if not mem_str:
        mem_mb = None
    else:
        mem_mb = parse_size(mem_str) / 1000000

    disk_str = resources.get("ephemeral-storage")
    if not disk_str:
        disk_mb = None
    else:
        disk_mb = parse_size(disk_str) / 1000000

    return KubeContainerResources(cpus=cpus, mem=mem_mb, disk=disk_mb)

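# A minimal usage sketch (the resource strings are illustrative): Kubernetes-style CPU
# values with an "m" suffix are millicores, so "2500m" becomes 2.5 CPUs, while memory
# and ephemeral-storage strings are parsed to bytes and reported in MB.
#
#     parse_container_resources({"cpu": "2500m", "memory": "2048Mi", "ephemeral-storage": "10Gi"})
#     # -> KubeContainerResources(cpus=2.5, mem=..., disk=...)  # mem/disk depend on parse_size()
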
def get_active_versions_for_service(
    obj_list: Sequence[Union[V1Pod, V1ReplicaSet, V1Deployment, V1StatefulSet]],
) -> Set[Tuple[DeploymentVersion, str]]:
    ret = set()

    for obj in obj_list:
        config_sha = obj.metadata.labels.get("paasta.yelp.com/config_sha")
        if config_sha and config_sha.startswith("config"):
            config_sha = config_sha[len("config") :]

        git_sha = obj.metadata.labels.get("paasta.yelp.com/git_sha")
        if git_sha and git_sha.startswith("git"):
            git_sha = git_sha[len("git") :]

        image_version = obj.metadata.labels.get("paasta.yelp.com/image_version")

        # Suppress entries where we have no clue what's running.
        if git_sha or config_sha:
            ret.add(
                (
                    DeploymentVersion(sha=git_sha, image_version=image_version),
                    config_sha,
                )
            )
    return ret


def get_all_nodes(
    kube_client: KubeClient,
) -> List[V1Node]:
    return kube_client.core.list_node().items


@time_cache(ttl=60)
def get_all_nodes_cached(kube_client: KubeClient) -> Sequence[V1Node]:
    nodes: Sequence[V1Node] = get_all_nodes(kube_client)
    return nodes

def filter_nodes_by_blacklist(
    nodes: Sequence[V1Node], blacklist: DeployBlacklist, whitelist: DeployWhitelist
) -> Sequence[V1Node]:
    """Takes an input list of nodes and filters them based on the given blacklist.
    The blacklist is in the form of:

        [["location_type", "location"]]

    Where the inner list is something like ["region", "uswest1-prod"]

    :returns: The list of nodes after the filter
    """
    if whitelist:
        whitelist = (paasta_prefixed(whitelist[0]), whitelist[1])
    blacklist = [(paasta_prefixed(entry[0]), entry[1]) for entry in blacklist]
    return [
        node
        for node in nodes
        if host_passes_whitelist(node.metadata.labels, whitelist)
        and host_passes_blacklist(node.metadata.labels, blacklist)
    ]

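# A minimal usage sketch, assuming nodes carry "region" and "habitat" labels; the
# location values and the whitelist shape shown here are illustrative only:
#
#     nodes = get_all_nodes(kube_client)
#     survivors = filter_nodes_by_blacklist(
#         nodes,
#         blacklist=[["habitat", "uswest1astagef"]],
#         whitelist=("region", ["uswest1-prod"]),
#     )
#
# Only nodes in the whitelisted region that are not in the blacklisted habitat remain.
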
def paasta_prefixed(
    attribute: str,
) -> str:
    # discovery attributes are exempt for now
    if attribute in DISCOVERY_ATTRIBUTES:
        return YELP_ATTRIBUTE_PREFIX + attribute
    elif "/" in attribute:
        return attribute
    else:
        return PAASTA_ATTRIBUTE_PREFIX + attribute


def get_nodes_grouped_by_attribute(
    nodes: Sequence[V1Node], attribute: str
) -> Mapping[str, Sequence[V1Node]]:
    attribute = paasta_prefixed(attribute)
    sorted_nodes = sorted(
        nodes, key=lambda node: node.metadata.labels.get(attribute, "")
    )
    return {
        key: list(group)
        for key, group in itertools.groupby(
            sorted_nodes, key=lambda node: node.metadata.labels.get(attribute, "")
        )
        if key
    }


def get_kubernetes_app_name(service: str, instance: str) -> str:
    return "{service}-{instance}".format(
        service=sanitise_kubernetes_name(service),
        instance=sanitise_kubernetes_name(instance),
    )

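# A minimal sketch of the resulting naming scheme (the service/instance names are
# hypothetical): because sanitise_kubernetes_name() maps "_" to "--",
# get_kubernetes_app_name("example_service", "canary") returns
# "example--service-canary", which is the name used for the Deployment/StatefulSet.
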
def get_kubernetes_app_by_name(
    name: str, kube_client: KubeClient, namespace: str
) -> Union[V1Deployment, V1StatefulSet]:
    try:
        app = kube_client.deployments.read_namespaced_deployment_status(
            name=name, namespace=namespace
        )
        return app
    except ApiException as e:
        if e.status == 404:
            pass
        else:
            raise
    return kube_client.deployments.read_namespaced_stateful_set_status(
        name=name, namespace=namespace
    )


def create_deployment(
    kube_client: KubeClient,
    formatted_deployment: V1Deployment,
    namespace: str,
) -> None:
    return kube_client.deployments.create_namespaced_deployment(
        namespace=namespace, body=formatted_deployment
    )


def update_deployment(
    kube_client: KubeClient,
    formatted_deployment: V1Deployment,
    namespace: str,
) -> None:
    return kube_client.deployments.replace_namespaced_deployment(
        name=formatted_deployment.metadata.name,
        namespace=namespace,
        body=formatted_deployment,
    )


def patch_deployment(
    kube_client: KubeClient,
    formatted_deployment: V1Deployment,
    namespace: str,
) -> None:
    return kube_client.deployments.patch_namespaced_deployment(
        name=formatted_deployment.metadata.name,
        namespace=namespace,
        body=formatted_deployment,
    )


def delete_deployment(
    kube_client: KubeClient,
    deployment_name: str,
    namespace: str,
) -> None:
    return kube_client.deployments.delete_namespaced_deployment(
        name=deployment_name,
        namespace=namespace,
    )


def create_stateful_set(
    kube_client: KubeClient,
    formatted_stateful_set: V1StatefulSet,
    namespace: str,
) -> None:
    return kube_client.deployments.create_namespaced_stateful_set(
        namespace=namespace, body=formatted_stateful_set
    )


def update_stateful_set(
    kube_client: KubeClient,
    formatted_stateful_set: V1StatefulSet,
    namespace: str,
) -> None:
    return kube_client.deployments.replace_namespaced_stateful_set(
        name=formatted_stateful_set.metadata.name,
        namespace=namespace,
        body=formatted_stateful_set,
    )


def create_job(
    kube_client: KubeClient,
    formatted_job: V1Job,
    namespace: str,
) -> None:
    return kube_client.batches.create_namespaced_job(
        namespace=namespace,
        body=formatted_job,
    )


def get_event_timestamp(event: CoreV1Event) -> Optional[float]:
    # Cycle through timestamp attributes in order of preference
    for ts_attr in ["last_timestamp", "event_time", "first_timestamp"]:
        ts = getattr(event, ts_attr)
        if ts:
            return ts.timestamp()
    return None


@async_timeout()
async def get_events_for_object(
    kube_client: KubeClient,
    obj: Union[V1Pod, V1Deployment, V1StatefulSet, V1ReplicaSet],
    kind: str,  # for some reason, obj.kind isn't populated when this function is called so we pass it in by hand
    max_age_in_seconds: Optional[int] = None,
) -> List[CoreV1Event]:

    try:
        # this is a blocking call since it does network I/O and can end up significantly blocking the
        # asyncio event loop when doing things like getting events for all the Pods for a service with
        # a large amount of replicas. therefore, we need to wrap the kubernetes client into something
        # that's awaitable so that we can actually do things concurrently and not serially
        events = await a_sync.to_async(kube_client.core.list_namespaced_event)(
            namespace=obj.metadata.namespace,
            field_selector=f"involvedObject.name={obj.metadata.name},involvedObject.kind={kind}",
            limit=MAX_EVENTS_TO_RETRIEVE,
        )
        events = events.items if events else []
        if max_age_in_seconds and max_age_in_seconds > 0:
            # NOTE: the k8s API returns timestamps in UTC, so we make sure to always work in UTC
            min_timestamp = datetime.now(timezone.utc).timestamp() - max_age_in_seconds
            events = [
                evt
                for evt in events
                if get_event_timestamp(evt) is None
                or get_event_timestamp(evt) > min_timestamp
            ]
        return events
    except ApiException:
        return []

@async_timeout()
async def get_hpa(
    kube_client: KubeClient,
    name: str,
    namespace: str,
) -> Optional[V2HorizontalPodAutoscaler]:
    async_get_hpa = a_sync.to_async(
        kube_client.autoscaling.read_namespaced_horizontal_pod_autoscaler
    )
    try:
        return await async_get_hpa(name, namespace)
    except ApiException as e:
        if e.status == 404:
            return None
        else:
            raise

def get_kubernetes_app_deploy_status(
    app: Union[V1Deployment, V1StatefulSet],
    desired_instances: int,
) -> Tuple[int, str]:
    if app.status.ready_replicas is None:
        if desired_instances == 0:
            deploy_status = KubernetesDeployStatus.Stopped
        else:
            deploy_status = KubernetesDeployStatus.Waiting
    elif app.status.ready_replicas != desired_instances:
        deploy_status = KubernetesDeployStatus.Waiting
    # updated_replicas can currently be None for stateful sets so we may not correctly detect status for now
    # when https://github.com/kubernetes/kubernetes/pull/62943 lands in a release this should work for both:
    elif app.status.updated_replicas is not None and (
        app.status.updated_replicas < desired_instances
    ):
        deploy_status = KubernetesDeployStatus.Deploying
    elif app.status.replicas == 0 and desired_instances == 0:
        deploy_status = KubernetesDeployStatus.Stopped
    else:
        deploy_status = KubernetesDeployStatus.Running
    # Temporarily removing the message because the events query it used was overloading etcd
    # TODO: change the implementation or remove the deploy message entirely
    deploy_message = ""
    return deploy_status, deploy_message


class KubernetesDeployStatus:
    """An enum to represent Kubernetes app deploy status.
    Changing name of the keys will affect both the paasta CLI and API.
    """

    Running, Deploying, Waiting, Stopped = range(0, 4)

    @classmethod
    def tostring(cls, val: int) -> str:
        for k, v in vars(cls).items():
            if v == val:
                return k
        raise ValueError("Unknown Kubernetes deploy status %d" % val)

    @classmethod
    def fromstring(cls, _str: str) -> int:
        return getattr(cls, _str, None)


def is_kubernetes_available() -> bool:
    return Path(os.environ.get("KUBECONFIG", KUBE_CONFIG_PATH)).exists()

def create_secret(
    kube_client: KubeClient,
    service_name: str,
    secret_name: str,
    secret_data: Dict[str, str],
    namespace: str,
) -> None:
    """
    See restrictions on kubernetes secret at https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Secret.md
    :param secret_name: Expect properly formatted kubernetes secret name, see _get_secret_name()
    :param secret_data: Expect a mapping of string-to-string where values are base64-encoded
    :param service_name: Expect unsanitised service name, since it's used as a label it will have 63 character limit
    :param namespace: Unsanitized namespace of a service that will use the secret
    :raises ApiException:
    """
    kube_client.core.create_namespaced_secret(
        namespace=namespace,
        body=V1Secret(
            metadata=V1ObjectMeta(
                name=secret_name,
                labels={
                    "yelp.com/paasta_service": sanitise_label_value(service_name),
                    "paasta.yelp.com/service": sanitise_label_value(service_name),
                },
            ),
            data=secret_data,
        ),
    )


def update_secret(
    kube_client: KubeClient,
    service_name: str,
    secret_name: str,
    secret_data: Dict[str, str],
    namespace: str,
) -> None:
    """
    Expect secret_name to exist, e.g. kubectl get secret
    :param service_name: Expect unsanitised service name
    :param secret_data: Expect a mapping of string-to-string where values are base64-encoded
    :param namespace: Unsanitized namespace of a service that will use the secret
    :raises ApiException:
    """
    kube_client.core.replace_namespaced_secret(
        name=secret_name,
        namespace=namespace,
        body=V1Secret(
            metadata=V1ObjectMeta(
                name=secret_name,
                labels={
                    "yelp.com/paasta_service": sanitise_label_value(service_name),
                    "paasta.yelp.com/service": sanitise_label_value(service_name),
                },
            ),
            data=secret_data,
        ),
    )


@time_cache(ttl=300)
def get_secret_signature(
    kube_client: KubeClient,
    signature_name: str,
    namespace: str,
) -> Optional[str]:
    """
    :param signature_name: Expect the signature to exist in kubernetes configmap
    :return: Kubernetes configmap as a signature
    :raises ApiException:
    """
    try:
        signature = kube_client.core.read_namespaced_config_map(
            name=signature_name,
            namespace=namespace,
        )
    except ApiException as e:
        if e.status == 404:
            return None
        else:
            raise
    if not signature:
        return None
    else:
        return signature.data["signature"]


def update_secret_signature(
    kube_client: KubeClient,
    service_name: str,
    signature_name: str,
    secret_signature: str,
    namespace: str,
) -> None:
    """
    :param service_name: Expect unsanitised service_name
    :param signature_name: Expect signature_name to exist in kubernetes configmap
    :param secret_signature: Signature to replace with
    :raises ApiException:
    """
    kube_client.core.replace_namespaced_config_map(
        name=signature_name,
        namespace=namespace,
        body=V1ConfigMap(
            metadata=V1ObjectMeta(
                name=signature_name,
                labels={
                    "yelp.com/paasta_service": sanitise_label_value(service_name),
                    "paasta.yelp.com/service": sanitise_label_value(service_name),
                },
            ),
            data={"signature": secret_signature},
        ),
    )


def create_secret_signature(
    kube_client: KubeClient,
    service_name: str,
    signature_name: str,
    secret_signature: str,
    namespace: str,
) -> None:
    """
    :param service_name: Expect unsanitised service_name
    :param signature_name: Expected properly formatted signature, see _get_secret_signature_name()
    :param secret_signature: Signature value
    :param namespace: Unsanitized namespace of a service that will use the signature
    """
    kube_client.core.create_namespaced_config_map(
        namespace=namespace,
        body=V1ConfigMap(
            metadata=V1ObjectMeta(
                name=signature_name,
                labels={
                    "yelp.com/paasta_service": sanitise_label_value(service_name),
                    "paasta.yelp.com/service": sanitise_label_value(service_name),
                },
            ),
            data={"signature": secret_signature},
        ),
    )

def sanitise_kubernetes_name(
    service: str,
) -> str:
    """
    Sanitizes a kubernetes name so that hyphen (-) can be used as a delimiter
    """
    name = service.replace("_", "--")
    if name.startswith("--"):
        name = name.replace("--", "underscore-", 1)
    return name.lower()

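# A minimal worked example (hypothetical names): sanitise_kubernetes_name("Example_Service")
# returns "example--service", and a leading underscore is spelled out, so
# sanitise_kubernetes_name("_internal") returns "underscore-internal".
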
def load_custom_resource_definitions(
    system_paasta_config: SystemPaastaConfig,
) -> Sequence[CustomResourceDefinition]:
    custom_resources = []
    for custom_resource_dict in system_paasta_config.get_kubernetes_custom_resources():
        kube_kind = KubeKind(**custom_resource_dict.pop("kube_kind"))  # type: ignore
        custom_resources.append(
            CustomResourceDefinition(  # type: ignore
                kube_kind=kube_kind, **custom_resource_dict  # type: ignore
            )
        )
    return custom_resources


def create_pod_topology_spread_constraints(
    service: str,
    instance: str,
    topology_spread_constraints: List[TopologySpreadConstraintDict],
) -> List[V1TopologySpreadConstraint]:
    """
    Applies cluster-level topology spread constraints to every Pod template.
    This allows us to configure default topology spread constraints on EKS where we cannot configure the scheduler.
    """
    if not topology_spread_constraints:
        return []

    selector = V1LabelSelector(
        match_labels={
            "paasta.yelp.com/service": service,
            "paasta.yelp.com/instance": instance,
        }
    )

    pod_topology_spread_constraints = []
    for constraint in topology_spread_constraints:
        pod_topology_spread_constraints.append(
            V1TopologySpreadConstraint(
                label_selector=selector,
                topology_key=constraint.get(
                    "topology_key", None
                ),  # ValueError will be raised if unset
                max_skew=constraint.get("max_skew", 1),
                when_unsatisfiable=constraint.get(
                    "when_unsatisfiable", "ScheduleAnyway"
                ),
            )
        )

    return pod_topology_spread_constraints

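# A minimal usage sketch (the constraint values are illustrative, not cluster defaults):
#
#     constraints = create_pod_topology_spread_constraints(
#         service="example_service",
#         instance="main",
#         topology_spread_constraints=[
#             {"topology_key": "topology.kubernetes.io/zone", "max_skew": 1},
#         ],
#     )
#
# Each entry becomes a V1TopologySpreadConstraint whose label selector targets only this
# service/instance's pods, with when_unsatisfiable defaulting to "ScheduleAnyway".
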
def sanitised_cr_name(service: str, instance: str) -> str:
    sanitised_service = sanitise_kubernetes_name(service)
    sanitised_instance = sanitise_kubernetes_name(instance)
    return f"{sanitised_service}-{sanitised_instance}"


def get_cr(
    kube_client: KubeClient, cr_id: Mapping[str, str]
) -> Optional[Mapping[str, Any]]:
    try:
        return kube_client.custom.get_namespaced_custom_object(**cr_id)
    except ApiException as e:
        if e.status == 404:
            return None
        else:
            raise


def set_cr_desired_state(
    kube_client: KubeClient, cr_id: Mapping[str, str], desired_state: str
) -> str:
    cr = kube_client.custom.get_namespaced_custom_object(**cr_id)
    if cr.get("status", {}).get("state") == desired_state:
        return cr["status"]

    if "metadata" not in cr:
        cr["metadata"] = {}
    if "annotations" not in cr["metadata"]:
        cr["metadata"]["annotations"] = {}
    cr["metadata"]["annotations"]["yelp.com/desired_state"] = desired_state
    cr["metadata"]["annotations"]["paasta.yelp.com/desired_state"] = desired_state
    kube_client.custom.replace_namespaced_custom_object(**cr_id, body=cr)
    status = cr.get("status")
    return status

def get_pod_hostname(kube_client: KubeClient, pod: V1Pod) -> str:
    """Gets the hostname of a pod's node from labels"""
    if not pod.spec.node_name:  # can be None, if pod not yet scheduled
        return "NotScheduled"
    try:
        node = kube_client.core.read_node(name=pod.spec.node_name)
    except ApiException:
        # fall back to node name (which has the IP) if the node somehow doesn't exist
        return pod.spec.node_name
    # if the label has disappeared (say we changed it), default to node name
    return node.metadata.labels.get("yelp.com/hostname", pod.spec.node_name)


def get_pod_node(
    kube_client: KubeClient, pod: V1Pod, cache_nodes: bool = False
) -> Optional[V1Node]:
    if cache_nodes:
        nodes = get_all_nodes_cached(kube_client)
    else:
        nodes = get_all_nodes(kube_client)
    running_node = [node for node in nodes if node.metadata.name == pod.spec.node_name]
    return running_node[0] if running_node else None


def to_node_label(label: str) -> str:
    """k8s-ifies certain special node labels"""
    if label in {"instance_type", "instance-type"}:
        return "node.kubernetes.io/instance-type"
    elif label in {
        "datacenter",
        "ecosystem",
        "habitat",
        "hostname",
        "region",
        "superregion",
    }:
        return f"yelp.com/{label}"
    return label

def get_all_service_accounts(
    kube_client: KubeClient,
    namespace: str,
    label_selector: Optional[str] = None,
) -> Sequence[V1ServiceAccount]:
    return kube_client.core.list_namespaced_service_account(
        namespace=namespace, label_selector=label_selector
    ).items


def get_all_role_bindings(
    kube_client: KubeClient,
    namespace: str,
) -> Sequence[V1RoleBinding]:
    return kube_client.rbac.list_namespaced_role_binding(namespace=namespace).items


def get_all_limit_ranges(
    kube_client: KubeClient,
    namespace: str,
) -> Sequence[V1LimitRange]:
    return kube_client.core.list_namespaced_limit_range(namespace).items


_RE_NORMALIZE_IAM_ROLE = re.compile(r"[^0-9a-zA-Z]+")


def get_service_account_name(
    iam_role: str,
    k8s_role: Optional[str] = None,
) -> str:
    # the service account is expected to always be prefixed with paasta- as using the actual namespace
    # potentially wastes a lot of characters (e.g., paasta-nrtsearchservices) that could be used for
    # the actual name
    if iam_role:  # this is either an empty string or a real role
        # it's possible for an IAM role to be used for multiple purposes. Some usages may require a
        # Kubernetes Role attached to the Service Account (e.g., Spark drivers may access S3 but also
        # need to manage Spark executor Pods), while "normal" services/batches need a Service Account
        # with only an IAM role attached.
        # to support these two usecases, we'll suffix the name of a Service Account with the
        # Kubernetes Role name to disambiguate between the two.
        if k8s_role:
            sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role.lower())}--{k8s_role}"
        else:
            sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role.lower())}"
    # until Core ML migrates Spark to use Pod Identity, we need to support starting Spark drivers with a Service Account
    # that only has k8s access
    elif not iam_role and k8s_role:
        sa_name = f"paasta--{k8s_role}"
    # we should never get here in normal usage, but just in case we make a mistake in the future :)
    else:
        raise ValueError(
            "Expected at least one of iam_role or k8s_role to be passed in!"
        )

    return sa_name

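# A minimal sketch of the resulting names (the role ARN and Role name below are
# hypothetical): every non-alphanumeric run in the IAM role is collapsed to "-", so
# get_service_account_name("arn:aws:iam::012345678901:role/example_role") yields
# "paasta--arn-aws-iam-012345678901-role-example-role", and passing
# k8s_role="spark-driver" as well appends "--spark-driver" to that name.
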
def ensure_service_account(
    iam_role: str,
    namespace: str,
    kube_client: KubeClient,
    k8s_role: Optional[str] = None,
) -> None:
    sa_name = get_service_account_name(iam_role, k8s_role)

    if not any(
        sa.metadata and sa.metadata.name == sa_name
        for sa in get_all_service_accounts(kube_client, namespace)
    ):
        sa = V1ServiceAccount(
            kind="ServiceAccount",
            metadata=V1ObjectMeta(
                name=sa_name,
                namespace=namespace,
                annotations={"eks.amazonaws.com/role-arn": iam_role},
            ),
        )
        kube_client.core.create_namespaced_service_account(namespace=namespace, body=sa)

    # we're expecting that any Role dynamically associated with a Service Account already exists.
    # at Yelp, this means that we have a version-controlled resource for the Role in Puppet.
    # and since the Role already exists, we just need to associate it with the Service Account through
    # a Role Binding
    if k8s_role:
        # that said, we still check that there's a RoleBinding every time this function is called so that
        # we can self-heal if we somehow create a Service Account and then fail to create a Role Binding
        # due to a transient issue
        if not any(
            rb.metadata and rb.metadata.name == sa_name
            for rb in get_all_role_bindings(kube_client, namespace)
        ):
            role_binding = V1RoleBinding(
                metadata=V1ObjectMeta(
                    name=sa_name,
                    namespace=namespace,
                ),
                role_ref=V1RoleRef(
                    api_group="rbac.authorization.k8s.io",
                    kind="Role",
                    name=k8s_role,
                ),
                subjects=[
                    V1Subject(
                        kind="ServiceAccount",
                        namespace=namespace,
                        name=sa_name,
                    ),
                ],
            )
            kube_client.rbac.create_namespaced_role_binding(
                namespace=namespace, body=role_binding
            )


def mode_to_int(mode: Optional[Union[str, int]]) -> Optional[int]:
    if mode is not None:
        if isinstance(mode, str):
            if len(mode) < 2 or mode[0] != "0":
                raise ValueError(f"Invalid mode: {mode}")
            mode = int(mode[1:], 8)
    return mode

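# A minimal worked example: octal-string modes must start with "0", so
# mode_to_int("0644") == 420 and mode_to_int("0755") == 493, while an int such as
# mode_to_int(420) is returned unchanged and mode_to_int(None) stays None.
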
def update_crds(
    kube_client: KubeClient,
    desired_crds: Collection[Union[V1CustomResourceDefinition]],
    existing_crds: Union[V1CustomResourceDefinitionList],
) -> bool:
    for desired_crd in desired_crds:
        existing_crd = None
        for crd in existing_crds.items:
            if crd.metadata.name == desired_crd.metadata["name"]:
                existing_crd = crd
                break
        try:

            apiextensions = kube_client.apiextensions

            if existing_crd:
                desired_crd.metadata[
                    "resourceVersion"
                ] = existing_crd.metadata.resource_version

                apiextensions.replace_custom_resource_definition(
                    name=desired_crd.metadata["name"], body=desired_crd
                )
            else:
                try:
                    apiextensions.create_custom_resource_definition(body=desired_crd)
                except ValueError as err:
                    # TODO: kubernetes server will sometimes reply with conditions:null,
                    # figure out how to deal with this correctly, for more details:
                    # https://github.com/kubernetes/kubernetes/pull/64996
                    if "`conditions`, must not be `None`" in str(err):
                        pass
                    else:
                        raise err
            log.info(f"deployed internal crd {desired_crd.metadata['name']}")
        except ApiException as exc:
            log.error(
                f"error deploying crd {desired_crd.metadata['name']}, "
                f"status: {exc.status}, reason: {exc.reason}"
            )
            log.debug(exc.body)
            return False

    return True


def sanitise_label_value(value: str) -> str:
    """
    :param value: value is sanitized and limited to 63 characters due to kubernetes restriction
    :return: Sanitised at most 63-character label value
    """
    return limit_size_with_hash(
        sanitise_kubernetes_name(value),
        limit=63,
    )

def _get_secret_name(
    namespace: str, secret_identifier: str, service_name: str, key_name: str
) -> str:
    """
    Use to generate kubernetes secret names.
    Secret names have a limit of 253 characters due to https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-subdomain-names
    However, if you are also storing the secret name as a label value then it has a lower limit of 63 characters.
    Hyphen (-) is used as a delimiter between values.

    :param namespace: Unsanitised namespace of a service that will use the signature
    :param secret_identifier: Identifies the type of secret
    :param service_name: Unsanitised service_name
    :param key_name: Name of the actual secret, typically specified in a configuration file
    :return: Sanitised at most 253-character kubernetes secret name
    """
    return limit_size_with_hash(
        "-".join(
            [
                namespace,
                secret_identifier,
                sanitise_kubernetes_name(service_name),
                sanitise_kubernetes_name(key_name),
            ]
        ),
        limit=253,
    )

def _get_secret_signature_name(
    namespace: str, secret_identifier: str, service_name: str, key_name: str
) -> str:
    """
    :param namespace: Unsanitised namespace of a service that will use the signature
    :param secret_identifier: Identifies the type of secret
    :param service_name: Unsanitised service_name
    :param key_name: Name of the actual secret, typically specified in a configuration file
    :return: Sanitised signature name as kubernetes configmap name with at most 253 characters
    """
    return limit_size_with_hash(
        "-".join(
            [
                namespace,
                secret_identifier,
                sanitise_kubernetes_name(service_name),
                sanitise_kubernetes_name(key_name),
                "signature",
            ]
        ),
        limit=253,
    )

def get_paasta_secret_name(namespace: str, service_name: str, key_name: str) -> str:
    """
    Use whenever creating or referencing a PaaSTA secret

    :param namespace: Unsanitised namespace of a service that will use the signature
    :param service_name: Unsanitised service_name
    :param key_name: Name of the actual secret, typically specified in a configuration file
    :return: Sanitised PaaSTA secret name
    """
    return _get_secret_name(
        namespace=namespace,
        secret_identifier="secret",
        service_name=service_name,
        key_name=key_name,
    )

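# A minimal worked example (service and secret names are hypothetical): assuming the
# joined name stays under the 253-character limit so limit_size_with_hash leaves it
# untouched, get_paasta_secret_name("paasta", "example_service", "db_password")
# returns "paasta-secret-example--service-db--password".
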
def get_paasta_secret_signature_name(
    namespace: str, service_name: str, key_name: str
) -> str:
    """
    Get PaaSTA signature name stored as kubernetes configmap

    :param namespace: Unsanitised namespace of a service that will use the signature
    :param service_name: Unsanitised service_name
    :param key_name: Name of the actual secret, typically specified in a configuration file
    :return: Sanitised PaaSTA signature name
    """
    return _get_secret_signature_name(
        namespace=namespace,
        secret_identifier="secret",
        service_name=service_name,
        key_name=key_name,
    )

def get_secret(
    kube_client: KubeClient,
    secret_name: str,
    key_name: str,
    *,
    namespace: str,
    decode: bool = True,
) -> Union[str, bytes]:
    """
    :param secret_name: Expect a properly formatted kubernetes secret name, and that it exists
    :param key_name: Expect key_name to be a key in the data section
    :raises ApiException:
    :raises KeyError: if key_name does not exist in the kubernetes secret's data section
    """
    secret_data = kube_client.core.read_namespaced_secret(
        name=secret_name, namespace=namespace
    ).data[key_name]
    # String secrets (e.g. yaml config files) need to be decoded
    # Binary secrets (e.g. TLS Keystore or binary certificate files) cannot be decoded
    if decode:
        return base64.b64decode(secret_data).decode("utf-8")
    return base64.b64decode(secret_data)

def get_kubernetes_secret_env_variables(
    kube_client: KubeClient,
    environment: Dict[str, str],
    service_name: str,
    namespace: str,
) -> Dict[str, str]:
    decrypted_secrets = {}
    for k, v in environment.items():
        if is_secret_ref(v):
            secret = get_secret_name_from_ref(v)
            # decode=True because environment variables need to be strings and not binary
            # Cast to string to make mypy / type-hints happy
            decrypted_secrets[k] = str(
                get_secret(
                    kube_client,
                    secret_name=get_paasta_secret_name(
                        namespace,
                        SHARED_SECRET_SERVICE if is_shared_secret(v) else service_name,
                        secret,
                    ),
                    key_name=secret,
                    decode=True,
                    namespace=namespace,
                )
            )
    return decrypted_secrets


def get_kubernetes_secret_volumes(
    kube_client: KubeClient,
    secret_volumes_config: Sequence[SecretVolume],
    service_name: str,
    namespace: str,
) -> Dict[str, Union[str, bytes]]:
    secret_volumes = {}
    # The config might look one of two ways:
    # Implicit full path consisting of the container path and the secret name:
    #   secret_volumes:
    #   - container_path: /nail/foo
    #     secret_name: the_secret_1
    #   - container_path: /nail/bar
    #     secret_name: the_secret_2
    #
    # This ^ should result in two files (/nail/foo/the_secret_1, /nail/bar/the_secret_2)
    #
    # OR
    #
    # Multiple files within a folder with explicit path names:
    #   secret_volumes:
    #   - container_path: /nail/foo
    #     items:
    #     - key: the_secret_1
    #       path: bar.yaml
    #     - key: the_secret_2
    #       path: baz.yaml
    #
    # This ^ should result in 2 files (/nail/foo/bar.yaml, /nail/foo/baz.yaml)
    # We need to support both cases
    for secret_volume in secret_volumes_config:
        if "items" not in secret_volume:
            secret_contents = get_secret(
                kube_client,
                secret_name=get_paasta_secret_name(
                    namespace, service_name, secret_volume["secret_name"]
                ),
                key_name=secret_volume["secret_name"],
                decode=False,
                namespace=namespace,
            )
            # Index by container path => the actual secret contents, to be used downstream to create local files and mount into the container
            secret_volumes[
                os.path.join(
                    secret_volume["container_path"], secret_volume["secret_name"]
                )
            ] = secret_contents
        else:
            for item in secret_volume["items"]:
                secret_contents = get_secret(
                    kube_client,
                    secret_name=get_paasta_secret_name(
                        namespace, service_name, item["key"]
                    ),
                    key_name=item["key"],
                    decode=False,
                    namespace=namespace,
                )
                secret_volumes[
                    os.path.join(secret_volume["container_path"], item["path"])
                ] = secret_contents

    return secret_volumes


@lru_cache()
def get_authenticating_services(soa_dir: str = DEFAULT_SOA_DIR) -> Set[str]:
    """Load list of services participating in authenticated traffic"""
    authenticating_services_conf_path = os.path.join(soa_dir, "authenticating.yaml")
    config = service_configuration_lib.read_yaml_file(authenticating_services_conf_path)
    return set(config.get("services", []))

def add_volumes_for_authenticating_services(
    service_name: str,
    config_volumes: List[ProjectedSAVolume],
    soa_dir: str = DEFAULT_SOA_DIR,
) -> List[ProjectedSAVolume]:
    """Add a projected service account volume to the list of volumes if the service
    participates in authenticated traffic. In case of changes, a new list is returned;
    the input list is never updated in-place.

    :param str service_name: name of the service
    :param List[ProjectedSAVolume] config_volumes: existing projected volumes from service config
    :param str soa_dir: path to SOA configurations directory
    :return: updated list of projected service account volumes
    """
    token_config = load_system_paasta_config().get_service_auth_token_volume_config()
    if (
        token_config
        and service_name in get_authenticating_services(soa_dir)
        and not any(volume == token_config for volume in config_volumes)
    ):
        config_volumes = [token_config, *config_volumes]
    return config_volumes