paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1362 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Any
|
|
5
|
+
from typing import DefaultDict
|
|
6
|
+
from typing import Dict
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
from typing import List
|
|
9
|
+
from typing import Mapping
|
|
10
|
+
from typing import MutableMapping
|
|
11
|
+
from typing import Optional
|
|
12
|
+
from typing import Sequence
|
|
13
|
+
from typing import Set
|
|
14
|
+
from typing import Tuple
|
|
15
|
+
from typing import Union
|
|
16
|
+
|
|
17
|
+
import a_sync
|
|
18
|
+
import pytz
|
|
19
|
+
from kubernetes.client import V1Container
|
|
20
|
+
from kubernetes.client import V1ControllerRevision
|
|
21
|
+
from kubernetes.client import V1Pod
|
|
22
|
+
from kubernetes.client import V1Probe
|
|
23
|
+
from kubernetes.client import V1ReplicaSet
|
|
24
|
+
from kubernetes.client.rest import ApiException
|
|
25
|
+
from mypy_extensions import TypedDict
|
|
26
|
+
|
|
27
|
+
from paasta_tools import cassandracluster_tools
|
|
28
|
+
from paasta_tools import eks_tools
|
|
29
|
+
from paasta_tools import envoy_tools
|
|
30
|
+
from paasta_tools import flink_tools
|
|
31
|
+
from paasta_tools import kafkacluster_tools
|
|
32
|
+
from paasta_tools import kubernetes_tools
|
|
33
|
+
from paasta_tools import monkrelaycluster_tools
|
|
34
|
+
from paasta_tools import nrtsearchservice_tools
|
|
35
|
+
from paasta_tools import smartstack_tools
|
|
36
|
+
from paasta_tools.cli.utils import LONG_RUNNING_INSTANCE_TYPE_HANDLERS
|
|
37
|
+
from paasta_tools.instance.hpa_metrics_parser import HPAMetricsDict
|
|
38
|
+
from paasta_tools.instance.hpa_metrics_parser import HPAMetricsParser
|
|
39
|
+
from paasta_tools.kubernetes_tools import get_pod_event_messages
|
|
40
|
+
from paasta_tools.kubernetes_tools import get_tail_lines_for_kubernetes_container
|
|
41
|
+
from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
|
|
42
|
+
from paasta_tools.kubernetes_tools import paasta_prefixed
|
|
43
|
+
from paasta_tools.long_running_service_tools import (
|
|
44
|
+
get_expected_instance_count_for_namespace,
|
|
45
|
+
)
|
|
46
|
+
from paasta_tools.long_running_service_tools import LongRunningServiceConfig
|
|
47
|
+
from paasta_tools.long_running_service_tools import ServiceNamespaceConfig
|
|
48
|
+
from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
|
|
49
|
+
from paasta_tools.smartstack_tools import match_backends_and_pods
|
|
50
|
+
from paasta_tools.utils import calculate_tail_lines
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
INSTANCE_TYPES_CR = {
|
|
54
|
+
"flink",
|
|
55
|
+
"flinkeks",
|
|
56
|
+
"cassandracluster",
|
|
57
|
+
"kafkacluster",
|
|
58
|
+
}
|
|
59
|
+
INSTANCE_TYPES_K8S = {
|
|
60
|
+
"cassandracluster",
|
|
61
|
+
"eks",
|
|
62
|
+
"kubernetes",
|
|
63
|
+
}
|
|
64
|
+
INSTANCE_TYPES = INSTANCE_TYPES_K8S.union(INSTANCE_TYPES_CR)
|
|
65
|
+
|
|
66
|
+
INSTANCE_TYPES_WITH_SET_STATE = {"flink", "flinkeks"}
|
|
67
|
+
INSTANCE_TYPE_CR_ID = dict(
|
|
68
|
+
flink=flink_tools.cr_id,
|
|
69
|
+
flinkeks=flink_tools.cr_id,
|
|
70
|
+
cassandracluster=cassandracluster_tools.cr_id,
|
|
71
|
+
kafkacluster=kafkacluster_tools.cr_id,
|
|
72
|
+
nrtsearchservice=nrtsearchservice_tools.cr_id,
|
|
73
|
+
nrtsearchserviceeks=nrtsearchservice_tools.cr_id,
|
|
74
|
+
monkrelaycluster=monkrelaycluster_tools.cr_id,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ServiceMesh(Enum):
|
|
79
|
+
SMARTSTACK = "smartstack"
|
|
80
|
+
ENVOY = "envoy"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class KubernetesAutoscalingStatusDict(TypedDict):
|
|
84
|
+
min_instances: int
|
|
85
|
+
max_instances: int
|
|
86
|
+
metrics: List
|
|
87
|
+
desired_replicas: int
|
|
88
|
+
last_scale_time: str
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class KubernetesVersionDict(TypedDict, total=False):
|
|
92
|
+
name: str
|
|
93
|
+
type: str
|
|
94
|
+
replicas: int
|
|
95
|
+
ready_replicas: int
|
|
96
|
+
create_timestamp: int
|
|
97
|
+
git_sha: str
|
|
98
|
+
image_version: Optional[str]
|
|
99
|
+
config_sha: str
|
|
100
|
+
pods: Sequence[Mapping[str, Any]]
|
|
101
|
+
namespace: str
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def cr_id(service: str, instance: str, instance_type: str) -> Mapping[str, str]:
|
|
105
|
+
cr_id_fn = INSTANCE_TYPE_CR_ID.get(instance_type)
|
|
106
|
+
if not cr_id_fn:
|
|
107
|
+
raise RuntimeError(f"Unknown instance type {instance_type}")
|
|
108
|
+
return cr_id_fn(service, instance)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def can_handle(instance_type: str) -> bool:
|
|
112
|
+
return instance_type in INSTANCE_TYPES
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def can_set_state(instance_type: str) -> bool:
|
|
116
|
+
return instance_type in INSTANCE_TYPES_WITH_SET_STATE
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def set_cr_desired_state(
|
|
120
|
+
kube_client: kubernetes_tools.KubeClient,
|
|
121
|
+
service: str,
|
|
122
|
+
instance: str,
|
|
123
|
+
instance_type: str,
|
|
124
|
+
desired_state: str,
|
|
125
|
+
) -> None:
|
|
126
|
+
try:
|
|
127
|
+
kubernetes_tools.set_cr_desired_state(
|
|
128
|
+
kube_client=kube_client,
|
|
129
|
+
cr_id=cr_id(service, instance, instance_type),
|
|
130
|
+
desired_state=desired_state,
|
|
131
|
+
)
|
|
132
|
+
except ApiException as e:
|
|
133
|
+
error_message = (
|
|
134
|
+
f"Error while setting state {desired_state} of "
|
|
135
|
+
f"{service}.{instance}: {e}"
|
|
136
|
+
)
|
|
137
|
+
raise RuntimeError(error_message)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
async def autoscaling_status(
|
|
141
|
+
kube_client: kubernetes_tools.KubeClient,
|
|
142
|
+
job_config: LongRunningServiceConfig,
|
|
143
|
+
namespace: str,
|
|
144
|
+
) -> KubernetesAutoscalingStatusDict:
|
|
145
|
+
hpa = await kubernetes_tools.get_hpa(
|
|
146
|
+
kube_client,
|
|
147
|
+
name=job_config.get_sanitised_deployment_name(),
|
|
148
|
+
namespace=namespace,
|
|
149
|
+
)
|
|
150
|
+
if hpa is None:
|
|
151
|
+
return KubernetesAutoscalingStatusDict(
|
|
152
|
+
min_instances=-1,
|
|
153
|
+
max_instances=-1,
|
|
154
|
+
metrics=[],
|
|
155
|
+
desired_replicas=-1,
|
|
156
|
+
last_scale_time="unknown (could not find HPA object)",
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# Parse metrics sources, based on
|
|
160
|
+
# https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V2beta2ExternalMetricSource.md#v2beta2externalmetricsource
|
|
161
|
+
parser = HPAMetricsParser(hpa)
|
|
162
|
+
|
|
163
|
+
# https://github.com/python/mypy/issues/7217
|
|
164
|
+
metrics_by_name: DefaultDict[str, HPAMetricsDict] = defaultdict(
|
|
165
|
+
lambda: HPAMetricsDict()
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
if hpa.spec.metrics is not None:
|
|
169
|
+
for metric_spec in hpa.spec.metrics:
|
|
170
|
+
parsed = parser.parse_target(metric_spec)
|
|
171
|
+
metrics_by_name[parsed["name"]].update(parsed)
|
|
172
|
+
|
|
173
|
+
if hpa.status.current_metrics is not None:
|
|
174
|
+
for metric_spec in hpa.status.current_metrics:
|
|
175
|
+
parsed = parser.parse_current(metric_spec)
|
|
176
|
+
if parsed is not None:
|
|
177
|
+
metrics_by_name[parsed["name"]].update(parsed)
|
|
178
|
+
|
|
179
|
+
metric_stats = list(metrics_by_name.values())
|
|
180
|
+
|
|
181
|
+
last_scale_time = (
|
|
182
|
+
hpa.status.last_scale_time.replace(tzinfo=pytz.UTC).isoformat()
|
|
183
|
+
if getattr(hpa.status, "last_scale_time")
|
|
184
|
+
else "N/A"
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
return KubernetesAutoscalingStatusDict(
|
|
188
|
+
min_instances=hpa.spec.min_replicas,
|
|
189
|
+
max_instances=hpa.spec.max_replicas,
|
|
190
|
+
metrics=metric_stats,
|
|
191
|
+
desired_replicas=hpa.status.desired_replicas,
|
|
192
|
+
last_scale_time=last_scale_time,
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
async def pod_info(
|
|
197
|
+
pod: V1Pod,
|
|
198
|
+
client: kubernetes_tools.KubeClient,
|
|
199
|
+
num_tail_lines: int,
|
|
200
|
+
) -> Dict[str, Any]:
|
|
201
|
+
container_statuses = pod.status.container_statuses or []
|
|
202
|
+
try:
|
|
203
|
+
pod_event_messages = await get_pod_event_messages(client, pod)
|
|
204
|
+
except asyncio.TimeoutError:
|
|
205
|
+
pod_event_messages = [{"error": "Could not fetch events for pod"}]
|
|
206
|
+
containers = [
|
|
207
|
+
dict(
|
|
208
|
+
name=container.name,
|
|
209
|
+
tail_lines=await get_tail_lines_for_kubernetes_container(
|
|
210
|
+
client,
|
|
211
|
+
pod,
|
|
212
|
+
container,
|
|
213
|
+
num_tail_lines,
|
|
214
|
+
),
|
|
215
|
+
)
|
|
216
|
+
for container in container_statuses
|
|
217
|
+
]
|
|
218
|
+
return {
|
|
219
|
+
"name": pod.metadata.name,
|
|
220
|
+
"host": kubernetes_tools.get_pod_hostname(client, pod),
|
|
221
|
+
"deployed_timestamp": pod.metadata.creation_timestamp.timestamp(),
|
|
222
|
+
"phase": pod.status.phase,
|
|
223
|
+
"ready": kubernetes_tools.is_pod_ready(pod),
|
|
224
|
+
"containers": containers,
|
|
225
|
+
"reason": pod.status.reason,
|
|
226
|
+
"message": pod.status.message,
|
|
227
|
+
"events": pod_event_messages,
|
|
228
|
+
"git_sha": pod.metadata.labels.get("paasta.yelp.com/git_sha"),
|
|
229
|
+
"config_sha": pod.metadata.labels.get("paasta.yelp.com/config_sha"),
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# TODO: Cleanup
|
|
234
|
+
# Only used in old kubernetes_status
|
|
235
|
+
async def job_status(
|
|
236
|
+
kstatus: MutableMapping[str, Any],
|
|
237
|
+
client: kubernetes_tools.KubeClient,
|
|
238
|
+
job_config: LongRunningServiceConfig,
|
|
239
|
+
pod_list: Sequence[V1Pod],
|
|
240
|
+
replicaset_list: Sequence[V1ReplicaSet],
|
|
241
|
+
verbose: int,
|
|
242
|
+
namespace: str,
|
|
243
|
+
) -> None:
|
|
244
|
+
app_id = job_config.get_sanitised_deployment_name()
|
|
245
|
+
kstatus["app_id"] = app_id
|
|
246
|
+
kstatus["pods"] = []
|
|
247
|
+
kstatus["replicasets"] = []
|
|
248
|
+
|
|
249
|
+
if verbose > 0:
|
|
250
|
+
num_tail_lines = calculate_tail_lines(verbose)
|
|
251
|
+
kstatus["pods"] = await asyncio.gather(
|
|
252
|
+
*[pod_info(pod, client, num_tail_lines) for pod in pod_list]
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
for replicaset in replicaset_list:
|
|
256
|
+
kstatus["replicasets"].append(
|
|
257
|
+
{
|
|
258
|
+
"name": replicaset.metadata.name,
|
|
259
|
+
"replicas": replicaset.spec.replicas,
|
|
260
|
+
"ready_replicas": ready_replicas_from_replicaset(replicaset),
|
|
261
|
+
"create_timestamp": replicaset.metadata.creation_timestamp.timestamp(),
|
|
262
|
+
"git_sha": replicaset.metadata.labels.get("paasta.yelp.com/git_sha"),
|
|
263
|
+
"config_sha": replicaset.metadata.labels.get(
|
|
264
|
+
"paasta.yelp.com/config_sha"
|
|
265
|
+
),
|
|
266
|
+
}
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
kstatus["expected_instance_count"] = job_config.get_instances()
|
|
270
|
+
|
|
271
|
+
app = kubernetes_tools.get_kubernetes_app_by_name(
|
|
272
|
+
name=app_id, kube_client=client, namespace=namespace
|
|
273
|
+
)
|
|
274
|
+
desired_instances = (
|
|
275
|
+
job_config.get_instances() if job_config.get_desired_state() != "stop" else 0
|
|
276
|
+
)
|
|
277
|
+
deploy_status, message = kubernetes_tools.get_kubernetes_app_deploy_status(
|
|
278
|
+
app=app,
|
|
279
|
+
desired_instances=desired_instances,
|
|
280
|
+
)
|
|
281
|
+
kstatus["deploy_status"] = kubernetes_tools.KubernetesDeployStatus.tostring(
|
|
282
|
+
deploy_status
|
|
283
|
+
)
|
|
284
|
+
kstatus["deploy_status_message"] = message
|
|
285
|
+
kstatus["running_instance_count"] = (
|
|
286
|
+
app.status.ready_replicas if app.status.ready_replicas else 0
|
|
287
|
+
)
|
|
288
|
+
kstatus["create_timestamp"] = app.metadata.creation_timestamp.timestamp()
|
|
289
|
+
kstatus["namespace"] = app.metadata.namespace
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
async def get_backends_from_mesh_status(
|
|
293
|
+
mesh_status_task: "asyncio.Future[Dict[str, Any]]",
|
|
294
|
+
) -> Set[str]:
|
|
295
|
+
status = await mesh_status_task
|
|
296
|
+
if status.get("locations"):
|
|
297
|
+
backends = {
|
|
298
|
+
be["address"]
|
|
299
|
+
for location in status["locations"]
|
|
300
|
+
for be in location.get("backends", [])
|
|
301
|
+
}
|
|
302
|
+
else:
|
|
303
|
+
backends = set()
|
|
304
|
+
|
|
305
|
+
return backends
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
async def mesh_status(
|
|
309
|
+
service: str,
|
|
310
|
+
service_mesh: ServiceMesh,
|
|
311
|
+
instance: str,
|
|
312
|
+
job_config: LongRunningServiceConfig,
|
|
313
|
+
service_namespace_config: ServiceNamespaceConfig,
|
|
314
|
+
pods_task: "asyncio.Future[V1Pod]",
|
|
315
|
+
settings: Any,
|
|
316
|
+
should_return_individual_backends: bool = False,
|
|
317
|
+
) -> Mapping[str, Any]:
|
|
318
|
+
registration = job_config.get_registrations()[0]
|
|
319
|
+
instance_pool = job_config.get_pool()
|
|
320
|
+
|
|
321
|
+
async_get_nodes = a_sync.to_async(kubernetes_tools.get_all_nodes)
|
|
322
|
+
nodes = await async_get_nodes(settings.kubernetes_client)
|
|
323
|
+
|
|
324
|
+
replication_checker = KubeSmartstackEnvoyReplicationChecker(
|
|
325
|
+
nodes=nodes,
|
|
326
|
+
system_paasta_config=settings.system_paasta_config,
|
|
327
|
+
)
|
|
328
|
+
node_hostname_by_location = replication_checker.get_allowed_locations_and_hosts(
|
|
329
|
+
job_config
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
expected_smartstack_count = get_expected_instance_count_for_namespace(
|
|
333
|
+
service=service,
|
|
334
|
+
namespace=job_config.get_nerve_namespace(),
|
|
335
|
+
cluster=settings.cluster,
|
|
336
|
+
instance_type_class=KubernetesDeploymentConfig,
|
|
337
|
+
)
|
|
338
|
+
expected_count_per_location = int(
|
|
339
|
+
expected_smartstack_count / len(node_hostname_by_location)
|
|
340
|
+
)
|
|
341
|
+
mesh_status: MutableMapping[str, Any] = {
|
|
342
|
+
"registration": registration,
|
|
343
|
+
"expected_backends_per_location": expected_count_per_location,
|
|
344
|
+
"locations": [],
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
pods = await pods_task
|
|
348
|
+
for location, hosts in node_hostname_by_location.items():
|
|
349
|
+
host = replication_checker.get_hostname_in_pool(hosts, instance_pool)
|
|
350
|
+
if service_mesh == ServiceMesh.SMARTSTACK:
|
|
351
|
+
mesh_status["locations"].append(
|
|
352
|
+
_build_smartstack_location_dict(
|
|
353
|
+
synapse_host=host,
|
|
354
|
+
synapse_port=settings.system_paasta_config.get_synapse_port(),
|
|
355
|
+
synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
|
|
356
|
+
registration=registration,
|
|
357
|
+
pods=pods,
|
|
358
|
+
location=location,
|
|
359
|
+
should_return_individual_backends=should_return_individual_backends,
|
|
360
|
+
)
|
|
361
|
+
)
|
|
362
|
+
elif service_mesh == ServiceMesh.ENVOY:
|
|
363
|
+
mesh_status["locations"].append(
|
|
364
|
+
_build_envoy_location_dict(
|
|
365
|
+
envoy_host=host,
|
|
366
|
+
envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
|
|
367
|
+
envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
|
|
368
|
+
registration=registration,
|
|
369
|
+
pods=pods,
|
|
370
|
+
location=location,
|
|
371
|
+
should_return_individual_backends=should_return_individual_backends,
|
|
372
|
+
)
|
|
373
|
+
)
|
|
374
|
+
return mesh_status
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _build_envoy_location_dict(
|
|
378
|
+
envoy_host: str,
|
|
379
|
+
envoy_admin_port: int,
|
|
380
|
+
envoy_admin_endpoint_format: str,
|
|
381
|
+
registration: str,
|
|
382
|
+
pods: Iterable[V1Pod],
|
|
383
|
+
location: str,
|
|
384
|
+
should_return_individual_backends: bool,
|
|
385
|
+
) -> MutableMapping[str, Any]:
|
|
386
|
+
backends = envoy_tools.get_backends(
|
|
387
|
+
registration,
|
|
388
|
+
envoy_host=envoy_host,
|
|
389
|
+
envoy_admin_port=envoy_admin_port,
|
|
390
|
+
envoy_admin_endpoint_format=envoy_admin_endpoint_format,
|
|
391
|
+
)
|
|
392
|
+
sorted_envoy_backends = sorted(
|
|
393
|
+
[
|
|
394
|
+
backend[0]
|
|
395
|
+
for _, service_backends in backends.items()
|
|
396
|
+
for backend in service_backends
|
|
397
|
+
],
|
|
398
|
+
key=lambda backend: backend["eds_health_status"],
|
|
399
|
+
)
|
|
400
|
+
casper_proxied_backends = {
|
|
401
|
+
(backend["address"], backend["port_value"])
|
|
402
|
+
for _, service_backends in backends.items()
|
|
403
|
+
for backend, is_casper_proxied_backend in service_backends
|
|
404
|
+
if is_casper_proxied_backend
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
matched_envoy_backends_and_pods = envoy_tools.match_backends_and_pods(
|
|
408
|
+
sorted_envoy_backends,
|
|
409
|
+
pods,
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
return envoy_tools.build_envoy_location_dict(
|
|
413
|
+
location,
|
|
414
|
+
matched_envoy_backends_and_pods,
|
|
415
|
+
should_return_individual_backends,
|
|
416
|
+
casper_proxied_backends,
|
|
417
|
+
)
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _build_smartstack_location_dict(
|
|
421
|
+
synapse_host: str,
|
|
422
|
+
synapse_port: int,
|
|
423
|
+
synapse_haproxy_url_format: str,
|
|
424
|
+
registration: str,
|
|
425
|
+
pods: Iterable[V1Pod],
|
|
426
|
+
location: str,
|
|
427
|
+
should_return_individual_backends: bool,
|
|
428
|
+
) -> MutableMapping[str, Any]:
|
|
429
|
+
sorted_backends = sorted(
|
|
430
|
+
smartstack_tools.get_backends(
|
|
431
|
+
registration,
|
|
432
|
+
synapse_host=synapse_host,
|
|
433
|
+
synapse_port=synapse_port,
|
|
434
|
+
synapse_haproxy_url_format=synapse_haproxy_url_format,
|
|
435
|
+
),
|
|
436
|
+
key=lambda backend: backend["status"],
|
|
437
|
+
reverse=True, # put 'UP' backends above 'MAINT' backends
|
|
438
|
+
)
|
|
439
|
+
|
|
440
|
+
matched_backends_and_pods = match_backends_and_pods(sorted_backends, pods)
|
|
441
|
+
location_dict = smartstack_tools.build_smartstack_location_dict(
|
|
442
|
+
location, matched_backends_and_pods, should_return_individual_backends
|
|
443
|
+
)
|
|
444
|
+
return location_dict
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def cr_status(
|
|
448
|
+
service: str,
|
|
449
|
+
instance: str,
|
|
450
|
+
verbose: int,
|
|
451
|
+
instance_type: str,
|
|
452
|
+
kube_client: Any,
|
|
453
|
+
) -> Mapping[str, Any]:
|
|
454
|
+
status: MutableMapping[str, Any] = {}
|
|
455
|
+
cr = (
|
|
456
|
+
kubernetes_tools.get_cr(
|
|
457
|
+
kube_client=kube_client, cr_id=cr_id(service, instance, instance_type)
|
|
458
|
+
)
|
|
459
|
+
or {}
|
|
460
|
+
)
|
|
461
|
+
crstatus = cr.get("status")
|
|
462
|
+
metadata = cr.get("metadata")
|
|
463
|
+
if crstatus is not None:
|
|
464
|
+
status["status"] = crstatus
|
|
465
|
+
if metadata is not None:
|
|
466
|
+
status["metadata"] = metadata
|
|
467
|
+
return status
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def filter_actually_running_replicasets(
|
|
471
|
+
replicaset_list: Sequence[V1ReplicaSet],
|
|
472
|
+
) -> List[V1ReplicaSet]:
|
|
473
|
+
return [
|
|
474
|
+
rs
|
|
475
|
+
for rs in replicaset_list
|
|
476
|
+
if not (rs.spec.replicas == 0 and ready_replicas_from_replicaset(rs) == 0)
|
|
477
|
+
]
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def bounce_status(
|
|
481
|
+
service: str, instance: str, settings: Any, is_eks: bool = False
|
|
482
|
+
) -> Dict[str, Any]:
|
|
483
|
+
status: Dict[str, Any] = {}
|
|
484
|
+
# this should be the only place where it matters that we use eks_tools.
|
|
485
|
+
# apart from loading config files, we should be using kubernetes_tools
|
|
486
|
+
# everywhere.
|
|
487
|
+
job_config: Union[KubernetesDeploymentConfig, eks_tools.EksDeploymentConfig]
|
|
488
|
+
if is_eks:
|
|
489
|
+
job_config = eks_tools.load_eks_service_config(
|
|
490
|
+
service=service,
|
|
491
|
+
instance=instance,
|
|
492
|
+
cluster=settings.cluster,
|
|
493
|
+
soa_dir=settings.soa_dir,
|
|
494
|
+
load_deployments=True,
|
|
495
|
+
)
|
|
496
|
+
else:
|
|
497
|
+
job_config = kubernetes_tools.load_kubernetes_service_config(
|
|
498
|
+
service=service,
|
|
499
|
+
instance=instance,
|
|
500
|
+
cluster=settings.cluster,
|
|
501
|
+
soa_dir=settings.soa_dir,
|
|
502
|
+
load_deployments=True,
|
|
503
|
+
)
|
|
504
|
+
expected_instance_count = job_config.get_instances()
|
|
505
|
+
status["expected_instance_count"] = expected_instance_count
|
|
506
|
+
desired_state = job_config.get_desired_state()
|
|
507
|
+
status["desired_state"] = desired_state
|
|
508
|
+
|
|
509
|
+
kube_client = settings.kubernetes_client
|
|
510
|
+
if kube_client is None:
|
|
511
|
+
raise RuntimeError("Could not load Kubernetes client!")
|
|
512
|
+
|
|
513
|
+
app = kubernetes_tools.get_kubernetes_app_by_name(
|
|
514
|
+
name=job_config.get_sanitised_deployment_name(),
|
|
515
|
+
kube_client=kube_client,
|
|
516
|
+
namespace=job_config.get_kubernetes_namespace(),
|
|
517
|
+
)
|
|
518
|
+
status["running_instance_count"] = (
|
|
519
|
+
app.status.ready_replicas if app.status.ready_replicas else 0
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
deploy_status, message = kubernetes_tools.get_kubernetes_app_deploy_status(
|
|
523
|
+
app=app,
|
|
524
|
+
desired_instances=(expected_instance_count if desired_state != "stop" else 0),
|
|
525
|
+
)
|
|
526
|
+
status["deploy_status"] = kubernetes_tools.KubernetesDeployStatus.tostring(
|
|
527
|
+
deploy_status
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
if job_config.get_persistent_volumes():
|
|
531
|
+
version_objects = a_sync.block(
|
|
532
|
+
kubernetes_tools.controller_revisions_for_service_instance,
|
|
533
|
+
service=job_config.service,
|
|
534
|
+
instance=job_config.instance,
|
|
535
|
+
kube_client=kube_client,
|
|
536
|
+
namespace=job_config.get_kubernetes_namespace(),
|
|
537
|
+
)
|
|
538
|
+
else:
|
|
539
|
+
replicasets = a_sync.block(
|
|
540
|
+
kubernetes_tools.replicasets_for_service_instance,
|
|
541
|
+
service=job_config.service,
|
|
542
|
+
instance=job_config.instance,
|
|
543
|
+
kube_client=kube_client,
|
|
544
|
+
namespace=job_config.get_kubernetes_namespace(),
|
|
545
|
+
)
|
|
546
|
+
version_objects = filter_actually_running_replicasets(replicasets)
|
|
547
|
+
|
|
548
|
+
active_versions = kubernetes_tools.get_active_versions_for_service(
|
|
549
|
+
[app, *version_objects],
|
|
550
|
+
)
|
|
551
|
+
status["active_shas"] = [
|
|
552
|
+
(deployment_version.sha, config_sha)
|
|
553
|
+
for deployment_version, config_sha in active_versions
|
|
554
|
+
]
|
|
555
|
+
status["active_versions"] = [
|
|
556
|
+
(deployment_version.sha, deployment_version.image_version, config_sha)
|
|
557
|
+
for deployment_version, config_sha in active_versions
|
|
558
|
+
]
|
|
559
|
+
status["app_count"] = len(active_versions)
|
|
560
|
+
return status
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
async def get_pods_for_service_instance_multiple_namespaces(
|
|
564
|
+
service: str,
|
|
565
|
+
instance: str,
|
|
566
|
+
kube_client: kubernetes_tools.KubeClient,
|
|
567
|
+
namespaces: Iterable[str],
|
|
568
|
+
) -> Sequence[V1Pod]:
|
|
569
|
+
ret: List[V1Pod] = []
|
|
570
|
+
|
|
571
|
+
for coro in asyncio.as_completed(
|
|
572
|
+
[
|
|
573
|
+
kubernetes_tools.pods_for_service_instance(
|
|
574
|
+
service=service,
|
|
575
|
+
instance=instance,
|
|
576
|
+
kube_client=kube_client,
|
|
577
|
+
namespace=namespace,
|
|
578
|
+
)
|
|
579
|
+
for namespace in namespaces
|
|
580
|
+
]
|
|
581
|
+
):
|
|
582
|
+
ret.extend(await coro)
|
|
583
|
+
|
|
584
|
+
return ret
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def find_all_relevant_namespaces(
|
|
588
|
+
service: str,
|
|
589
|
+
instance: str,
|
|
590
|
+
kube_client: kubernetes_tools.KubeClient,
|
|
591
|
+
job_config: LongRunningServiceConfig,
|
|
592
|
+
) -> Set[str]:
|
|
593
|
+
return {job_config.get_kubernetes_namespace()} | {
|
|
594
|
+
deployment.namespace
|
|
595
|
+
for deployment in kubernetes_tools.list_deployments_in_managed_namespaces(
|
|
596
|
+
kube_client=kube_client,
|
|
597
|
+
label_selector=f"{paasta_prefixed('service')}={service},{paasta_prefixed('instance')}={instance}",
|
|
598
|
+
)
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
@a_sync.to_blocking
|
|
603
|
+
async def kubernetes_status_v2(
|
|
604
|
+
service: str,
|
|
605
|
+
instance: str,
|
|
606
|
+
verbose: int,
|
|
607
|
+
include_envoy: bool,
|
|
608
|
+
instance_type: str,
|
|
609
|
+
settings: Any,
|
|
610
|
+
all_namespaces: bool = False,
|
|
611
|
+
) -> Dict[str, Any]:
|
|
612
|
+
status: Dict[str, Any] = {}
|
|
613
|
+
config_loader = LONG_RUNNING_INSTANCE_TYPE_HANDLERS[instance_type].loader
|
|
614
|
+
job_config = config_loader(
|
|
615
|
+
service=service,
|
|
616
|
+
instance=instance,
|
|
617
|
+
cluster=settings.cluster,
|
|
618
|
+
soa_dir=settings.soa_dir,
|
|
619
|
+
load_deployments=True,
|
|
620
|
+
)
|
|
621
|
+
kube_client = settings.kubernetes_client
|
|
622
|
+
if kube_client is None:
|
|
623
|
+
return status
|
|
624
|
+
|
|
625
|
+
if all_namespaces:
|
|
626
|
+
relevant_namespaces = await a_sync.to_async(find_all_relevant_namespaces)(
|
|
627
|
+
service, instance, kube_client, job_config
|
|
628
|
+
)
|
|
629
|
+
else:
|
|
630
|
+
relevant_namespaces = {job_config.get_kubernetes_namespace()}
|
|
631
|
+
|
|
632
|
+
tasks: List["asyncio.Future[Dict[str, Any]]"] = []
|
|
633
|
+
|
|
634
|
+
if (
|
|
635
|
+
verbose > 1
|
|
636
|
+
and job_config.is_autoscaling_enabled()
|
|
637
|
+
and job_config.get_autoscaling_params().get("decision_policy", "") != "bespoke" # type: ignore
|
|
638
|
+
):
|
|
639
|
+
autoscaling_task = asyncio.create_task(
|
|
640
|
+
autoscaling_status(
|
|
641
|
+
kube_client, job_config, job_config.get_kubernetes_namespace()
|
|
642
|
+
)
|
|
643
|
+
)
|
|
644
|
+
tasks.append(autoscaling_task)
|
|
645
|
+
else:
|
|
646
|
+
autoscaling_task = None
|
|
647
|
+
|
|
648
|
+
pods_task = asyncio.create_task(
|
|
649
|
+
get_pods_for_service_instance_multiple_namespaces(
|
|
650
|
+
service=service,
|
|
651
|
+
instance=instance,
|
|
652
|
+
kube_client=kube_client,
|
|
653
|
+
namespaces=relevant_namespaces,
|
|
654
|
+
)
|
|
655
|
+
)
|
|
656
|
+
tasks.append(pods_task)
|
|
657
|
+
|
|
658
|
+
service_namespace_config = kubernetes_tools.load_service_namespace_config(
|
|
659
|
+
service=service,
|
|
660
|
+
namespace=job_config.get_nerve_namespace(),
|
|
661
|
+
soa_dir=settings.soa_dir,
|
|
662
|
+
)
|
|
663
|
+
if "proxy_port" in service_namespace_config:
|
|
664
|
+
mesh_status_task = asyncio.create_task(
|
|
665
|
+
mesh_status(
|
|
666
|
+
service=service,
|
|
667
|
+
service_mesh=ServiceMesh.ENVOY,
|
|
668
|
+
instance=job_config.get_nerve_namespace(),
|
|
669
|
+
job_config=job_config,
|
|
670
|
+
service_namespace_config=service_namespace_config,
|
|
671
|
+
pods_task=pods_task,
|
|
672
|
+
should_return_individual_backends=True,
|
|
673
|
+
settings=settings,
|
|
674
|
+
)
|
|
675
|
+
)
|
|
676
|
+
backends_task = asyncio.create_task(
|
|
677
|
+
get_backends_from_mesh_status(mesh_status_task)
|
|
678
|
+
)
|
|
679
|
+
tasks.extend([mesh_status_task, backends_task])
|
|
680
|
+
else:
|
|
681
|
+
mesh_status_task = None
|
|
682
|
+
backends_task = None
|
|
683
|
+
|
|
684
|
+
if job_config.get_persistent_volumes():
|
|
685
|
+
pod_status_by_sha_and_readiness_task = asyncio.create_task(
|
|
686
|
+
get_pod_status_tasks_by_sha_and_readiness(
|
|
687
|
+
pods_task,
|
|
688
|
+
backends_task,
|
|
689
|
+
kube_client,
|
|
690
|
+
verbose,
|
|
691
|
+
)
|
|
692
|
+
)
|
|
693
|
+
versions_task = asyncio.create_task(
|
|
694
|
+
get_versions_for_controller_revisions(
|
|
695
|
+
kube_client=kube_client,
|
|
696
|
+
service=service,
|
|
697
|
+
instance=instance,
|
|
698
|
+
namespaces=relevant_namespaces,
|
|
699
|
+
pod_status_by_sha_and_readiness_task=pod_status_by_sha_and_readiness_task,
|
|
700
|
+
)
|
|
701
|
+
)
|
|
702
|
+
tasks.extend([pod_status_by_sha_and_readiness_task, versions_task])
|
|
703
|
+
else:
|
|
704
|
+
pod_status_by_replicaset_task = asyncio.create_task(
|
|
705
|
+
get_pod_status_tasks_by_replicaset(
|
|
706
|
+
pods_task,
|
|
707
|
+
backends_task,
|
|
708
|
+
kube_client,
|
|
709
|
+
verbose,
|
|
710
|
+
)
|
|
711
|
+
)
|
|
712
|
+
versions_task = asyncio.create_task(
|
|
713
|
+
get_versions_for_replicasets(
|
|
714
|
+
kube_client=kube_client,
|
|
715
|
+
service=service,
|
|
716
|
+
instance=instance,
|
|
717
|
+
namespaces=relevant_namespaces,
|
|
718
|
+
pod_status_by_replicaset_task=pod_status_by_replicaset_task,
|
|
719
|
+
)
|
|
720
|
+
)
|
|
721
|
+
tasks.extend([pod_status_by_replicaset_task, versions_task])
|
|
722
|
+
|
|
723
|
+
await asyncio.gather(*tasks, return_exceptions=True)
|
|
724
|
+
|
|
725
|
+
desired_state = job_config.get_desired_state()
|
|
726
|
+
status["app_name"] = job_config.get_sanitised_deployment_name()
|
|
727
|
+
status["desired_state"] = desired_state
|
|
728
|
+
status["desired_instances"] = (
|
|
729
|
+
job_config.get_instances() if desired_state != "stop" else 0
|
|
730
|
+
)
|
|
731
|
+
status["bounce_method"] = job_config.get_bounce_method()
|
|
732
|
+
|
|
733
|
+
try:
|
|
734
|
+
pods_task.result() # just verifies we have a valid result
|
|
735
|
+
# These tasks also depend on pods_task, so we cannot populate them without pods
|
|
736
|
+
status["versions"] = versions_task.result()
|
|
737
|
+
if mesh_status_task is not None:
|
|
738
|
+
status["envoy"] = mesh_status_task.result()
|
|
739
|
+
except asyncio.TimeoutError:
|
|
740
|
+
status["versions"] = []
|
|
741
|
+
status["error_message"] = (
|
|
742
|
+
"Could not fetch instance data. "
|
|
743
|
+
"This is usually a temporary problem. Please try again or contact #compute-infra for help if you continue to see this message\n"
|
|
744
|
+
)
|
|
745
|
+
|
|
746
|
+
if autoscaling_task is not None:
|
|
747
|
+
try:
|
|
748
|
+
status["autoscaling_status"] = autoscaling_task.result()
|
|
749
|
+
except Exception as e:
|
|
750
|
+
if "error_message" not in status:
|
|
751
|
+
status["error_message"] = (
|
|
752
|
+
f"Unknown error occurred while fetching autoscaling status. "
|
|
753
|
+
f"Please contact #compute-infra for help: {e}"
|
|
754
|
+
)
|
|
755
|
+
else:
|
|
756
|
+
status[
|
|
757
|
+
"error_message"
|
|
758
|
+
] += f"Unknown error occurred while fetching autoscaling status: {e}"
|
|
759
|
+
return status
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
async def get_pod_status_tasks_by_replicaset(
|
|
763
|
+
pods_task: "asyncio.Future[V1Pod]",
|
|
764
|
+
backends_task: "asyncio.Future[Dict[str, Any]]",
|
|
765
|
+
client: kubernetes_tools.KubeClient,
|
|
766
|
+
verbose: int,
|
|
767
|
+
) -> Dict[str, List["asyncio.Future[Dict[str, Any]]"]]:
|
|
768
|
+
num_tail_lines = calculate_tail_lines(verbose)
|
|
769
|
+
pods = await pods_task
|
|
770
|
+
tasks_by_replicaset: DefaultDict[
|
|
771
|
+
str, List["asyncio.Future[Dict[str, Any]]"]
|
|
772
|
+
] = defaultdict(list)
|
|
773
|
+
for pod in pods:
|
|
774
|
+
for owner_reference in pod.metadata.owner_references:
|
|
775
|
+
if owner_reference.kind == "ReplicaSet":
|
|
776
|
+
pod_status_task = asyncio.create_task(
|
|
777
|
+
get_pod_status(pod, backends_task, client, num_tail_lines)
|
|
778
|
+
)
|
|
779
|
+
tasks_by_replicaset[owner_reference.name].append(pod_status_task)
|
|
780
|
+
|
|
781
|
+
return tasks_by_replicaset
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
async def get_versions_for_replicasets(
|
|
785
|
+
kube_client: kubernetes_tools.KubeClient,
|
|
786
|
+
service: str,
|
|
787
|
+
instance: str,
|
|
788
|
+
namespaces: Iterable[str],
|
|
789
|
+
pod_status_by_replicaset_task: "asyncio.Future[Mapping[str, Sequence[asyncio.Future[Dict[str, Any]]]]]",
|
|
790
|
+
) -> List[KubernetesVersionDict]:
|
|
791
|
+
|
|
792
|
+
replicaset_list: List[V1ReplicaSet] = []
|
|
793
|
+
for coro in asyncio.as_completed(
|
|
794
|
+
[
|
|
795
|
+
kubernetes_tools.replicasets_for_service_instance(
|
|
796
|
+
service=service,
|
|
797
|
+
instance=instance,
|
|
798
|
+
kube_client=kube_client,
|
|
799
|
+
namespace=namespace,
|
|
800
|
+
)
|
|
801
|
+
for namespace in namespaces
|
|
802
|
+
]
|
|
803
|
+
):
|
|
804
|
+
replicaset_list.extend(await coro)
|
|
805
|
+
|
|
806
|
+
# For the purpose of active_versions/app_count, don't count replicasets that
|
|
807
|
+
# are at 0/0.
|
|
808
|
+
actually_running_replicasets = filter_actually_running_replicasets(replicaset_list)
|
|
809
|
+
|
|
810
|
+
pod_status_by_replicaset = await pod_status_by_replicaset_task
|
|
811
|
+
versions = await asyncio.gather(
|
|
812
|
+
*[
|
|
813
|
+
get_replicaset_status(
|
|
814
|
+
replicaset,
|
|
815
|
+
kube_client,
|
|
816
|
+
pod_status_by_replicaset.get(replicaset.metadata.name),
|
|
817
|
+
)
|
|
818
|
+
for replicaset in actually_running_replicasets
|
|
819
|
+
]
|
|
820
|
+
)
|
|
821
|
+
return versions
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
async def get_replicaset_status(
|
|
825
|
+
replicaset: V1ReplicaSet,
|
|
826
|
+
client: kubernetes_tools.KubeClient,
|
|
827
|
+
pod_status_tasks: Sequence["asyncio.Future[Dict[str, Any]]"],
|
|
828
|
+
) -> KubernetesVersionDict:
|
|
829
|
+
return {
|
|
830
|
+
"name": replicaset.metadata.name,
|
|
831
|
+
"type": "ReplicaSet",
|
|
832
|
+
"replicas": replicaset.spec.replicas,
|
|
833
|
+
"ready_replicas": ready_replicas_from_replicaset(replicaset),
|
|
834
|
+
"create_timestamp": replicaset.metadata.creation_timestamp.timestamp(),
|
|
835
|
+
"git_sha": replicaset.metadata.labels.get("paasta.yelp.com/git_sha"),
|
|
836
|
+
"image_version": replicaset.metadata.labels.get(
|
|
837
|
+
"paasta.yelp.com/image_version", None
|
|
838
|
+
),
|
|
839
|
+
"config_sha": replicaset.metadata.labels.get("paasta.yelp.com/config_sha"),
|
|
840
|
+
"pods": await asyncio.gather(*pod_status_tasks) if pod_status_tasks else [],
|
|
841
|
+
"namespace": replicaset.metadata.namespace,
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
async def get_pod_status(
|
|
846
|
+
pod: V1Pod,
|
|
847
|
+
backends_task: "asyncio.Future[Dict[str, Any]]",
|
|
848
|
+
client: Any,
|
|
849
|
+
num_tail_lines: int,
|
|
850
|
+
) -> Dict[str, Any]:
|
|
851
|
+
events_task = asyncio.create_task(
|
|
852
|
+
get_pod_event_messages(client, pod, max_age_in_seconds=900)
|
|
853
|
+
)
|
|
854
|
+
containers_task = asyncio.create_task(
|
|
855
|
+
get_pod_containers(pod, client, num_tail_lines)
|
|
856
|
+
)
|
|
857
|
+
|
|
858
|
+
await asyncio.gather(events_task, containers_task, return_exceptions=True)
|
|
859
|
+
|
|
860
|
+
reason = pod.status.reason
|
|
861
|
+
message = pod.status.message
|
|
862
|
+
scheduled = kubernetes_tools.is_pod_scheduled(pod)
|
|
863
|
+
ready = kubernetes_tools.is_pod_ready(pod)
|
|
864
|
+
delete_timestamp = (
|
|
865
|
+
pod.metadata.deletion_timestamp.timestamp()
|
|
866
|
+
if pod.metadata.deletion_timestamp
|
|
867
|
+
else None
|
|
868
|
+
)
|
|
869
|
+
|
|
870
|
+
try:
|
|
871
|
+
# Filter events to only last 15m
|
|
872
|
+
pod_event_messages = events_task.result()
|
|
873
|
+
except asyncio.TimeoutError:
|
|
874
|
+
pod_event_messages = [{"error": "Could not retrieve events. Please try again."}]
|
|
875
|
+
|
|
876
|
+
if not scheduled and reason != "Evicted":
|
|
877
|
+
sched_condition = kubernetes_tools.get_pod_condition(pod, "PodScheduled")
|
|
878
|
+
# If the condition is not yet available (e.g. pod not fully created yet), defer to Status messages
|
|
879
|
+
if sched_condition:
|
|
880
|
+
reason = sched_condition.reason
|
|
881
|
+
message = sched_condition.message
|
|
882
|
+
|
|
883
|
+
mesh_ready = None
|
|
884
|
+
if backends_task is not None:
|
|
885
|
+
# TODO: Remove this once k8s readiness reflects mesh readiness, PAASTA-17266
|
|
886
|
+
mesh_ready = pod.status.pod_ip in (await backends_task)
|
|
887
|
+
|
|
888
|
+
return {
|
|
889
|
+
"name": pod.metadata.name,
|
|
890
|
+
"ip": pod.status.pod_ip,
|
|
891
|
+
"host": pod.status.host_ip,
|
|
892
|
+
"phase": pod.status.phase,
|
|
893
|
+
"reason": reason,
|
|
894
|
+
"message": message,
|
|
895
|
+
"scheduled": scheduled,
|
|
896
|
+
"ready": ready,
|
|
897
|
+
"mesh_ready": mesh_ready,
|
|
898
|
+
"containers": containers_task.result(),
|
|
899
|
+
"create_timestamp": pod.metadata.creation_timestamp.timestamp(),
|
|
900
|
+
"delete_timestamp": delete_timestamp,
|
|
901
|
+
"events": pod_event_messages,
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
def get_container_healthcheck(pod_ip: str, probe: V1Probe) -> Dict[str, Any]:
|
|
906
|
+
if getattr(probe, "http_get", None):
|
|
907
|
+
return {
|
|
908
|
+
"http_url": f"http://{pod_ip}:{probe.http_get.port}{probe.http_get.path}"
|
|
909
|
+
}
|
|
910
|
+
if getattr(probe, "tcp_socket", None):
|
|
911
|
+
return {"tcp_port": f"{probe.tcp_socket.port}"}
|
|
912
|
+
if getattr(probe, "_exec", None):
|
|
913
|
+
return {"cmd": f"{' '.join(probe._exec.command)}"}
|
|
914
|
+
return {}
|
|
915
|
+
|
|
916
|
+
|
|
async def get_pod_containers(
    pod: V1Pod, client: Any, num_tail_lines: int
) -> List[Dict[str, Any]]:
    containers = []
    statuses = pod.status.container_statuses or []
    container_specs = pod.spec.containers
    for cs in statuses:
        specs: List[V1Container] = [c for c in container_specs if c.name == cs.name]
        healthcheck_grace_period = 0
        healthcheck = None
        if specs:
            # There should be only one matching spec
            spec = specs[0]
            if spec.liveness_probe:
                healthcheck_grace_period = (
                    spec.liveness_probe.initial_delay_seconds or 0
                )
                healthcheck = get_container_healthcheck(
                    pod.status.pod_ip, spec.liveness_probe
                )

        state_dict = cs.state.to_dict()
        state = None
        reason = None
        message = None
        start_timestamp = None
        for state_name, this_state in state_dict.items():
            # Each container has only one populated state at a time
            if this_state:
                state = state_name
                if "reason" in this_state:
                    reason = this_state["reason"]
                if "message" in this_state:
                    message = this_state["message"]
                if this_state.get("started_at"):
                    start_timestamp = this_state["started_at"].timestamp()

        last_state_dict = cs.last_state.to_dict()
        last_state = None
        last_reason = None
        last_message = None
        last_duration = None
        last_timestamp = None
        for state_name, this_state in last_state_dict.items():
            if this_state:
                last_state = state_name
                if "reason" in this_state:
                    last_reason = this_state["reason"]
                if "message" in this_state:
                    last_message = this_state["message"]
                if this_state.get("started_at"):
                    if this_state.get("finished_at"):
                        last_duration = (
                            this_state["finished_at"] - this_state["started_at"]
                        ).total_seconds()

                    last_timestamp = this_state["started_at"].timestamp()

        async def get_tail_lines() -> MutableMapping[str, Any]:
            try:
                return await get_tail_lines_for_kubernetes_container(
                    client,
                    pod,
                    cs,
                    num_tail_lines,
                    previous=False,
                )
            except asyncio.TimeoutError:
                return {"error_message": f"Could not fetch logs for {cs.name}"}

        # Get previous log lines as well if this container restarted recently
        async def get_previous_tail_lines() -> MutableMapping[str, Any]:
            if state == "running" and kubernetes_tools.recent_container_restart(
                cs.restart_count, last_state, last_timestamp
            ):
                try:
                    return await get_tail_lines_for_kubernetes_container(
                        client,
                        pod,
                        cs,
                        num_tail_lines,
                        previous=True,
                    )
                except asyncio.TimeoutError:
                    return {
                        "error_message": f"Could not fetch previous logs for {cs.name}"
                    }
            return None

        tail_lines, previous_tail_lines = await asyncio.gather(
            asyncio.ensure_future(get_tail_lines()),
            asyncio.ensure_future(get_previous_tail_lines()),
        )

        containers.append(
            {
                "name": cs.name,
                "restart_count": cs.restart_count,
                "state": state,
                "reason": reason,
                "message": message,
                "last_state": last_state,
                "last_reason": last_reason,
                "last_message": last_message,
                "last_duration": last_duration,
                "last_timestamp": last_timestamp,
                "previous_tail_lines": previous_tail_lines,
                "timestamp": start_timestamp,
                "healthcheck_grace_period": healthcheck_grace_period,
                "healthcheck_cmd": healthcheck,
                "tail_lines": tail_lines,
            }
        )
    return containers

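# get_pod_containers runs the current and previous log fetches through
# asyncio.gather so they proceed concurrently rather than back to back. A
# minimal, self-contained sketch of that idiom (the coroutines are
# hypothetical, not part of paasta_tools):
async def _example_concurrent_fetch() -> None:
    async def fetch_current() -> str:
        return "current tail"

    async def fetch_previous() -> str:
        return "previous tail"

    # Both awaitables make progress at the same time; results keep input order.
    current, previous = await asyncio.gather(fetch_current(), fetch_previous())

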
async def get_pod_status_tasks_by_sha_and_readiness(
    pods_task: "asyncio.Future[List[V1Pod]]",
    backends_task: "asyncio.Future[Dict[str, Any]]",
    client: kubernetes_tools.KubeClient,
    verbose: int,
) -> DefaultDict[
    Tuple[str, str], DefaultDict[bool, List["asyncio.Future[Dict[str, Any]]"]]
]:
    num_tail_lines = calculate_tail_lines(verbose)
    tasks_by_sha_and_readiness: DefaultDict[
        Tuple[str, str], DefaultDict[bool, List["asyncio.Future[Dict[str, Any]]"]]
    ] = defaultdict(lambda: defaultdict(list))
    for pod in await pods_task:
        git_sha = pod.metadata.labels["paasta.yelp.com/git_sha"]
        config_sha = pod.metadata.labels["paasta.yelp.com/config_sha"]
        is_ready = kubernetes_tools.is_pod_ready(pod)
        pod_status_task = asyncio.create_task(
            get_pod_status(pod, backends_task, client, num_tail_lines)
        )
        tasks_by_sha_and_readiness[(git_sha, config_sha)][is_ready].append(
            pod_status_task
        )

    return tasks_by_sha_and_readiness

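# The nested defaultdict above expresses "group by (git_sha, config_sha), then
# by readiness" without any key-existence checks. A minimal sketch of the same
# data-structure pattern with plain string values (illustrative only):
def _example_grouping() -> None:
    grouped: DefaultDict[Tuple[str, str], DefaultDict[bool, List[str]]] = defaultdict(
        lambda: defaultdict(list)
    )
    # The intermediate dict and list are created on first access.
    grouped[("abc123", "config00")][True].append("pod-1")
    grouped[("abc123", "config00")][False].append("pod-2")

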
async def get_versions_for_controller_revisions(
    kube_client: kubernetes_tools.KubeClient,
    service: str,
    instance: str,
    namespaces: Iterable[str],
    pod_status_by_sha_and_readiness_task: "asyncio.Future[Mapping[Tuple[str, str], Mapping[bool, Sequence[asyncio.Future[Mapping[str, Any]]]]]]",
) -> List[KubernetesVersionDict]:
    controller_revision_list: List[V1ControllerRevision] = []

    for coro in asyncio.as_completed(
        [
            kubernetes_tools.controller_revisions_for_service_instance(
                service=service,
                instance=instance,
                kube_client=kube_client,
                namespace=namespace,
            )
            for namespace in namespaces
        ]
    ):
        controller_revision_list.extend(await coro)

    cr_by_shas: Dict[Tuple[str, str], V1ControllerRevision] = {}
    for cr in controller_revision_list:
        git_sha = cr.metadata.labels["paasta.yelp.com/git_sha"]
        config_sha = cr.metadata.labels["paasta.yelp.com/config_sha"]
        cr_by_shas[(git_sha, config_sha)] = cr

    pod_status_by_sha_and_readiness = await pod_status_by_sha_and_readiness_task
    versions = await asyncio.gather(
        *[
            get_version_for_controller_revision(
                cr,
                kube_client,
                pod_status_by_sha_and_readiness[(git_sha, config_sha)],
            )
            for (git_sha, config_sha), cr in cr_by_shas.items()
        ]
    )

    return versions

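# asyncio.as_completed above fans one controller-revision query out per
# namespace and consumes each result as soon as it finishes, instead of
# waiting on the slowest namespace before aggregating. A minimal sketch of the
# idiom (the query coroutine is hypothetical):
async def _example_as_completed() -> List[int]:
    async def query(namespace_id: int) -> List[int]:
        await asyncio.sleep(0)  # stand-in for an API call
        return [namespace_id]

    results: List[int] = []
    for coro in asyncio.as_completed([query(i) for i in range(3)]):
        results.extend(await coro)  # earliest-finished first
    return results

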
async def get_version_for_controller_revision(
    cr: V1ControllerRevision,
    client: Any,
    pod_status_tasks_by_readiness: Mapping[
        bool, Sequence["asyncio.Future[Mapping[str, Any]]"]
    ],
) -> KubernetesVersionDict:
    all_pod_status_tasks = [
        task for tasks in pod_status_tasks_by_readiness.values() for task in tasks
    ]
    await asyncio.gather(*all_pod_status_tasks)
    return {
        "name": cr.metadata.name,
        "type": "ControllerRevision",
        "replicas": len(all_pod_status_tasks),
        "ready_replicas": len(pod_status_tasks_by_readiness[True]),
        "create_timestamp": cr.metadata.creation_timestamp.timestamp(),
        "git_sha": cr.metadata.labels.get("paasta.yelp.com/git_sha"),
        "image_version": cr.metadata.labels.get("paasta.yelp.com/image_version", None),
        "config_sha": cr.metadata.labels.get("paasta.yelp.com/config_sha"),
        "pods": [task.result() for task in all_pod_status_tasks],
        "namespace": cr.metadata.namespace,
    }

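# Calling task.result() in the dict above neither blocks nor raises
# asyncio.InvalidStateError, because asyncio.gather has already awaited every
# task. A minimal sketch of that ordering guarantee (illustrative only):
async def _example_result_after_gather() -> List[int]:
    tasks = [asyncio.ensure_future(asyncio.sleep(0, result=i)) for i in range(3)]
    await asyncio.gather(*tasks)
    return [t.result() for t in tasks]  # every task is done at this point

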
# TODO: Cleanup old kubernetes status
@a_sync.to_blocking
async def kubernetes_status(
    service: str,
    instance: str,
    verbose: int,
    include_envoy: bool,
    instance_type: str,
    settings: Any,
) -> Mapping[str, Any]:
    kstatus: Dict[str, Any] = {}
    config_loader = LONG_RUNNING_INSTANCE_TYPE_HANDLERS[instance_type].loader
    job_config = config_loader(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
        load_deployments=True,
    )
    kube_client = settings.kubernetes_client
    if kube_client is None:
        return kstatus

    app = kubernetes_tools.get_kubernetes_app_by_name(
        name=job_config.get_sanitised_deployment_name(),
        kube_client=kube_client,
        namespace=job_config.get_kubernetes_namespace(),
    )
    # bouncing status can be inferred from app_count, ref get_bouncing_status

    # this task is necessary for mesh_status, but most other use cases want
    # just the list of pods
    pods_task = asyncio.create_task(
        kubernetes_tools.pods_for_service_instance(
            service=job_config.service,
            instance=job_config.instance,
            kube_client=kube_client,
            namespace=job_config.get_kubernetes_namespace(),
        )
    )
    pod_list = await pods_task
    replicaset_list = await kubernetes_tools.replicasets_for_service_instance(
        service=job_config.service,
        instance=job_config.instance,
        kube_client=kube_client,
        namespace=job_config.get_kubernetes_namespace(),
    )
    # For the purpose of active_versions/app_count, don't count replicasets that are at 0/0.
    actually_running_replicasets = filter_actually_running_replicasets(replicaset_list)
    active_versions = kubernetes_tools.get_active_versions_for_service(
        [app, *pod_list, *actually_running_replicasets]
    )
    kstatus["app_count"] = len(active_versions)
    kstatus["desired_state"] = job_config.get_desired_state()
    kstatus["bounce_method"] = job_config.get_bounce_method()
    kstatus["active_shas"] = [
        (deployment_version.sha, config_sha)
        for deployment_version, config_sha in active_versions
    ]
    kstatus["active_versions"] = [
        (deployment_version.sha, deployment_version.image_version, config_sha)
        for deployment_version, config_sha in active_versions
    ]

    await job_status(
        kstatus=kstatus,
        client=kube_client,
        namespace=job_config.get_kubernetes_namespace(),
        job_config=job_config,
        verbose=verbose,
        pod_list=pod_list,
        replicaset_list=replicaset_list,
    )

    if (
        job_config.is_autoscaling_enabled() is True
        and job_config.get_autoscaling_params().get("decision_policy", "") != "bespoke"  # type: ignore
    ):
        try:
            kstatus["autoscaling_status"] = await autoscaling_status(
                kube_client, job_config, job_config.get_kubernetes_namespace()
            )
        except Exception as e:
            kstatus[
                "error_message"
            ] = f"Unknown error occurred while fetching autoscaling status. Please contact #compute-infra for help: {e}"

    evicted_count = 0
    for pod in pod_list:
        if pod.status.reason == "Evicted":
            evicted_count += 1
    kstatus["evicted_count"] = evicted_count

    if include_envoy:
        service_namespace_config = kubernetes_tools.load_service_namespace_config(
            service=service,
            namespace=job_config.get_nerve_namespace(),
            soa_dir=settings.soa_dir,
        )
        if "proxy_port" in service_namespace_config:
            kstatus["envoy"] = await mesh_status(
                service=service,
                service_mesh=ServiceMesh.ENVOY,
                instance=job_config.get_nerve_namespace(),
                job_config=job_config,
                service_namespace_config=service_namespace_config,
                pods_task=pods_task,
                should_return_individual_backends=verbose > 0,
                settings=settings,
            )
    return kstatus

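# Because of the a_sync.to_blocking decorator, kubernetes_status is invoked as
# a plain synchronous function. A hedged usage sketch (the service, instance,
# and settings values are illustrative placeholders; settings is expected to
# carry .cluster, .soa_dir, and .kubernetes_client):
def _example_kubernetes_status(settings: Any) -> Mapping[str, Any]:
    return kubernetes_status(
        service="example_service",
        instance="main",
        verbose=0,
        include_envoy=False,
        instance_type="kubernetes",
        settings=settings,
    )

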
def instance_status(
    service: str,
    instance: str,
    verbose: int,
    include_envoy: bool,
    use_new: bool,
    instance_type: str,
    settings: Any,
    all_namespaces: bool,
) -> Mapping[str, Any]:
    status = {}

    if not can_handle(instance_type):
        raise RuntimeError(
            f"Unknown instance type: {instance_type!r}, "
            f"can handle: {INSTANCE_TYPES}"
        )

    if instance_type in INSTANCE_TYPES_CR:
        status[instance_type] = cr_status(
            service=service,
            instance=instance,
            instance_type=instance_type,
            verbose=verbose,
            kube_client=settings.kubernetes_client,
        )

    if instance_type in INSTANCE_TYPES_K8S:
        if use_new:
            status["kubernetes_v2"] = kubernetes_status_v2(
                service=service,
                instance=instance,
                instance_type=instance_type,
                verbose=verbose,
                include_envoy=include_envoy,
                settings=settings,
                all_namespaces=all_namespaces,
            )
        else:
            status["kubernetes"] = kubernetes_status(
                service=service,
                instance=instance,
                instance_type=instance_type,
                verbose=verbose,
                include_envoy=include_envoy,
                settings=settings,
            )

    return status

def ready_replicas_from_replicaset(replicaset: V1ReplicaSet) -> int:
    try:
        ready_replicas = replicaset.status.ready_replicas
        if ready_replicas is None:
            ready_replicas = 0
    except AttributeError:
        ready_replicas = 0

    return ready_replicas

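# The Kubernetes API omits status.ready_replicas when no replicas are ready,
# so both the None check and the AttributeError fallback above are needed. A
# minimal sketch using the kubernetes client models (values illustrative):
def _example_ready_replicas() -> int:
    from kubernetes.client import V1ReplicaSet, V1ReplicaSetStatus

    rs = V1ReplicaSet(status=V1ReplicaSetStatus(replicas=3, ready_replicas=None))
    return ready_replicas_from_replicaset(rs)  # -> 0

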
@a_sync.to_blocking
async def kubernetes_mesh_status(
    service: str,
    instance: str,
    instance_type: str,
    settings: Any,
    include_envoy: bool = True,
) -> Mapping[str, Any]:

    if not include_envoy:
        raise RuntimeError("No mesh types specified when requesting mesh status")
    if instance_type not in LONG_RUNNING_INSTANCE_TYPE_HANDLERS:
        raise RuntimeError(
            f"Getting mesh status for {instance_type} instances is not supported"
        )

    config_loader = LONG_RUNNING_INSTANCE_TYPE_HANDLERS[instance_type].loader
    job_config = config_loader(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
        load_deployments=True,
    )
    service_namespace_config = kubernetes_tools.load_service_namespace_config(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        soa_dir=settings.soa_dir,
    )
    if "proxy_port" not in service_namespace_config:
        raise RuntimeError(
            f"Instance '{service}.{instance}' is not configured for the mesh"
        )

    kube_client = settings.kubernetes_client
    pods_task = asyncio.create_task(
        kubernetes_tools.pods_for_service_instance(
            service=job_config.service,
            instance=job_config.instance,
            kube_client=kube_client,
            namespace=job_config.get_kubernetes_namespace(),
        )
    )

    kmesh: Dict[str, Any] = {}
    mesh_status_kwargs = dict(
        service=service,
        instance=job_config.get_nerve_namespace(),
        job_config=job_config,
        service_namespace_config=service_namespace_config,
        pods_task=pods_task,
        should_return_individual_backends=True,
        settings=settings,
    )
    if include_envoy:
        kmesh["envoy"] = await mesh_status(
            service_mesh=ServiceMesh.ENVOY,
            **mesh_status_kwargs,
        )

    return kmesh