paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1110 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2016 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
import copy
|
|
16
|
+
import itertools
|
|
17
|
+
import math
|
|
18
|
+
import re
|
|
19
|
+
from collections import Counter
|
|
20
|
+
from collections import namedtuple
|
|
21
|
+
from typing import Any
|
|
22
|
+
from typing import Callable
|
|
23
|
+
from typing import Mapping
|
|
24
|
+
from typing import NamedTuple
|
|
25
|
+
from typing import Sequence
|
|
26
|
+
from typing import Tuple
|
|
27
|
+
from typing import TypeVar
|
|
28
|
+
|
|
29
|
+
import a_sync
|
|
30
|
+
from humanize import naturalsize
|
|
31
|
+
from kubernetes.client import V1Node
|
|
32
|
+
from kubernetes.client import V1Pod
|
|
33
|
+
from mypy_extensions import TypedDict
|
|
34
|
+
from typing_extensions import Counter as _Counter
|
|
35
|
+
|
|
36
|
+
from paasta_tools.kubernetes_tools import get_all_nodes_cached
|
|
37
|
+
from paasta_tools.kubernetes_tools import get_all_pods_cached
|
|
38
|
+
from paasta_tools.kubernetes_tools import get_pod_status
|
|
39
|
+
from paasta_tools.kubernetes_tools import is_node_ready
|
|
40
|
+
from paasta_tools.kubernetes_tools import KubeClient
|
|
41
|
+
from paasta_tools.kubernetes_tools import list_all_deployments
|
|
42
|
+
from paasta_tools.kubernetes_tools import paasta_prefixed
|
|
43
|
+
from paasta_tools.kubernetes_tools import PodStatus
|
|
44
|
+
from paasta_tools.mesos.master import MesosMetrics
|
|
45
|
+
from paasta_tools.mesos.master import MesosState
|
|
46
|
+
from paasta_tools.mesos_maintenance import MAINTENANCE_ROLE
|
|
47
|
+
from paasta_tools.mesos_tools import get_all_tasks_from_state
|
|
48
|
+
from paasta_tools.mesos_tools import get_mesos_quorum
|
|
49
|
+
from paasta_tools.mesos_tools import get_number_of_mesos_masters
|
|
50
|
+
from paasta_tools.mesos_tools import get_zookeeper_host_path
|
|
51
|
+
from paasta_tools.mesos_tools import is_task_terminal
|
|
52
|
+
from paasta_tools.mesos_tools import MesosResources
|
|
53
|
+
from paasta_tools.mesos_tools import MesosTask
|
|
54
|
+
from paasta_tools.utils import PaastaColors
|
|
55
|
+
from paasta_tools.utils import print_with_indent
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
DEFAULT_KUBERNETES_CPU_REQUEST = "100m"
|
|
59
|
+
DEFAULT_KUBERNETES_MEMORY_REQUEST = "200M"
|
|
60
|
+
DEFAULT_KUBERNETES_DISK_REQUEST = "0"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ResourceInfo(namedtuple("ResourceInfo", ["cpus", "mem", "disk", "gpus"])):
|
|
64
|
+
def __new__(cls, cpus, mem, disk, gpus=0):
|
|
65
|
+
return super().__new__(cls, cpus, mem, disk, gpus)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class HealthCheckResult(NamedTuple):
|
|
69
|
+
message: str
|
|
70
|
+
healthy: bool
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ResourceUtilization(NamedTuple):
|
|
74
|
+
metric: str
|
|
75
|
+
total: int
|
|
76
|
+
free: int
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_num_masters() -> int:
|
|
80
|
+
"""Gets the number of masters from mesos state"""
|
|
81
|
+
zookeeper_host_path = get_zookeeper_host_path()
|
|
82
|
+
return get_number_of_mesos_masters(
|
|
83
|
+
zookeeper_host_path.host, zookeeper_host_path.path
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_mesos_cpu_status(
|
|
88
|
+
metrics: MesosMetrics, mesos_state: MesosState
|
|
89
|
+
) -> Tuple[int, int, int]:
|
|
90
|
+
"""Takes in the mesos metrics and analyzes them, returning the status.
|
|
91
|
+
|
|
92
|
+
:param metrics: mesos metrics dictionary.
|
|
93
|
+
:param mesos_state: mesos state dictionary.
|
|
94
|
+
:returns: Tuple of total, used, and available CPUs.
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
total = metrics["master/cpus_total"]
|
|
98
|
+
used = metrics["master/cpus_used"]
|
|
99
|
+
|
|
100
|
+
for slave in mesos_state["slaves"]:
|
|
101
|
+
used += reserved_maintenence_resources(slave["reserved_resources"])["cpus"]
|
|
102
|
+
|
|
103
|
+
available = total - used
|
|
104
|
+
return total, used, available
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_kube_cpu_status(
|
|
108
|
+
nodes: Sequence[V1Node],
|
|
109
|
+
) -> Tuple[float, float, float]:
|
|
110
|
+
"""Takes in the list of Kubernetes nodes and analyzes them, returning the status.
|
|
111
|
+
|
|
112
|
+
:param nodes: list of Kubernetes nodes.
|
|
113
|
+
:returns: Tuple of total, used, and available CPUs.
|
|
114
|
+
"""
|
|
115
|
+
|
|
116
|
+
total = 0.0
|
|
117
|
+
available = 0.0
|
|
118
|
+
for node in nodes:
|
|
119
|
+
available += suffixed_number_value(node.status.allocatable["cpu"])
|
|
120
|
+
total += suffixed_number_value(node.status.capacity["cpu"])
|
|
121
|
+
|
|
122
|
+
used = total - available
|
|
123
|
+
return total, used, available
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_mesos_memory_status(
|
|
127
|
+
metrics: MesosMetrics, mesos_state: MesosState
|
|
128
|
+
) -> Tuple[int, int, int]:
|
|
129
|
+
"""Takes in the mesos metrics and analyzes them, returning the status.
|
|
130
|
+
|
|
131
|
+
:param metrics: mesos metrics dictionary.
|
|
132
|
+
:param mesos_state: mesos state dictionary.
|
|
133
|
+
:returns: Tuple of total, used, and available memory in Mi.
|
|
134
|
+
"""
|
|
135
|
+
total = metrics["master/mem_total"]
|
|
136
|
+
used = metrics["master/mem_used"]
|
|
137
|
+
|
|
138
|
+
for slave in mesos_state["slaves"]:
|
|
139
|
+
used += reserved_maintenence_resources(slave["reserved_resources"])["mem"]
|
|
140
|
+
|
|
141
|
+
available = total - used
|
|
142
|
+
|
|
143
|
+
return total, used, available
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_kube_memory_status(
|
|
147
|
+
nodes: Sequence[V1Node],
|
|
148
|
+
) -> Tuple[float, float, float]:
|
|
149
|
+
"""Takes in the list of Kubernetes nodes and analyzes them, returning the status.
|
|
150
|
+
|
|
151
|
+
:param nodes: list of Kubernetes nodes.
|
|
152
|
+
:returns: Tuple of total, used, and available memory in Mi.
|
|
153
|
+
"""
|
|
154
|
+
total = 0.0
|
|
155
|
+
available = 0.0
|
|
156
|
+
for node in nodes:
|
|
157
|
+
available += suffixed_number_value(node.status.allocatable["memory"])
|
|
158
|
+
total += suffixed_number_value(node.status.capacity["memory"])
|
|
159
|
+
|
|
160
|
+
total //= 1024 * 1024
|
|
161
|
+
available //= 1024 * 1024
|
|
162
|
+
used = total - available
|
|
163
|
+
return total, used, available
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def get_mesos_disk_status(
|
|
167
|
+
metrics: MesosMetrics, mesos_state: MesosState
|
|
168
|
+
) -> Tuple[int, int, int]:
|
|
169
|
+
"""Takes in the mesos metrics and analyzes them, returning the status.
|
|
170
|
+
|
|
171
|
+
:param metrics: mesos metrics dictionary.
|
|
172
|
+
:param mesos_state: mesos state dictionary.
|
|
173
|
+
:returns: Tuple of total, used, and available disk space in Mi.
|
|
174
|
+
"""
|
|
175
|
+
|
|
176
|
+
total = metrics["master/disk_total"]
|
|
177
|
+
used = metrics["master/disk_used"]
|
|
178
|
+
|
|
179
|
+
for slave in mesos_state["slaves"]:
|
|
180
|
+
used += reserved_maintenence_resources(slave["reserved_resources"])["disk"]
|
|
181
|
+
|
|
182
|
+
available = total - used
|
|
183
|
+
return total, used, available
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def get_kube_disk_status(
|
|
187
|
+
nodes: Sequence[V1Node],
|
|
188
|
+
) -> Tuple[float, float, float]:
|
|
189
|
+
"""Takes in the list of Kubernetes nodes and analyzes them, returning the status.
|
|
190
|
+
|
|
191
|
+
:param nodes: list of Kubernetes nodes.
|
|
192
|
+
:returns: Tuple of total, used, and available disk space in Mi.
|
|
193
|
+
"""
|
|
194
|
+
|
|
195
|
+
total = 0.0
|
|
196
|
+
available = 0.0
|
|
197
|
+
for node in nodes:
|
|
198
|
+
available += suffixed_number_value(node.status.allocatable["ephemeral-storage"])
|
|
199
|
+
total += suffixed_number_value(node.status.capacity["ephemeral-storage"])
|
|
200
|
+
|
|
201
|
+
total //= 1024 * 1024
|
|
202
|
+
available //= 1024 * 1024
|
|
203
|
+
used = total - available
|
|
204
|
+
return total, used, available
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def get_mesos_gpu_status(
|
|
208
|
+
metrics: MesosMetrics, mesos_state: MesosState
|
|
209
|
+
) -> Tuple[int, int, int]:
|
|
210
|
+
"""Takes in the mesos metrics and analyzes them, returning gpus status.
|
|
211
|
+
|
|
212
|
+
:param metrics: mesos metrics dictionary.
|
|
213
|
+
:param mesos_state: mesos state dictionary.
|
|
214
|
+
:returns: Tuple of total, used, and available GPUs.
|
|
215
|
+
"""
|
|
216
|
+
total = metrics["master/gpus_total"]
|
|
217
|
+
used = metrics["master/gpus_used"]
|
|
218
|
+
|
|
219
|
+
for slave in mesos_state["slaves"]:
|
|
220
|
+
used += reserved_maintenence_resources(slave["reserved_resources"])["gpus"]
|
|
221
|
+
|
|
222
|
+
available = total - used
|
|
223
|
+
return total, used, available
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def get_kube_gpu_status(
|
|
227
|
+
nodes: Sequence[V1Node],
|
|
228
|
+
) -> Tuple[float, float, float]:
|
|
229
|
+
"""Takes in the list of Kubernetes nodes and analyzes them, returning the status.
|
|
230
|
+
|
|
231
|
+
:param nodes: list of Kubernetes nodes.
|
|
232
|
+
:returns: Tuple of total, used, and available GPUs.
|
|
233
|
+
"""
|
|
234
|
+
|
|
235
|
+
total = 0.0
|
|
236
|
+
available = 0.0
|
|
237
|
+
for node in nodes:
|
|
238
|
+
available += suffixed_number_value(
|
|
239
|
+
node.status.allocatable.get("nvidia.com/gpu", "0")
|
|
240
|
+
)
|
|
241
|
+
total += suffixed_number_value(node.status.capacity.get("nvidia.com/gpu", "0"))
|
|
242
|
+
|
|
243
|
+
used = total - available
|
|
244
|
+
return total, used, available
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def filter_mesos_state_metrics(dictionary: Mapping[str, Any]) -> Mapping[str, Any]:
|
|
248
|
+
valid_keys = ["cpus", "mem", "disk", "gpus"]
|
|
249
|
+
return {key: value for (key, value) in dictionary.items() if key in valid_keys}
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def filter_kube_resources(dictionary: Mapping[str, str]) -> Mapping[str, str]:
|
|
253
|
+
valid_keys = ["cpu", "memory", "ephemeral-storage", "nvidia.com/gpu"]
|
|
254
|
+
return {key: value for (key, value) in dictionary.items() if key in valid_keys}
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class ResourceParser:
|
|
258
|
+
@staticmethod
|
|
259
|
+
def cpus(resources):
|
|
260
|
+
resources = resources or {}
|
|
261
|
+
return suffixed_number_value(
|
|
262
|
+
resources.get("cpu", DEFAULT_KUBERNETES_CPU_REQUEST)
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
@staticmethod
|
|
266
|
+
def mem(resources):
|
|
267
|
+
resources = resources or {}
|
|
268
|
+
return suffixed_number_value(
|
|
269
|
+
resources.get("memory", DEFAULT_KUBERNETES_MEMORY_REQUEST)
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
@staticmethod
|
|
273
|
+
def disk(resources):
|
|
274
|
+
resources = resources or {}
|
|
275
|
+
return suffixed_number_value(
|
|
276
|
+
resources.get("ephemeral-storage", DEFAULT_KUBERNETES_DISK_REQUEST)
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def allocated_node_resources(pods: Sequence[V1Pod]) -> Mapping[str, float]:
|
|
281
|
+
cpus = mem = disk = 0
|
|
282
|
+
for pod in pods:
|
|
283
|
+
cpus += sum(
|
|
284
|
+
ResourceParser.cpus(c.resources.requests) for c in pod.spec.containers
|
|
285
|
+
)
|
|
286
|
+
mem += sum(
|
|
287
|
+
ResourceParser.mem(c.resources.requests) for c in pod.spec.containers
|
|
288
|
+
)
|
|
289
|
+
disk += sum(
|
|
290
|
+
ResourceParser.disk(c.resources.requests) for c in pod.spec.containers
|
|
291
|
+
)
|
|
292
|
+
return {"cpu": cpus, "memory": mem, "ephemeral-storage": disk}
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def healthcheck_result_for_resource_utilization(
|
|
296
|
+
resource_utilization: ResourceUtilization, threshold: int
|
|
297
|
+
) -> HealthCheckResult:
|
|
298
|
+
"""Given a resource data dict, assert that cpu
|
|
299
|
+
data is ok.
|
|
300
|
+
|
|
301
|
+
:param resource_utilization: the resource_utilization tuple to check
|
|
302
|
+
:returns: a HealthCheckResult
|
|
303
|
+
"""
|
|
304
|
+
try:
|
|
305
|
+
utilization = percent_used(
|
|
306
|
+
resource_utilization.total,
|
|
307
|
+
resource_utilization.total - resource_utilization.free,
|
|
308
|
+
)
|
|
309
|
+
except ZeroDivisionError:
|
|
310
|
+
utilization = 0
|
|
311
|
+
message = "{}: {:.2f}/{:.2f}({:.2f}%) used. Threshold ({:.2f}%)".format(
|
|
312
|
+
resource_utilization.metric,
|
|
313
|
+
float(resource_utilization.total - resource_utilization.free),
|
|
314
|
+
resource_utilization.total,
|
|
315
|
+
utilization,
|
|
316
|
+
threshold,
|
|
317
|
+
)
|
|
318
|
+
healthy = utilization <= threshold
|
|
319
|
+
return HealthCheckResult(message=message, healthy=healthy)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def quorum_ok(masters: int, quorum: int) -> bool:
|
|
323
|
+
return masters >= quorum
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def check_threshold(percent_used: float, threshold: int) -> bool:
|
|
327
|
+
return (100 - percent_used) > threshold
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def percent_used(total: float, used: float) -> float:
|
|
331
|
+
return round(used / float(total) * 100.0, 2)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def assert_cpu_health(
|
|
335
|
+
cpu_status: Tuple[float, float, float], threshold: int = 10
|
|
336
|
+
) -> HealthCheckResult:
|
|
337
|
+
total, used, available = cpu_status
|
|
338
|
+
try:
|
|
339
|
+
perc_used = percent_used(total, used)
|
|
340
|
+
except ZeroDivisionError:
|
|
341
|
+
return HealthCheckResult(
|
|
342
|
+
message="Error reading total available cpu from mesos!", healthy=False
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
if check_threshold(perc_used, threshold):
|
|
346
|
+
return HealthCheckResult(
|
|
347
|
+
message="CPUs: %.2f / %d in use (%s)"
|
|
348
|
+
% (used, total, PaastaColors.green("%.2f%%" % perc_used)),
|
|
349
|
+
healthy=True,
|
|
350
|
+
)
|
|
351
|
+
else:
|
|
352
|
+
return HealthCheckResult(
|
|
353
|
+
message="CRITICAL: Less than %d%% CPUs available. (Currently using %.2f%% of %d)"
|
|
354
|
+
% (threshold, perc_used, total),
|
|
355
|
+
healthy=False,
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def assert_memory_health(
|
|
360
|
+
memory_status: Tuple[float, float, float], threshold: int = 10
|
|
361
|
+
) -> HealthCheckResult:
|
|
362
|
+
total: float
|
|
363
|
+
used: float
|
|
364
|
+
total, used, _ = memory_status
|
|
365
|
+
|
|
366
|
+
total /= 1024
|
|
367
|
+
used /= 1024
|
|
368
|
+
|
|
369
|
+
try:
|
|
370
|
+
perc_used = percent_used(total, used)
|
|
371
|
+
except ZeroDivisionError:
|
|
372
|
+
return HealthCheckResult(
|
|
373
|
+
message="Error reading total available memory from mesos!", healthy=False
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
if check_threshold(perc_used, threshold):
|
|
377
|
+
return HealthCheckResult(
|
|
378
|
+
message="Memory: %0.2f / %0.2fGB in use (%s)"
|
|
379
|
+
% (used, total, PaastaColors.green("%.2f%%" % perc_used)),
|
|
380
|
+
healthy=True,
|
|
381
|
+
)
|
|
382
|
+
else:
|
|
383
|
+
return HealthCheckResult(
|
|
384
|
+
message="CRITICAL: Less than %d%% memory available. (Currently using %.2f%% of %.2fGB)"
|
|
385
|
+
% (threshold, perc_used, total),
|
|
386
|
+
healthy=False,
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def assert_disk_health(
|
|
391
|
+
disk_status: Tuple[float, float, float], threshold: int = 10
|
|
392
|
+
) -> HealthCheckResult:
|
|
393
|
+
total: float
|
|
394
|
+
used: float
|
|
395
|
+
total, used, _ = disk_status
|
|
396
|
+
|
|
397
|
+
total /= 1024
|
|
398
|
+
used /= 1024
|
|
399
|
+
|
|
400
|
+
try:
|
|
401
|
+
perc_used = percent_used(total, used)
|
|
402
|
+
except ZeroDivisionError:
|
|
403
|
+
return HealthCheckResult(
|
|
404
|
+
message="Error reading total available disk from mesos!", healthy=False
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
if check_threshold(perc_used, threshold):
|
|
408
|
+
return HealthCheckResult(
|
|
409
|
+
message="Disk: %0.2f / %0.2fGB in use (%s)"
|
|
410
|
+
% (used, total, PaastaColors.green("%.2f%%" % perc_used)),
|
|
411
|
+
healthy=True,
|
|
412
|
+
)
|
|
413
|
+
else:
|
|
414
|
+
return HealthCheckResult(
|
|
415
|
+
message="CRITICAL: Less than %d%% disk available. (Currently using %.2f%%)"
|
|
416
|
+
% (threshold, perc_used),
|
|
417
|
+
healthy=False,
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def assert_gpu_health(
|
|
422
|
+
gpu_status: Tuple[float, float, float], threshold: int = 0
|
|
423
|
+
) -> HealthCheckResult:
|
|
424
|
+
total, used, available = gpu_status
|
|
425
|
+
|
|
426
|
+
if math.isclose(total, 0):
|
|
427
|
+
# assume that no gpus is healthy since most machines don't have them
|
|
428
|
+
return HealthCheckResult(message="No GPUs found!", healthy=True)
|
|
429
|
+
else:
|
|
430
|
+
perc_used = percent_used(total, used)
|
|
431
|
+
|
|
432
|
+
if check_threshold(perc_used, threshold):
|
|
433
|
+
# only whole gpus can be used
|
|
434
|
+
return HealthCheckResult(
|
|
435
|
+
message="GPUs: %d / %d in use (%s)"
|
|
436
|
+
% (used, total, PaastaColors.green("%.2f%%" % perc_used)),
|
|
437
|
+
healthy=True,
|
|
438
|
+
)
|
|
439
|
+
else:
|
|
440
|
+
return HealthCheckResult(
|
|
441
|
+
message="CRITICAL: Less than %d%% GPUs available. (Currently using %.2f%% of %d)"
|
|
442
|
+
% (threshold, perc_used, total),
|
|
443
|
+
healthy=False,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def assert_mesos_tasks_running(
|
|
448
|
+
metrics: MesosMetrics,
|
|
449
|
+
) -> HealthCheckResult:
|
|
450
|
+
running = metrics["master/tasks_running"]
|
|
451
|
+
staging = metrics["master/tasks_staging"]
|
|
452
|
+
starting = metrics["master/tasks_starting"]
|
|
453
|
+
return HealthCheckResult(
|
|
454
|
+
message="Tasks: running: %d staging: %d starting: %d"
|
|
455
|
+
% (running, staging, starting),
|
|
456
|
+
healthy=True,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def assert_kube_pods_running(
|
|
461
|
+
kube_client: KubeClient, namespace: str
|
|
462
|
+
) -> HealthCheckResult:
|
|
463
|
+
statuses = [
|
|
464
|
+
get_pod_status(pod) for pod in get_all_pods_cached(kube_client, namespace)
|
|
465
|
+
]
|
|
466
|
+
running = statuses.count(PodStatus.RUNNING)
|
|
467
|
+
pending = statuses.count(PodStatus.PENDING)
|
|
468
|
+
failed = statuses.count(PodStatus.FAILED)
|
|
469
|
+
healthy = running > 0
|
|
470
|
+
return HealthCheckResult(
|
|
471
|
+
message=f"Pods: running: {running} pending: {pending} failed: {failed}",
|
|
472
|
+
healthy=healthy,
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def get_mesos_slaves_health_status(
|
|
477
|
+
metrics: MesosMetrics,
|
|
478
|
+
) -> Tuple[int, int]:
|
|
479
|
+
return metrics["master/slaves_active"], metrics["master/slaves_inactive"]
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def get_kube_nodes_health_status(
|
|
483
|
+
nodes: Sequence[V1Node],
|
|
484
|
+
) -> Tuple[int, int]:
|
|
485
|
+
statuses = [is_node_ready(node) for node in nodes]
|
|
486
|
+
return statuses.count(True), statuses.count(False)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def assert_nodes_health(
|
|
490
|
+
nodes_health_status: Tuple[int, int],
|
|
491
|
+
) -> HealthCheckResult:
|
|
492
|
+
active, inactive = nodes_health_status
|
|
493
|
+
healthy = active > 0
|
|
494
|
+
return HealthCheckResult(
|
|
495
|
+
message="Nodes: active: %d inactive: %d" % (active, inactive), healthy=healthy
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def assert_quorum_size() -> HealthCheckResult:
|
|
500
|
+
masters, quorum = get_num_masters(), a_sync.block(get_mesos_quorum)
|
|
501
|
+
if quorum_ok(masters, quorum):
|
|
502
|
+
return HealthCheckResult(
|
|
503
|
+
message="Quorum: masters: %d configured quorum: %d " % (masters, quorum),
|
|
504
|
+
healthy=True,
|
|
505
|
+
)
|
|
506
|
+
else:
|
|
507
|
+
return HealthCheckResult(
|
|
508
|
+
message="CRITICAL: Number of masters (%d) less than configured quorum(%d)."
|
|
509
|
+
% (masters, quorum),
|
|
510
|
+
healthy=False,
|
|
511
|
+
)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
_KeyFuncRetT = Sequence[Tuple[str, str]]
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
class _SlaveT(TypedDict):
|
|
518
|
+
id: str
|
|
519
|
+
resources: MesosResources
|
|
520
|
+
reserved_resources: MesosResources
|
|
521
|
+
attributes: Mapping[str, str]
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
_GenericNodeT = TypeVar("_GenericNodeT", _SlaveT, V1Node)
|
|
525
|
+
|
|
526
|
+
_GenericNodeGroupingFunctionT = Callable[[_GenericNodeT], _KeyFuncRetT]
|
|
527
|
+
|
|
528
|
+
_GenericNodeFilterFunctionT = Callable[[_GenericNodeT], bool]
|
|
529
|
+
|
|
530
|
+
_GenericNodeSortFunctionT = Callable[[Sequence[_GenericNodeT]], Sequence[_GenericNodeT]]
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def key_func_for_attribute(
|
|
534
|
+
attribute: str,
|
|
535
|
+
) -> Callable[[_SlaveT], str]:
|
|
536
|
+
"""Return a closure that given a slave, will return the value of a specific
|
|
537
|
+
attribute.
|
|
538
|
+
|
|
539
|
+
:param attribute: the attribute to inspect in the slave
|
|
540
|
+
:returns: a closure, which takes a slave and returns the value of an attribute
|
|
541
|
+
"""
|
|
542
|
+
|
|
543
|
+
def key_func(slave):
|
|
544
|
+
return slave["attributes"].get(attribute, "unknown")
|
|
545
|
+
|
|
546
|
+
return key_func
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def key_func_for_attribute_multi(
|
|
550
|
+
attributes: Sequence[str],
|
|
551
|
+
) -> _GenericNodeGroupingFunctionT:
|
|
552
|
+
"""Return a closure that given a slave, will return the value of a list of
|
|
553
|
+
attributes, compiled into a hashable tuple
|
|
554
|
+
|
|
555
|
+
:param attributes: the attributes to inspect in the slave
|
|
556
|
+
:returns: a closure, which takes a slave and returns the value of those attributes
|
|
557
|
+
"""
|
|
558
|
+
|
|
559
|
+
def get_attribute(slave, attribute):
|
|
560
|
+
if attribute == "hostname":
|
|
561
|
+
return slave["hostname"]
|
|
562
|
+
else:
|
|
563
|
+
return slave["attributes"].get(attribute, "unknown")
|
|
564
|
+
|
|
565
|
+
def key_func(slave):
|
|
566
|
+
return tuple((a, get_attribute(slave, a)) for a in attributes)
|
|
567
|
+
|
|
568
|
+
return key_func
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def key_func_for_attribute_multi_kube(
|
|
572
|
+
attributes: Sequence[str],
|
|
573
|
+
) -> Callable[[V1Node], _KeyFuncRetT]:
|
|
574
|
+
"""Return a closure that given a node, will return the value of a list of
|
|
575
|
+
attributes, compiled into a hashable tuple
|
|
576
|
+
|
|
577
|
+
:param attributes: the attributes to inspect in the slave
|
|
578
|
+
:returns: a closure, which takes a node and returns the value of those attributes
|
|
579
|
+
"""
|
|
580
|
+
|
|
581
|
+
def get_attribute(node, attribute):
|
|
582
|
+
return node.metadata.labels.get(paasta_prefixed(attribute), "unknown")
|
|
583
|
+
|
|
584
|
+
def key_func(node):
|
|
585
|
+
return tuple((a, get_attribute(node, a)) for a in attributes)
|
|
586
|
+
|
|
587
|
+
return key_func
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def sort_func_for_attributes(
|
|
591
|
+
attributes: Sequence[str],
|
|
592
|
+
) -> _GenericNodeSortFunctionT:
|
|
593
|
+
def sort(slaves):
|
|
594
|
+
for attribute in attributes:
|
|
595
|
+
slaves = sorted(slaves, key=key_func_for_attribute(attribute))
|
|
596
|
+
return slaves
|
|
597
|
+
|
|
598
|
+
return sort
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def group_slaves_by_key_func(
|
|
602
|
+
key_func: _GenericNodeGroupingFunctionT,
|
|
603
|
+
slaves: Sequence[_GenericNodeT],
|
|
604
|
+
sort_func: _GenericNodeSortFunctionT = None,
|
|
605
|
+
) -> Mapping[_KeyFuncRetT, Sequence[_GenericNodeT]]:
|
|
606
|
+
"""Given a function for grouping slaves, return a
|
|
607
|
+
dict where keys are the unique values returned by
|
|
608
|
+
the key_func and the values are all those slaves which
|
|
609
|
+
have that specific value.
|
|
610
|
+
|
|
611
|
+
:param key_func: a function which consumes a slave and returns a value
|
|
612
|
+
:param slaves: a list of slaves
|
|
613
|
+
:returns: a dict of key: [slaves]
|
|
614
|
+
"""
|
|
615
|
+
sorted_slaves: Sequence[_GenericNodeT]
|
|
616
|
+
if sort_func is None:
|
|
617
|
+
sorted_slaves = sorted(slaves, key=key_func)
|
|
618
|
+
else:
|
|
619
|
+
sorted_slaves = sort_func(slaves)
|
|
620
|
+
|
|
621
|
+
return {k: list(v) for k, v in itertools.groupby(sorted_slaves, key=key_func)}
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
class ResourceUtilizationDict(TypedDict):
|
|
625
|
+
free: ResourceInfo
|
|
626
|
+
total: ResourceInfo
|
|
627
|
+
slave_count: int
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def calculate_resource_utilization_for_slaves(
|
|
631
|
+
slaves: Sequence[_SlaveT], tasks: Sequence[MesosTask]
|
|
632
|
+
) -> ResourceUtilizationDict:
|
|
633
|
+
"""Given a list of slaves and a list of tasks, calculate the total available
|
|
634
|
+
resource available in that list of slaves, and the resources consumed by tasks
|
|
635
|
+
running on those slaves.
|
|
636
|
+
|
|
637
|
+
:param slaves: a list of slaves to calculate resource usage for
|
|
638
|
+
:param tasks: the list of tasks running in the mesos cluster
|
|
639
|
+
:returns: a dict, containing keys for "free" and "total" resources. Each of these keys
|
|
640
|
+
is a ResourceInfo tuple, exposing a number for cpu, disk and mem.
|
|
641
|
+
"""
|
|
642
|
+
resource_total_dict: _Counter[str] = Counter()
|
|
643
|
+
for slave in slaves:
|
|
644
|
+
filtered_resources = filter_mesos_state_metrics(slave["resources"])
|
|
645
|
+
resource_total_dict.update(Counter(filtered_resources))
|
|
646
|
+
resource_free_dict = copy.deepcopy(resource_total_dict)
|
|
647
|
+
for task in tasks:
|
|
648
|
+
task_resources = task["resources"]
|
|
649
|
+
resource_free_dict.subtract(Counter(filter_mesos_state_metrics(task_resources)))
|
|
650
|
+
for slave in slaves:
|
|
651
|
+
filtered_resources = filter_mesos_state_metrics(
|
|
652
|
+
reserved_maintenence_resources(slave["reserved_resources"])
|
|
653
|
+
)
|
|
654
|
+
resource_free_dict.subtract(Counter(filtered_resources))
|
|
655
|
+
return {
|
|
656
|
+
"free": ResourceInfo(
|
|
657
|
+
cpus=resource_free_dict["cpus"],
|
|
658
|
+
disk=resource_free_dict["disk"],
|
|
659
|
+
mem=resource_free_dict["mem"],
|
|
660
|
+
gpus=resource_free_dict.get("gpus", 0),
|
|
661
|
+
),
|
|
662
|
+
"total": ResourceInfo(
|
|
663
|
+
cpus=resource_total_dict["cpus"],
|
|
664
|
+
disk=resource_total_dict["disk"],
|
|
665
|
+
mem=resource_total_dict["mem"],
|
|
666
|
+
gpus=resource_total_dict.get("gpus", 0),
|
|
667
|
+
),
|
|
668
|
+
"slave_count": len(slaves),
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
_IEC_NUMBER_SUFFIXES = {
|
|
673
|
+
"k": 1000,
|
|
674
|
+
"m": 1000**-1,
|
|
675
|
+
"M": 1000**2,
|
|
676
|
+
"G": 1000**3,
|
|
677
|
+
"T": 1000**4,
|
|
678
|
+
"P": 1000**5,
|
|
679
|
+
"Ki": 1024,
|
|
680
|
+
"Mi": 1024**2,
|
|
681
|
+
"Gi": 1024**3,
|
|
682
|
+
"Ti": 1024**4,
|
|
683
|
+
"Pi": 1024**5,
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def suffixed_number_value(s: str) -> float:
|
|
688
|
+
pattern = r"(?P<number>\d+)(?P<suff>\w*)"
|
|
689
|
+
match = re.match(pattern, s)
|
|
690
|
+
number, suff = match.groups()
|
|
691
|
+
|
|
692
|
+
if suff in _IEC_NUMBER_SUFFIXES:
|
|
693
|
+
return float(number) * _IEC_NUMBER_SUFFIXES[suff]
|
|
694
|
+
else:
|
|
695
|
+
return float(number)
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def suffixed_number_dict_values(d: Mapping[Any, str]) -> Mapping[Any, float]:
|
|
699
|
+
return {k: suffixed_number_value(v) for k, v in d.items()}
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def calculate_resource_utilization_for_kube_nodes(
|
|
703
|
+
nodes: Sequence[V1Node],
|
|
704
|
+
pods_by_node: Mapping[str, Sequence[V1Pod]],
|
|
705
|
+
) -> ResourceUtilizationDict:
|
|
706
|
+
"""Given a list of Kubernetes nodes, calculate the total available
|
|
707
|
+
resource available and the resources consumed in that list of nodes.
|
|
708
|
+
|
|
709
|
+
:param nodes: a list of Kubernetes nodes to calculate resource usage for
|
|
710
|
+
:returns: a dict, containing keys for "free" and "total" resources. Each of these keys
|
|
711
|
+
is a ResourceInfo tuple, exposing a number for cpu, disk and mem.
|
|
712
|
+
"""
|
|
713
|
+
resource_total_dict: _Counter[str] = Counter()
|
|
714
|
+
resource_free_dict: _Counter[str] = Counter()
|
|
715
|
+
for node in nodes:
|
|
716
|
+
allocatable_resources = suffixed_number_dict_values(
|
|
717
|
+
filter_kube_resources(node.status.allocatable)
|
|
718
|
+
)
|
|
719
|
+
resource_total_dict.update(Counter(allocatable_resources))
|
|
720
|
+
allocated_resources = allocated_node_resources(pods_by_node[node.metadata.name])
|
|
721
|
+
resource_free_dict.update(
|
|
722
|
+
Counter(
|
|
723
|
+
{
|
|
724
|
+
"cpu": allocatable_resources["cpu"] - allocated_resources["cpu"],
|
|
725
|
+
"ephemeral-storage": allocatable_resources["ephemeral-storage"]
|
|
726
|
+
- allocated_resources["ephemeral-storage"],
|
|
727
|
+
"memory": allocatable_resources["memory"]
|
|
728
|
+
- allocated_resources["memory"],
|
|
729
|
+
}
|
|
730
|
+
)
|
|
731
|
+
)
|
|
732
|
+
return {
|
|
733
|
+
"free": ResourceInfo(
|
|
734
|
+
cpus=resource_free_dict["cpu"],
|
|
735
|
+
disk=resource_free_dict["ephemeral-storage"] / (1024**2),
|
|
736
|
+
mem=resource_free_dict["memory"] / (1024**2),
|
|
737
|
+
gpus=resource_free_dict.get("nvidia.com/gpu", 0),
|
|
738
|
+
),
|
|
739
|
+
"total": ResourceInfo(
|
|
740
|
+
cpus=resource_total_dict["cpu"],
|
|
741
|
+
disk=resource_total_dict["ephemeral-storage"] / (1024**2),
|
|
742
|
+
mem=resource_total_dict["memory"] / (1024**2),
|
|
743
|
+
gpus=resource_total_dict.get("nvidia.com/gpu", 0),
|
|
744
|
+
),
|
|
745
|
+
"slave_count": len(nodes),
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def filter_tasks_for_slaves(
|
|
750
|
+
slaves: Sequence[_SlaveT], tasks: Sequence[MesosTask]
|
|
751
|
+
) -> Sequence[MesosTask]:
|
|
752
|
+
"""Given a list of slaves and a list of tasks, return a filtered
|
|
753
|
+
list of tasks, where those returned belong to slaves in the list of
|
|
754
|
+
slaves
|
|
755
|
+
|
|
756
|
+
:param slaves: the list of slaves which the tasks provided should be
|
|
757
|
+
running on.
|
|
758
|
+
:param tasks: the tasks to filter :returns: a list of tasks,
|
|
759
|
+
identical to that provided by the tasks param, but with only those where
|
|
760
|
+
the task is running on one of the provided slaves included.
|
|
761
|
+
"""
|
|
762
|
+
slave_ids = [slave["id"] for slave in slaves]
|
|
763
|
+
return [task for task in tasks if task["slave_id"] in slave_ids]
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def make_filter_slave_func(
|
|
767
|
+
attribute: str, values: Sequence[str]
|
|
768
|
+
) -> _GenericNodeFilterFunctionT:
|
|
769
|
+
def filter_func(slave):
|
|
770
|
+
return slave["attributes"].get(attribute, None) in values
|
|
771
|
+
|
|
772
|
+
return filter_func
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
def filter_slaves(
|
|
776
|
+
slaves: Sequence[_GenericNodeT], filters: Sequence[_GenericNodeFilterFunctionT]
|
|
777
|
+
) -> Sequence[_GenericNodeT]:
|
|
778
|
+
"""Filter slaves by attributes
|
|
779
|
+
|
|
780
|
+
:param slaves: list of slaves to filter
|
|
781
|
+
:param filters: list of functions that take a slave and return whether the
|
|
782
|
+
slave should be included
|
|
783
|
+
:returns: list of slaves that return true for all the filters
|
|
784
|
+
"""
|
|
785
|
+
if filters is None:
|
|
786
|
+
return slaves
|
|
787
|
+
return [s for s in slaves if all([f(s) for f in filters])]
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def get_resource_utilization_by_grouping(
|
|
791
|
+
grouping_func: _GenericNodeGroupingFunctionT,
|
|
792
|
+
mesos_state: MesosState,
|
|
793
|
+
filters: Sequence[_GenericNodeFilterFunctionT] = [],
|
|
794
|
+
sort_func: _GenericNodeSortFunctionT = None,
|
|
795
|
+
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
|
|
796
|
+
"""Given a function used to group slaves and mesos state, calculate
|
|
797
|
+
resource utilization for each value of a given attribute.
|
|
798
|
+
|
|
799
|
+
:grouping_func: a function that given a slave, will return the value of an
|
|
800
|
+
attribute to group by.
|
|
801
|
+
:param mesos_state: the mesos state
|
|
802
|
+
:param filters: filters to apply to the slaves in the calculation, with
|
|
803
|
+
filtering preformed by filter_slaves
|
|
804
|
+
:param sort_func: a function that given a list of slaves, will return the
|
|
805
|
+
sorted list of slaves.
|
|
806
|
+
:returns: a dict of {attribute_value: resource_usage}, where resource usage
|
|
807
|
+
is the dict returned by ``calculate_resource_utilization_for_slaves`` for
|
|
808
|
+
slaves grouped by attribute value.
|
|
809
|
+
"""
|
|
810
|
+
slaves: Sequence[_SlaveT] = mesos_state.get("slaves", [])
|
|
811
|
+
slaves = filter_slaves(slaves, filters)
|
|
812
|
+
if not has_registered_slaves(mesos_state):
|
|
813
|
+
raise ValueError("There are no slaves registered in the mesos state.")
|
|
814
|
+
|
|
815
|
+
tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
|
|
816
|
+
non_terminal_tasks = [task for task in tasks if not is_task_terminal(task)]
|
|
817
|
+
slave_groupings = group_slaves_by_key_func(grouping_func, slaves, sort_func)
|
|
818
|
+
|
|
819
|
+
return {
|
|
820
|
+
attribute_value: calculate_resource_utilization_for_slaves(
|
|
821
|
+
slaves=slaves, tasks=filter_tasks_for_slaves(slaves, non_terminal_tasks)
|
|
822
|
+
)
|
|
823
|
+
for attribute_value, slaves in slave_groupings.items()
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def get_resource_utilization_by_grouping_kube(
|
|
828
|
+
grouping_func: _GenericNodeGroupingFunctionT,
|
|
829
|
+
kube_client: KubeClient,
|
|
830
|
+
*,
|
|
831
|
+
namespace: str,
|
|
832
|
+
filters: Sequence[_GenericNodeFilterFunctionT] = [],
|
|
833
|
+
sort_func: _GenericNodeSortFunctionT = None,
|
|
834
|
+
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
|
|
835
|
+
"""Given a function used to group nodes, calculate resource utilization
|
|
836
|
+
for each value of a given attribute.
|
|
837
|
+
|
|
838
|
+
:grouping_func: a function that given a node, will return the value of an
|
|
839
|
+
attribute to group by.
|
|
840
|
+
:param kube_client: the Kubernetes client
|
|
841
|
+
:param filters: filters to apply to the nodes in the calculation, with
|
|
842
|
+
filtering preformed by filter_slaves
|
|
843
|
+
:param sort_func: a function that given a list of nodes, will return the
|
|
844
|
+
sorted list of nodes.
|
|
845
|
+
:returns: a dict of {attribute_value: resource_usage}, where resource usage
|
|
846
|
+
is the dict returned by ``calculate_resource_utilization_for_kube_nodes`` for
|
|
847
|
+
nodes grouped by attribute value.
|
|
848
|
+
"""
|
|
849
|
+
nodes = get_all_nodes_cached(kube_client)
|
|
850
|
+
nodes = filter_slaves(nodes, filters)
|
|
851
|
+
if len(nodes) == 0:
|
|
852
|
+
raise ValueError("There are no nodes registered in the Kubernetes.")
|
|
853
|
+
|
|
854
|
+
node_groupings = group_slaves_by_key_func(grouping_func, nodes, sort_func)
|
|
855
|
+
|
|
856
|
+
pods = get_all_pods_cached(kube_client, namespace)
|
|
857
|
+
|
|
858
|
+
pods_by_node = {}
|
|
859
|
+
for node in nodes:
|
|
860
|
+
pods_by_node[node.metadata.name] = [
|
|
861
|
+
pod for pod in pods if pod.spec.node_name == node.metadata.name
|
|
862
|
+
]
|
|
863
|
+
return {
|
|
864
|
+
attribute_value: calculate_resource_utilization_for_kube_nodes(
|
|
865
|
+
nodes, pods_by_node
|
|
866
|
+
)
|
|
867
|
+
for attribute_value, nodes in node_groupings.items()
|
|
868
|
+
}
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def resource_utillizations_from_resource_info(
|
|
872
|
+
total: ResourceInfo, free: ResourceInfo
|
|
873
|
+
) -> Sequence[ResourceUtilization]:
|
|
874
|
+
"""
|
|
875
|
+
Given two ResourceInfo tuples, one for total and one for free,
|
|
876
|
+
create a ResourceUtilization tuple for each metric in the ResourceInfo.
|
|
877
|
+
:param total:
|
|
878
|
+
:param free:
|
|
879
|
+
:returns: ResourceInfo for a metric
|
|
880
|
+
"""
|
|
881
|
+
return [
|
|
882
|
+
ResourceUtilization(metric=field, total=total[index], free=free[index])
|
|
883
|
+
for index, field in enumerate(ResourceInfo._fields)
|
|
884
|
+
]
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def has_registered_slaves(
|
|
888
|
+
mesos_state: MesosState,
|
|
889
|
+
) -> bool:
|
|
890
|
+
"""Return a boolean indicating if there are any slaves registered
|
|
891
|
+
to the master according to the mesos state.
|
|
892
|
+
:param mesos_state: the mesos state from the master
|
|
893
|
+
:returns: a boolean, indicating if there are > 0 slaves
|
|
894
|
+
"""
|
|
895
|
+
return len(mesos_state.get("slaves", [])) > 0
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
def get_mesos_resource_utilization_health(
|
|
899
|
+
mesos_metrics: MesosMetrics, mesos_state: MesosState
|
|
900
|
+
) -> Sequence[HealthCheckResult]:
|
|
901
|
+
"""Perform healthchecks against mesos metrics.
|
|
902
|
+
:param mesos_metrics: a dict exposing the mesos metrics described in
|
|
903
|
+
https://mesos.apache.org/documentation/latest/monitoring/
|
|
904
|
+
:returns: a list of HealthCheckResult tuples
|
|
905
|
+
"""
|
|
906
|
+
return [
|
|
907
|
+
assert_cpu_health(get_mesos_cpu_status(mesos_metrics, mesos_state)),
|
|
908
|
+
assert_memory_health(get_mesos_memory_status(mesos_metrics, mesos_state)),
|
|
909
|
+
assert_disk_health(get_mesos_disk_status(mesos_metrics, mesos_state)),
|
|
910
|
+
assert_gpu_health(get_mesos_gpu_status(mesos_metrics, mesos_state)),
|
|
911
|
+
assert_mesos_tasks_running(mesos_metrics),
|
|
912
|
+
assert_nodes_health(get_mesos_slaves_health_status(mesos_metrics)),
|
|
913
|
+
]
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def get_kube_resource_utilization_health(
    kube_client: KubeClient,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against Kubernetes.
    :param kube_client: the Kubernetes client
    :returns: a list of HealthCheckResult tuples
    """

    nodes = get_all_nodes_cached(kube_client)

    return [
        assert_cpu_health(get_kube_cpu_status(nodes)),
        assert_memory_health(get_kube_memory_status(nodes)),
        assert_disk_health(get_kube_disk_status(nodes)),
        assert_gpu_health(get_kube_gpu_status(nodes)),
        assert_nodes_health(get_kube_nodes_health_status(nodes)),
    ]

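The health lists returned by the two functions above are typically reduced to an overall status by the helpers defined further down in this file (critical_events_in_outputs, status_for_results, generate_summary_for_check). A rough, self-contained sketch of that flow, using a stand-in HealthCheckResult namedtuple and made-up messages:

    # HealthCheckResult here is a stand-in; the real type is defined elsewhere in the package.
    from collections import namedtuple

    HealthCheckResult = namedtuple("HealthCheckResult", ["message", "healthy"])

    results = [
        HealthCheckResult(message="CPU: 40% used", healthy=True),
        HealthCheckResult(message="Memory: 97% used", healthy=False),
    ]

    overall_ok = all(result.healthy for result in results)
    unhealthy = [result for result in results if result.healthy is False]
    print("OK" if overall_ok else "CRITICAL")  # CRITICAL
    print([r.message for r in unhealthy])      # ['Memory: 97% used']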
def get_mesos_state_status(
    mesos_state: MesosState,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against mesos state.
    :param mesos_state: a dict exposing the mesos state described in
    https://mesos.apache.org/documentation/latest/endpoints/master/state.json/
    :returns: a list of HealthCheckResult tuples
    """
    return [
        assert_quorum_size(),
    ]

def run_healthchecks_with_param(
    param: Any,
    healthcheck_functions: Sequence[Callable[..., HealthCheckResult]],
    format_options: Mapping[str, Any] = {},
) -> Sequence[HealthCheckResult]:
    return [
        healthcheck(*param, **format_options) for healthcheck in healthcheck_functions
    ]

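A small, self-contained usage sketch of the calling convention above: the list passed as param is star-unpacked into each healthcheck function, so every check must accept the same positional arguments. The check functions and values below are hypothetical, and the helper is a simplified, untyped copy for the sketch only.

    # Hypothetical checks; callers such as get_kube_status pass [kube_client, namespace].
    def check_a(client, namespace):
        return f"a({client}, {namespace})"

    def check_b(client, namespace, extra="x"):
        return f"b({client}, {namespace}, {extra})"

    # Simplified stand-in mirroring the function above.
    def run_healthchecks_with_param(param, healthcheck_functions, format_options={}):
        return [healthcheck(*param, **format_options) for healthcheck in healthcheck_functions]

    print(run_healthchecks_with_param(["kube", "paasta"], [check_a, check_b]))
    # ['a(kube, paasta)', 'b(kube, paasta, x)']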
def assert_kube_deployments(
    kube_client: KubeClient, namespace: str
) -> HealthCheckResult:
    num_deployments = len(list_all_deployments(kube_client, namespace))
    return HealthCheckResult(
        message=f"Kubernetes deployments: {num_deployments:>3}", healthy=True
    )

def get_kube_status(
    kube_client: KubeClient, namespace: str
) -> Sequence[HealthCheckResult]:
    """Gather information about Kubernetes.
    :param kube_client: the Kubernetes client
    :param namespace: the namespace to inspect
    :returns: a list of HealthCheckResult tuples
    """
    return run_healthchecks_with_param(
        [kube_client, namespace], [assert_kube_deployments, assert_kube_pods_running]
    )

def critical_events_in_outputs(healthcheck_outputs):
    """Given a list of HealthCheckResults, return those which are unhealthy."""
    return [
        healthcheck
        for healthcheck in healthcheck_outputs
        if healthcheck.healthy is False
    ]

def generate_summary_for_check(name, ok):
    """Given a check name and a boolean indicating if the service is OK, return
    a formatted message.
    """
    status = PaastaColors.green("OK") if ok is True else PaastaColors.red("CRITICAL")
    summary = f"{name} Status: {status}"
    return summary

def status_for_results(healthcheck_results):
    """Given a list of HealthCheckResult tuples, return the ok status
    for each one.
    :param healthcheck_results: a list of HealthCheckResult tuples
    :returns: a list of booleans.
    """
    return [result.healthy for result in healthcheck_results]

def print_results_for_healthchecks(summary, ok, results, verbose, indent=2):
    print(summary)
    if verbose >= 1:
        for health_check_result in results:
            if health_check_result.healthy:
                print_with_indent(health_check_result.message, indent)
            else:
                print_with_indent(PaastaColors.red(health_check_result.message), indent)
    elif not ok:
        unhealthy_results = critical_events_in_outputs(results)
        for health_check_result in unhealthy_results:
            print_with_indent(PaastaColors.red(health_check_result.message), indent)

def healthcheck_result_resource_utilization_pair_for_resource_utilization(
    utilization, threshold
):
    """Given a ResourceUtilization, produce a tuple of (HealthCheckResult, ResourceUtilization),
    where that HealthCheckResult describes the 'health' of a given utilization.
    :param utilization: a ResourceUtilization tuple
    :param threshold: a threshold which decides the health of the given ResourceUtilization
    :returns: a tuple of (HealthCheckResult, ResourceUtilization)
    """
    return (
        healthcheck_result_for_resource_utilization(utilization, threshold),
        utilization,
    )

def format_table_column_for_healthcheck_resource_utilization_pair(
    healthcheck_utilization_pair,
):
    """Given a tuple of (HealthCheckResult, ResourceUtilization), return a
    string representation of the ResourceUtilization such that it is formatted
    according to the value of HealthCheckResult.healthy.

    :param healthcheck_utilization_pair: a tuple of (HealthCheckResult, ResourceUtilization)
    :returns: a string representing the ResourceUtilization.
    """
    color_func = (
        PaastaColors.green
        if healthcheck_utilization_pair[0].healthy
        else PaastaColors.red
    )
    utilization = (
        healthcheck_utilization_pair[1].total - healthcheck_utilization_pair[1].free
    )
    if int(healthcheck_utilization_pair[1].total) == 0:
        utilization_perc = 100
    else:
        utilization_perc = (
            utilization / float(healthcheck_utilization_pair[1].total) * 100
        )
    if healthcheck_utilization_pair[1].metric not in ["cpus", "gpus"]:
        return color_func(
            "{}/{} ({:.2f}%)".format(
                naturalsize(utilization * 1024 * 1024, gnu=True),
                naturalsize(
                    healthcheck_utilization_pair[1].total * 1024 * 1024, gnu=True
                ),
                utilization_perc,
            )
        )
    else:
        return color_func(
            "{:.2f}/{:.0f} ({:.2f}%)".format(
                utilization, healthcheck_utilization_pair[1].total, utilization_perc
            )
        )

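For metrics other than cpus and gpus, the function above treats the stored numbers as MiB (hence the multiplication by 1024 * 1024 before handing them to humanize's naturalsize). A quick, self-contained sketch of that arithmetic with made-up numbers:

    # Worked example of the memory formatting path; values are illustrative only.
    from humanize import naturalsize

    total_mem_mb = 131072.0  # 128 GiB expressed in MiB
    free_mem_mb = 32768.0
    used_mem_mb = total_mem_mb - free_mem_mb
    used_perc = used_mem_mb / total_mem_mb * 100

    print("{}/{} ({:.2f}%)".format(
        naturalsize(used_mem_mb * 1024 * 1024, gnu=True),
        naturalsize(total_mem_mb * 1024 * 1024, gnu=True),
        used_perc,
    ))
    # 96.0G/128.0G (75.00%)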
def format_row_for_resource_utilization_healthchecks(healthcheck_utilization_pairs):
    """Given a list of (HealthCheckResult, ResourceUtilization) tuples, return a list with each of those
    tuples represented by a formatted string.

    :param healthcheck_utilization_pairs: a list of (HealthCheckResult, ResourceUtilization) tuples.
    :returns: a list containing a string representation of each (HealthCheckResult, ResourceUtilization) tuple.
    """
    return [
        format_table_column_for_healthcheck_resource_utilization_pair(pair)
        for pair in healthcheck_utilization_pairs
    ]

def get_table_rows_for_resource_info_dict(
    attribute_values, healthcheck_utilization_pairs
):
    """A wrapper method that joins the attribute values for a row with the
    formatted resource-utilization columns for that row.

    :param attribute_values: the values of the grouping attributes associated with
        the row; these become the leading entries of the returned row.
    :param healthcheck_utilization_pairs: a list of 2-tuples, where each tuple has the elements
        (HealthCheckResult, ResourceUtilization)
    :returns: a list of strings, representing a row in a table to be formatted.
    """
    return attribute_values + format_row_for_resource_utilization_healthchecks(
        healthcheck_utilization_pairs
    )

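A trivial sketch of what the concatenation above produces for a hypothetical grouping by region and pool; the attribute names and formatted strings are made up:

    attribute_values = ["us-west-1", "default"]  # leading cells for the row
    formatted_columns = ["24.00/32 (75.00%)", "96.0G/128.0G (75.00%)"]

    row = attribute_values + formatted_columns
    print(row)
    # ['us-west-1', 'default', '24.00/32 (75.00%)', '96.0G/128.0G (75.00%)']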
def reserved_maintenence_resources(
    resources: MesosResources,
):
    return resources.get(MAINTENANCE_ROLE, {"cpus": 0, "mem": 0, "disk": 0, "gpus": 0})
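The behaviour above is a plain dict lookup with a zeroed default when nothing is reserved under the maintenance role. A tiny self-contained sketch; the value assigned to MAINTENANCE_ROLE here is an assumption for illustration:

    MAINTENANCE_ROLE = "maintenance"  # assumed value, defined elsewhere in the package

    reserved = {"maintenance": {"cpus": 4, "mem": 8192, "disk": 0, "gpus": 0}}
    nothing_reserved = {}

    print(reserved.get(MAINTENANCE_ROLE, {"cpus": 0, "mem": 0, "disk": 0, "gpus": 0}))
    # {'cpus': 4, 'mem': 8192, 'disk': 0, 'gpus': 0}
    print(nothing_reserved.get(MAINTENANCE_ROLE, {"cpus": 0, "mem": 0, "disk": 0, "gpus": 0}))
    # {'cpus': 0, 'mem': 0, 'disk': 0, 'gpus': 0}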