paasta-tools 1.21.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0

paasta_tools/yaml_tools.py
@@ -0,0 +1,44 @@
+import sys
+
+import yaml
+
+# try to match both /opt/venvs/paasta-tools and ~/pg/paasta/.tox/py38-linux: if we're being run as an application,
+# we likely want to fail on a potential slowdown rather than experience a performance regression
+if "paasta" in sys.prefix:
+    from yaml import CSafeLoader as Loader
+    from yaml import CSafeDumper as Dumper
+# but for the vanishingly few instances where folks add us as a dependency, we don't enforce that they use the
+# C-accelerated Loader/Dumper
+else:
+    try:
+        from yaml import CSafeLoader as Loader
+        from yaml import CSafeDumper as Dumper
+    except ImportError:  # pragma: no cover
+        from yaml import SafeLoader as Loader  # type: ignore
+        from yaml import SafeDumper as Dumper  # type: ignore
+
+
+def dump(*args, **kwargs):
+    kwargs["Dumper"] = Dumper
+    return yaml.dump(*args, **kwargs)
+
+
+def dump_all(*args, **kwargs):
+    kwargs["Dumper"] = Dumper
+    return yaml.dump_all(*args, **kwargs)
+
+
+def load(*args, **kwargs):
+    kwargs["Loader"] = Loader
+    return yaml.load(*args, **kwargs)
+
+
+def load_all(*args, **kwargs):
+    kwargs["Loader"] = Loader
+    return yaml.load_all(*args, **kwargs)
+
+
+safe_dump = dump
+safe_dump_all = dump_all
+safe_load = load
+safe_load_all = load_all
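
The module above (paasta_tools/yaml_tools.py, per the listing) wraps PyYAML so every call site gets the C-accelerated loader/dumper whenever libyaml is available. A minimal usage sketch; the file name and keys here are invented for illustration:

    # Hypothetical usage of paasta_tools.yaml_tools; the path and keys are
    # illustrative, not taken from the package.
    from paasta_tools import yaml_tools as yaml

    with open("service.yaml") as f:
        config = yaml.load(f)   # parses with CSafeLoader when libyaml is present

    config["instances"] = 3
    print(yaml.dump(config))    # emits with CSafeDumper when libyaml is present

Since `safe_load`/`safe_dump` are aliased to the same wrappers, callers migrating from `yaml.safe_load` keep working unchanged.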

paasta_tools-1.21.3.data/scripts/apply_external_resources.py
@@ -0,0 +1,79 @@
+#!python
+import os
+import sys
+from filecmp import cmp
+from shutil import copy
+from subprocess import CalledProcessError
+from subprocess import run
+from traceback import print_exc
+
+APPLIED_DIRECTORY = ".applied"
+
+
+# This script expects the KUBECONFIG environment variable to be set correctly
+def main(puppet_resource_root: str) -> int:
+    exit_code = 0
+    applied_resource_root = os.path.join(puppet_resource_root, APPLIED_DIRECTORY)
+
+    # Loop through everything in the puppet resource path
+    for root, dirs, files in os.walk(puppet_resource_root):
+        # modifying the 'dirs' variable in-place will update the order that os.walk visits things
+        if APPLIED_DIRECTORY in dirs:
+            dirs.remove(APPLIED_DIRECTORY)
+        dirs.sort()  # Need to apply things in the correct order
+
+        # Check to see if there's a difference between what Puppet created and
+        # what's been previously applied
+        for filename in sorted([f for f in files if f.endswith(".yaml")]):
+            path = os.path.join(root, filename)
+            applied_path = os.path.join(
+                applied_resource_root, os.path.relpath(path, puppet_resource_root)
+            )
+            print(f"comparing {path} and {applied_path}")
+            if not os.path.exists(applied_path) or not cmp(
+                path, applied_path, shallow=False
+            ):
+                # This is idempotent; if something gets out of sync and a resource gets applied
+                # a second time, kubectl just won't make any changes
+                try:
+                    run(["kubectl", "apply", "-f", path], check=True)
+                    os.makedirs(os.path.dirname(applied_path), exist_ok=True)
+                    copy(path, applied_path)
+                except CalledProcessError:
+                    print(f"There was a problem applying {path}:\n")
+                    print_exc(
+                        file=sys.stdout
+                    )  # keep all messages on the same stream so they're in order
+                    exit_code = 1
+                    continue
+
+    # Loop through all the files that have been previously applied and see
+    # if Puppet has removed any of them
+    for root, dirs, files in os.walk(applied_resource_root):
+        dirs.sort(reverse=True)  # for deleting things, we need to go in reverse order
+        for filename in sorted([f for f in files if f.endswith(".yaml")], reverse=True):
+            path = os.path.join(root, filename)
+            puppet_path = os.path.join(
+                puppet_resource_root, os.path.relpath(path, applied_resource_root)
+            )
+            if not os.path.exists(puppet_path):
+                print(f"Deleting resource {path}...")
+                try:
+                    run(
+                        ["kubectl", "delete", "--ignore-not-found=true", "-f", path],
+                        check=True,
+                    )
+                    os.remove(path)
+                except CalledProcessError:
+                    print(f"There was a problem deleting {path}:\n")
+                    print_exc(
+                        file=sys.stdout
+                    )  # keep all messages on the same stream so they're in order
+                    exit_code = 1
+                    continue
+
+    return exit_code
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1]))
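
The script mirrors each successfully applied manifest into an `.applied/` subtree, so re-runs only invoke kubectl for files Puppet has changed or removed. A hedged invocation sketch; the resource-root path and kubeconfig location are invented:

    # Hypothetical invocation; paths are illustrative. The script reads
    # KUBECONFIG from the environment and takes the resource root as argv[1].
    import os
    import subprocess

    env = dict(os.environ, KUBECONFIG="/etc/kubernetes/admin.conf")  # assumed path
    subprocess.run(
        ["apply_external_resources.py", "/srv/external-resources"],
        env=env,
        check=True,
    )
    # After a clean run, /srv/external-resources/.applied/ mirrors every YAML
    # that `kubectl apply` accepted; removing a YAML from the root causes the
    # next run to `kubectl delete` it and drop the mirrored copy.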

paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py
@@ -0,0 +1,68 @@
+#!python
+import itertools
+import json
+import sys
+from collections import defaultdict
+from datetime import datetime
+
+
+def get_datetime_from_ts(ts):
+    tformat = "%Y-%m-%dT%H:%M:%S.%f"
+    return datetime.strptime(ts, tformat)
+
+
+def get_deploy_durations_from_file(filename):
+    """
+    filename: path to a file to be parsed for datetime data
+    The expected input is a paasta service log for the deploy events.
+    The way I've been fetching them is by running 'internal logreader command' | grep deploy | grep event > filename
+    """
+    with open(filename, "r") as file_object:
+        data = sorted(
+            [json.loads(line.rstrip("\n")) for line in file_object],
+            key=lambda x: get_datetime_from_ts(x["timestamp"]),
+        )
+
+    timedeltas = defaultdict(list)
+    last_time = dict()
+    instance_bitvector = defaultdict(bool)  # defaults to False
+
+    for datum in data:
+        time = get_datetime_from_ts(datum["timestamp"])
+        instance = datum["instance"]
+        if "in progress" in datum["message"] and not instance_bitvector[instance]:
+            instance_bitvector[instance] = True
+            last_time[instance] = time
+        elif "finishing" in datum["message"]:
+            instance_bitvector[instance] = False
+            timedeltas[instance].append(time - last_time[instance])
+
+    return timedeltas
+
+
+def display_bounce_info(timedeltas):
+    """
+    timedeltas: iterable of timedelta objects
+    """
+    std = list(sorted(timedeltas))
+    print("Median time to bounce: {}".format(std[len(std) // 2]))  # integer division: float indices raise TypeError
+    print("10% time to bounce: {}".format(std[len(std) // 10]))
+    print("90% time to bounce: {}".format(std[len(std) * 9 // 10]))
+
+
+def main(filenames):
+    for filename in filenames:
+        print(filename)
+        print("=========================")
+        timedeltas = get_deploy_durations_from_file(filename)
+        for instance, tdlist in timedeltas.items():
+            if tdlist:
+                print("Instance: %s" % instance)
+                display_bounce_info(tdlist)
+        print("Overall:")
+        display_bounce_info(itertools.chain.from_iterable(timedeltas.values()))
+        print("=========================")
+
+
+if __name__ == "__main__":
+    main(filenames=sys.argv[1:])
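
The parser assumes one JSON object per line with at least `timestamp`, `instance`, and `message` fields, pairing the first "in progress" message with the next "finishing" message per instance. A sketch of two input lines and the resulting delta; the field values are invented for illustration:

    # Hypothetical input lines for get_deploy_durations_from_file(); the exact
    # log schema is assumed from the fields the parser reads.
    lines = [
        '{"timestamp": "2024-01-01T00:00:00.000", "instance": "main", "message": "bounce in progress"}',
        '{"timestamp": "2024-01-01T00:02:30.000", "instance": "main", "message": "bounce finishing"}',
    ]
    # These two lines would yield a single timedelta of 150 seconds for "main".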

paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py
@@ -0,0 +1,212 @@
+#!python
+import argparse
+import asyncio
+import logging
+from typing import Type
+
+import pysensu_yelp
+
+from paasta_tools.eks_tools import EksDeploymentConfig
+from paasta_tools.instance import kubernetes as pik
+from paasta_tools.kubernetes_tools import get_kubernetes_app_name
+from paasta_tools.kubernetes_tools import KubeClient
+from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
+from paasta_tools.metrics.metastatus_lib import suffixed_number_value
+from paasta_tools.monitoring_tools import send_event
+from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
+from paasta_tools.utils import DEFAULT_SOA_DIR
+from paasta_tools.utils import list_services
+from paasta_tools.utils import load_system_paasta_config
+from paasta_tools.utils import SystemPaastaConfig
+
+log = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Check all autoscaled services to see if they're at their max_instances. If"
+            " so, send an alert if their utilization is above"
+            " max_instances_alert_threshold."
+        )
+    )
+    parser.add_argument(
+        "-d",
+        "--soa-dir",
+        dest="soa_dir",
+        default=DEFAULT_SOA_DIR,
+        help="Use a different soa config directory",
+    )
+    parser.add_argument(
+        "--dry-run",
+        dest="dry_run",
+        action="store_true",
+        help="Print Sensu alert events instead of sending them",
+    )
+    return parser.parse_args()
+
+
+async def check_max_instances(
+    soa_dir: str,
+    cluster: str,
+    instance_type_class: Type[KubernetesDeploymentConfig],
+    system_paasta_config: SystemPaastaConfig,
+    dry_run: bool = False,
+):
+    kube_client = KubeClient()
+    for service in list_services(soa_dir=soa_dir):
+        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
+        for job_config in service_config.instance_configs(
+            cluster=cluster, instance_type_class=instance_type_class
+        ):
+            instance = job_config.get_instance()
+            if not job_config.get_autoscaling_metric_spec(
+                name=get_kubernetes_app_name(service, instance),
+                cluster=cluster,
+                kube_client=kube_client,
+                namespace=job_config.get_namespace(),
+            ):
+                # Not an instance that uses HPA, don't check.
+                # TODO: should we send status=0 here, in case someone disables autoscaling for their service / changes
+                # to bespoke autoscaler?
+                continue
+
+            if not job_config.get_docker_image():
+                # skip services that haven't been marked for deployment yet.
+                continue
+
+            autoscaling_status = await pik.autoscaling_status(
+                kube_client=kube_client,
+                job_config=job_config,
+                namespace=job_config.get_namespace(),
+            )
+            if autoscaling_status["min_instances"] == -1:
+                log.warning(
+                    f"HPA {job_config.get_sanitised_deployment_name()} not found."
+                )
+                continue
+
+            if (
+                autoscaling_status["min_instances"]
+                == autoscaling_status["max_instances"]
+            ) and "canary" in instance:
+                status = pysensu_yelp.Status.OK
+                output = (
+                    f"Not checking {service}.{instance} as the instance name contains"
+                    ' "canary" and min_instances == max_instances.'
+                )
+            elif (
+                autoscaling_status["desired_replicas"]
+                >= autoscaling_status["max_instances"]
+            ):
+
+                metrics_provider_configs = job_config.get_autoscaling_params()[
+                    "metrics_providers"
+                ]
+
+                status = pysensu_yelp.Status.UNKNOWN
+                output = "how are there no metrics for this thing?"
+
+                # This makes an assumption that the metrics currently used by the HPA are exactly the same order (and
+                # length) as the list of metrics_providers dictionaries. This should generally be true, but between
+                # yelpsoa-configs being pushed and the HPA actually being updated it may not be true. This might cause
+                # spurious alerts, but hopefully the frequency is low. We can add some safeguards if it's a problem.
+                # (E.g. smarter matching between the status dicts and the config dicts, or bailing/not alerting if the
+                # lists aren't the same lengths.)
+                for metric, metrics_provider_config in zip(
+                    autoscaling_status["metrics"], metrics_provider_configs
+                ):
+
+                    setpoint = metrics_provider_config["setpoint"]
+                    threshold = metrics_provider_config.get(
+                        "max_instances_alert_threshold",
+                        setpoint,
+                    )
+
+                    try:
+                        current_value = suffixed_number_value(metric["current_value"])
+                        target_value = suffixed_number_value(metric["target_value"])
+                    except KeyError:
+                        # we likely couldn't find values for the current metric from autoscaling status
+                        # if this is the only metric, we will return UNKNOWN+this error
+                        # suggest fixing their autoscaling config
+                        output = f'{service}.{instance}: Service is at max_instances, and there is an error fetching your {metrics_provider_config["type"]} metric. Check your autoscaling configs or reach out to #paasta.'
+                    else:
+                        # target_value can be 100*setpoint (for cpu), 1 (for uwsgi, piscina, gunicorn,
+                        # active_requests), or setpoint (for promql).
+                        # Here we divide current_value by target_value to find the ratio of utilization to setpoint,
+                        # and then multiply by setpoint to find the actual utilization in the same units as setpoint.
+                        utilization = setpoint * current_value / target_value
+
+                        if threshold == setpoint:
+                            threshold_description = f"setpoint ({threshold})"
+                        else:
+                            threshold_description = (
+                                f"max_instances_alert_threshold ({threshold})"
+                            )
+
+                        if utilization > threshold:
+                            status = pysensu_yelp.Status.CRITICAL
+                            output = (
+                                f"{service}.{instance}: Service is at max_instances, and"
+                                f" utilization ({utilization}) is greater than"
+                                f" {threshold_description}."
+                            )
+                        else:
+                            status = pysensu_yelp.Status.OK
+                            output = (
+                                f"{service}.{instance}: Service is at max_instances, but"
+                                f" utilization ({utilization}) is less than"
+                                f" {threshold_description}."
+                            )
+            else:
+                status = pysensu_yelp.Status.OK
+                output = f"{service}.{instance} is below max_instances."
+
+            monitoring_overrides = job_config.get_monitoring()
+            monitoring_overrides.update(
+                {
+                    "page": False,  # TODO: remove this line once this alert has been deployed for a little while.
+                    "runbook": "y/check-autoscaler-max-instances",
+                    "realert_every": 60,  # The check runs once a minute, so this would realert every hour.
+                    "tip": (
+                        "The autoscaler wants to scale up to handle additional load"
+                        " because your service is overloaded, but cannot scale any"
+                        " higher because of max_instances. You may want to bump"
+                        " max_instances. To make this alert quieter, adjust"
+                        " autoscaling.metrics_providers[n].max_instances_alert_threshold in yelpsoa-configs."
+                    ),
+                }
+            )
+            send_event(
+                service,
+                check_name=f"check_autoscaler_max_instances.{service}.{instance}",
+                overrides=monitoring_overrides,
+                status=status,
+                output=output,
+                soa_dir=soa_dir,
+                ttl=None,
+                cluster=cluster,
+                system_paasta_config=system_paasta_config,
+                dry_run=dry_run,
+            )
+
+
+def main():
+    args = parse_args()
+    system_paasta_config = load_system_paasta_config()
+
+    for instance_type_class in [KubernetesDeploymentConfig, EksDeploymentConfig]:
+        asyncio.run(
+            check_max_instances(
+                soa_dir=args.soa_dir,
+                cluster=system_paasta_config.get_cluster(),
+                instance_type_class=instance_type_class,
+                system_paasta_config=system_paasta_config,
+                dry_run=args.dry_run,
+            )
+        )
+
+
+if __name__ == "__main__":
+    main()
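
The normalization in the `else:` branch converts the HPA's current/target pair back into setpoint units before comparing against the alert threshold. A worked example with invented numbers, following the file's own comment that for cpu metrics the target is 100 * setpoint:

    # Invented numbers illustrating the utilization arithmetic above.
    setpoint = 0.8        # autoscaling setpoint from yelpsoa-configs
    current_value = 92.0  # HPA-reported CPU percentage
    target_value = 80.0   # 100 * setpoint for the cpu metrics provider

    utilization = setpoint * current_value / target_value  # -> 0.92
    threshold = 0.9       # hypothetical max_instances_alert_threshold
    assert utilization > threshold  # so this instance would page CRITICAL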

paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py
@@ -0,0 +1,35 @@
+#!python
+# Copyright 2015-2019 Yelp Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage: ./check_cassandracluster_services_replication.py [options]
+"""
+import logging
+
+from paasta_tools import cassandracluster_tools
+from paasta_tools.check_kubernetes_services_replication import (
+    check_kubernetes_pod_replication,
+)
+from paasta_tools.check_services_replication_tools import main
+
+
+log = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":
+    main(
+        cassandracluster_tools.CassandraClusterDeploymentConfig,
+        check_kubernetes_pod_replication,
+        namespace="paasta-cassandraclusters",
+    )
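
This file is thin wiring: the shared runner in `check_services_replication_tools.main` takes an instance-config class, a check callback, and a namespace, so each instance type only needs a stub like the one above. A hedged sketch of the same pattern for a hypothetical instance type:

    # Hypothetical wrapper following the same pattern; "examplecluster" and
    # its tools module do not exist in the package.
    from paasta_tools import examplecluster_tools
    from paasta_tools.check_kubernetes_services_replication import (
        check_kubernetes_pod_replication,
    )
    from paasta_tools.check_services_replication_tools import main

    if __name__ == "__main__":
        main(
            examplecluster_tools.ExampleClusterDeploymentConfig,
            check_kubernetes_pod_replication,
            namespace="paasta-exampleclusters",
        )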

paasta_tools-1.21.3.data/scripts/check_flink_services_health.py
@@ -0,0 +1,203 @@
+#!python
+# Copyright 2015-2019 Yelp Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage: ./check_flink_services_health.py [options]
+"""
+import datetime
+import logging
+from typing import Dict
+from typing import List
+from typing import Sequence
+from typing import Tuple
+
+import pysensu_yelp
+
+from paasta_tools import flink_tools
+from paasta_tools import flinkeks_tools
+from paasta_tools.check_services_replication_tools import main
+from paasta_tools.check_services_replication_tools import parse_args
+from paasta_tools.flink_tools import FlinkDeploymentConfig
+from paasta_tools.kubernetes_tools import is_pod_ready
+from paasta_tools.kubernetes_tools import V1Pod
+from paasta_tools.monitoring_tools import check_under_replication
+from paasta_tools.monitoring_tools import send_replication_event
+from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
+from paasta_tools.utils import is_under_replicated
+
+
+log = logging.getLogger(__name__)
+
+
+def container_lifetime(
+    pod: V1Pod,
+) -> datetime.timedelta:
+    """Return a time duration for how long the pod is alive"""
+    st = pod.status.start_time
+    return datetime.datetime.now(st.tzinfo) - st
+
+
+def healthy_flink_containers_cnt(si_pods: Sequence[V1Pod], container_type: str) -> int:
+    """Return the count of healthy Flink containers with the given type"""
+    return len(
+        [
+            pod
+            for pod in si_pods
+            if pod.metadata.labels["flink.yelp.com/container-type"] == container_type
+            and is_pod_ready(pod)
+            and container_lifetime(pod).total_seconds() > 60
+        ]
+    )
+
+
+def check_under_registered_taskmanagers(
+    instance_config: FlinkDeploymentConfig,
+    expected_count: int,
+    cr_name: str,
+    is_eks: bool,
+) -> Tuple[bool, str, str]:
+    """Check whether fewer taskmanagers than expected have registered with the
+    jobmanager; returns the result of the check as a boolean, plus human-readable
+    output and description strings for logging or monitoring events.
+    """
+    unhealthy = True
+    if cr_name != "":
+        try:
+            overview = flink_tools.get_flink_jobmanager_overview(
+                cr_name, instance_config.cluster, is_eks
+            )
+            num_reported = overview.get("taskmanagers", 0)
+            crit_threshold = instance_config.get_replication_crit_percentage()
+            output = (
+                f"{instance_config.job_id} has {num_reported}/{expected_count} "
+                f"taskmanagers reported by dashboard (threshold: {crit_threshold}%)"
+            )
+            unhealthy, _ = is_under_replicated(
+                num_reported, expected_count, crit_threshold
+            )
+        except ValueError as e:
+            output = (
+                f"Dashboard of service {instance_config.job_id} is not available ({e})"
+            )
+    else:
+        output = f"Dashboard of service {instance_config.job_id} is not available"
+    if unhealthy:
+        description = f"""
+This alert means that the Flink dashboard is not reporting the expected
+number of taskmanagers.
+
+Reasons this might be happening:
+
+  The service may simply be unhealthy. There also may not be enough resources
+  in the cluster to support the requested instance count.
+
+Things you can do:
+
+  * Fix the cause of the unhealthy service. Try running:
+
+      paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv
+
+"""
+    else:
+        description = f"{instance_config.job_id} taskmanager is available"
+    return unhealthy, output, description
+
+
+def get_cr_name(si_pods: Sequence[V1Pod]) -> str:
+    """Returns the Flink custom resource name based on the pod name. We are randomly choosing a jobmanager pod here.
+    This change is related to FLINK-3129
+    """
+    jobmanager_pod = [
+        pod
+        for pod in si_pods
+        if pod.metadata.labels["flink.yelp.com/container-type"] == "jobmanager"
+        and is_pod_ready(pod)
+        and container_lifetime(pod).total_seconds() > 60
+    ]
+    if len(jobmanager_pod) == 1:
+        return jobmanager_pod[0].metadata.name.split("-jobmanager-")[0]
+    else:
+        return ""
+
+
+def check_flink_service_health(
+    instance_config: FlinkDeploymentConfig,
+    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
+    replication_checker: KubeSmartstackEnvoyReplicationChecker,
+    dry_run: bool = False,
+) -> None:
+    si_pods = pods_by_service_instance.get(instance_config.service, {}).get(
+        instance_config.instance, []
+    )
+    taskmanagers_expected_cnt = instance_config.config_dict.get(
+        "taskmanager", {"instances": 10}
+    ).get("instances", 10)
+    num_healthy_supervisors = healthy_flink_containers_cnt(si_pods, "supervisor")
+    num_healthy_jobmanagers = healthy_flink_containers_cnt(si_pods, "jobmanager")
+    num_healthy_taskmanagers = healthy_flink_containers_cnt(si_pods, "taskmanager")
+
+    service_cr_name = get_cr_name(si_pods)
+
+    results = [
+        check_under_replication(
+            instance_config=instance_config,
+            expected_count=1,
+            num_available=num_healthy_supervisors,
+            sub_component="supervisor",
+        ),
+        check_under_replication(
+            instance_config=instance_config,
+            expected_count=1,
+            num_available=num_healthy_jobmanagers,
+            sub_component="jobmanager",
+        ),
+        check_under_replication(
+            instance_config=instance_config,
+            expected_count=taskmanagers_expected_cnt,
+            num_available=num_healthy_taskmanagers,
+            sub_component="taskmanager",
+        ),
+        check_under_registered_taskmanagers(
+            instance_config=instance_config,
+            expected_count=taskmanagers_expected_cnt,
+            cr_name=service_cr_name,
+            is_eks=isinstance(instance_config, flinkeks_tools.FlinkEksDeploymentConfig),
+        ),
+    ]
+    output = ", ".join([r[1] for r in results])
+    description = "\n########\n".join([r[2] for r in results])
+    if any(r[0] for r in results):
+        log.error(output)
+        status = pysensu_yelp.Status.CRITICAL
+    else:
+        log.info(output)
+        status = pysensu_yelp.Status.OK
+    send_replication_event(
+        instance_config=instance_config,
+        status=status,
+        output=output,
+        description=description,
+        dry_run=dry_run,
+    )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(
+        instance_type_class=flinkeks_tools.FlinkEksDeploymentConfig
+        if args.eks
+        else flink_tools.FlinkDeploymentConfig,
+        check_service_replication=check_flink_service_health,
+        namespace="paasta-flinks",
+    )
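
The registered-taskmanager check reduces to a percentage comparison: `is_under_replicated(num_reported, expected_count, crit_threshold)` flags the instance when the reported fraction falls below `get_replication_crit_percentage()`, as the "(threshold: {crit_threshold}%)" output string suggests. A worked example mirroring that comparison with invented numbers (the real helper lives in paasta_tools.utils):

    # Invented numbers; mirrors the percentage test behind is_under_replicated().
    num_reported = 7       # taskmanagers the Flink dashboard reports
    expected_count = 10    # taskmanager "instances" from the service config
    crit_threshold = 80    # get_replication_crit_percentage() result, assumed

    ratio = num_reported / expected_count * 100  # 70.0%
    unhealthy = ratio < crit_threshold           # True -> Sensu CRITICAL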