paasta_tools-1.21.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0

paasta_tools/monitoring_tools.py
@@ -0,0 +1,652 @@
+#!/usr/bin/env python
+# Copyright 2015-2016 Yelp Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Getters for deriving monitoring parameters for mesos-deployed stuff.
+This leaves a place for sane defaults that might change depending
+on the framework that is asking, and still allows you to set your team
+*once* for a service in the general config.
+
+Everything in here is private, and you shouldn't worry about it.
+"""
+import abc
+import json
+import logging
+import os
+from typing import Dict
+from typing import Mapping
+from typing import Optional
+from typing import Tuple
+
+import pysensu_yelp
+import service_configuration_lib
+
+from paasta_tools.long_running_service_tools import LongRunningServiceConfig
+from paasta_tools.utils import _log
+from paasta_tools.utils import DEFAULT_SOA_DIR
+from paasta_tools.utils import is_under_replicated
+from paasta_tools.utils import load_system_paasta_config
+from paasta_tools.utils import PaastaNotConfiguredError
+from paasta_tools.utils import time_cache
+
+
+class ReplicationChecker(abc.ABC):
+    @abc.abstractmethod
+    def get_replication_for_instance(
+        self, instance_config: LongRunningServiceConfig
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        ...
+
+
+try:
+    import yelp_meteorite
+except ImportError:
+    yelp_meteorite = None
+
+
+DEFAULT_REPLICATION_RUNBOOK = "y/unhealthy-paasta-instances"
+
+log = logging.getLogger(__name__)
+
+
+def monitoring_defaults(key):
+    defaults = {
+        "runbook": 'Please set a `runbook` field in your monitoring.yaml. Like "y/rb-mesos". Docs: '
+        "https://paasta.readthedocs.io/en/latest/yelpsoa_configs.html#monitoring-yaml",
+        "tip": "Please set a `tip` field in your monitoring.yaml. Docs: "
+        "https://paasta.readthedocs.io/en/latest/yelpsoa_configs.html#monitoring-yaml",
+        "ticket": False,
+        "project": None,
+        "realert_every": -1,
+        "tags": [],
+    }
+    return defaults.get(key, None)
+
+
+def get_team(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("team", overrides, service, soa_dir)
+
+
+def get_runbook(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("runbook", overrides, service, soa_dir)
+
+
+def get_tip(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("tip", overrides, service, soa_dir)
+
+
+def get_notification_email(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value(
+        "notification_email", overrides, service, soa_dir
+    )
+
+
+def get_page(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("page", overrides, service, soa_dir)
+
+
+def get_alert_after(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("alert_after", overrides, service, soa_dir)
+
+
+def get_realert_every(
+    overrides, service, soa_dir=DEFAULT_SOA_DIR, monitoring_defaults=monitoring_defaults
+):
+    return __get_monitoring_config_value(
+        "realert_every",
+        overrides=overrides,
+        service=service,
+        soa_dir=soa_dir,
+        monitoring_defaults=monitoring_defaults,
+    )
+
+
+def get_check_every(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("check_every", overrides, service, soa_dir)
+
+
+def get_irc_channels(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("irc_channels", overrides, service, soa_dir)
+
+
+def get_slack_channels(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("slack_channels", overrides, service, soa_dir)
+
+
+def get_dependencies(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("dependencies", overrides, service, soa_dir)
+
+
+def get_ticket(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("ticket", overrides, service, soa_dir)
+
+
+def get_project(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("project", overrides, service, soa_dir)
+
+
+def get_priority(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("priority", overrides, service, soa_dir)
+
+
+def get_tags(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("tags", overrides, service, soa_dir)
+
+
+def get_component(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("component", overrides, service, soa_dir)
+
+
+def get_description(overrides, service, soa_dir=DEFAULT_SOA_DIR):
+    return __get_monitoring_config_value("description", overrides, service, soa_dir)
+
+
+# Our typical usage pattern is that we call all the different get_* functions back to back. Applying a small amount of
+# cache here helps cut down on the number of times we re-parse service.yaml.
+_cached_read_service_configuration = time_cache(ttl=5)(
+    service_configuration_lib.read_service_configuration
+)
+
+
+def __get_monitoring_config_value(
+    key,
+    overrides,
+    service,
+    soa_dir=DEFAULT_SOA_DIR,
+    monitoring_defaults=monitoring_defaults,
+):
+    general_config = _cached_read_service_configuration(service, soa_dir=soa_dir)
+    monitor_config = read_monitoring_config(service, soa_dir=soa_dir)
+    service_default = general_config.get(key, monitoring_defaults(key))
+    service_default = general_config.get("monitoring", {key: service_default}).get(
+        key, service_default
+    )
+    service_default = monitor_config.get(key, service_default)
+    return overrides.get(key, service_default)
+
+
+def get_sensu_team_data(team):
+    """Takes a team and returns the dictionary of Sensu configuration
+    settings for that team. The data is in this format:
+    https://github.com/Yelp/sensu_handlers#teams
+    Returns an empty dictionary if there is nothing to return.
+
+    Not all teams specify all the different types of configuration settings.
+    For example, a team may not specify a `notification_email`. It is up
+    to the caller of this function to handle that case.
+    """
+    global_team_data = _load_sensu_team_data()["team_data"]
+    return global_team_data.get(team, {})
+
+
+def _load_sensu_team_data():
+    try:
+        with open("/etc/sensu/team_data.json") as f:
+            team_data = json.load(f)
+    except IOError:
+        log.warning(
+            "No Sensu Team data (/etc/sensu/team_data.json) available. Using empty defaults"
+        )
+        team_data = {}
+    return team_data
+
+
+def send_event(
+    service,
+    check_name,
+    overrides,
+    status,
+    output,
+    soa_dir,
+    ttl=None,
+    cluster=None,
+    system_paasta_config=None,
+    dry_run=False,
+):
+    """Send an event to sensu via pysensu_yelp with the given information.
+
+    :param service: The service name the event is about
+    :param check_name: The name of the check as it appears in Sensu
+    :param overrides: A dictionary containing overrides for monitoring options
+                      (e.g. notification_email, ticket, page)
+    :param status: The status to emit for this event
+    :param output: The output to emit for this event
+    :param soa_dir: The service directory to read monitoring information from
+    :param ttl: TTL (optional)
+    :param cluster: The cluster name (optional)
+    :param system_paasta_config: A SystemPaastaConfig object representing the system
+    :param dry_run: Print the Sensu event instead of emitting it
+    """
+    # This function assumes the input is a string like "mumble.main"
+    team = get_team(overrides, service, soa_dir)
+    if not team:
+        return
+
+    if system_paasta_config is None:
+        system_paasta_config = load_system_paasta_config()
+    if cluster is None:
+        try:
+            cluster = system_paasta_config.get_cluster()
+        except PaastaNotConfiguredError:
+            cluster = "localhost"
+
+    alert_after = overrides.get("alert_after", "5m")
+    result_dict = {
+        "name": check_name,
+        "runbook": overrides.get("runbook", "http://y/paasta-troubleshooting"),
+        "status": status,
+        "output": output,
+        "team": team,
+        "page": get_page(overrides, service, soa_dir),
+        "tip": get_tip(overrides, service, soa_dir),
+        "notification_email": get_notification_email(overrides, service, soa_dir),
+        "check_every": overrides.get("check_every", "1m"),
+        "realert_every": overrides.get(
+            "realert_every", monitoring_defaults("realert_every")
+        ),
+        "alert_after": f"{alert_after}s"
+        if isinstance(alert_after, int)
+        else alert_after,
+        "irc_channels": get_irc_channels(overrides, service, soa_dir),
+        "slack_channels": get_slack_channels(overrides, service, soa_dir),
+        "ticket": get_ticket(overrides, service, soa_dir),
+        "project": get_project(overrides, service, soa_dir),
+        "priority": get_priority(overrides, service, soa_dir),
+        "source": "paasta-%s" % cluster,
+        "tags": get_tags(overrides, service, soa_dir),
+        "ttl": ttl,
+        "sensu_host": system_paasta_config.get_sensu_host(),
+        "sensu_port": system_paasta_config.get_sensu_port(),
+        "component": get_component(overrides, service, soa_dir),
+        "description": get_description(overrides, service, soa_dir),
+    }
+
+    if dry_run:
+        if status == pysensu_yelp.Status.OK:
+            print(f"Would've sent an OK event for check '{check_name}'")
+        else:
+            from pprint import pprint  # only import during testing
+
+            print(f"Would've sent the following alert for check '{check_name}':")
+            pprint(result_dict)
+
+    elif result_dict.get("sensu_host"):
+        pysensu_yelp.send_event(**result_dict)
+
+
+@time_cache(ttl=5)
+def read_monitoring_config(service, soa_dir=DEFAULT_SOA_DIR):
+    """Read a service's monitoring.yaml file.
+
+    :param service: The service name
+    :param soa_dir: The SOA configuration directory to read from
+    :returns: A dictionary of whatever was in soa_dir/name/monitoring.yaml"""
+    rootdir = os.path.abspath(soa_dir)
+    monitoring_file = os.path.join(rootdir, service, "monitoring.yaml")
+    monitor_conf = service_configuration_lib.read_monitoring(monitoring_file)
+    return monitor_conf
+
+
+def list_teams():
+    """Loads team data from the system. Returns a set of team names (or empty
+    set).
+    """
+    team_data = _load_sensu_team_data()
+    teams = set(team_data.get("team_data", {}).keys())
+    return teams
+
+
+def send_replication_event(
+    instance_config,
+    status,
+    output,
+    description,
+    dry_run=False,
+):
+    """Send an event to sensu via pysensu_yelp with the given information.
+
+    :param instance_config: an instance of LongRunningServiceConfig
+    :param status: The status to emit for this event
+    :param output: The output to emit for this event
+    :param dry_run: Print the event instead of emitting it
+    """
+    # This function assumes the input is a string like "mumble.main"
+    monitoring_overrides = instance_config.get_monitoring()
+    if "alert_after" not in monitoring_overrides:
+        monitoring_overrides["alert_after"] = "2m"
+    monitoring_overrides["check_every"] = "1m"
+    monitoring_overrides["runbook"] = __get_monitoring_config_value(
+        "runbook",
+        monitoring_overrides,
+        instance_config.service,
+        soa_dir=instance_config.soa_dir,
+        monitoring_defaults=lambda _: DEFAULT_REPLICATION_RUNBOOK,
+    )
+    monitoring_overrides["tip"] = __get_monitoring_config_value(
+        "tip",
+        monitoring_overrides,
+        instance_config.service,
+        soa_dir=instance_config.soa_dir,
+        monitoring_defaults=lambda _: (
+            f"Check the instance with: `paasta status -s {instance_config.service} "
+            f"-i {instance_config.instance} -c {instance_config.cluster} -vv`"
+        ),
+    )
+    monitoring_overrides["description"] = description
+
+    check_name = "check_paasta_services_replication.%s" % instance_config.job_id
+    send_event(
+        service=instance_config.service,
+        check_name=check_name,
+        overrides=monitoring_overrides,
+        status=status,
+        output=output,
+        soa_dir=instance_config.soa_dir,
+        cluster=instance_config.cluster,
+        dry_run=dry_run,
+    )
+    _log(
+        service=instance_config.service,
+        line="Replication: %s" % output,
+        component="monitoring",
+        level="debug",
+        cluster=instance_config.cluster,
+        instance=instance_config.instance,
+    )
+
+
+def emit_replication_metrics(
+    replication_infos: Mapping[str, Mapping[str, Mapping[str, int]]],
+    instance_config: LongRunningServiceConfig,
+    expected_count: int,
+    dry_run: bool = False,
+) -> None:
+    for provider, replication_info in replication_infos.items():
+        meteorite_dims = {
+            "paasta_service": instance_config.service,
+            "paasta_cluster": instance_config.cluster,
+            "paasta_instance": instance_config.instance,
+            "paasta_pool": instance_config.get_pool(),
+            "service_discovery_provider": provider,
+        }
+
+        num_available_backends = 0
+        for available_backends in replication_info.values():
+            num_available_backends += available_backends.get(instance_config.job_id, 0)
+        available_backends_metric = "paasta.service.available_backends"
+        if dry_run:
+            print(
+                f"Would've sent value {num_available_backends} for metric '{available_backends_metric}'"
+            )
+        else:
+            available_backends_gauge = yelp_meteorite.create_gauge(
+                available_backends_metric, meteorite_dims
+            )
+            available_backends_gauge.set(num_available_backends)
+
+        critical_percentage = instance_config.get_replication_crit_percentage()
+        num_critical_backends = critical_percentage * expected_count / 100.0
+        critical_backends_metric = "paasta.service.critical_backends"
+        if dry_run:
+            print(
+                f"Would've sent value {num_critical_backends} for metric '{critical_backends_metric}'"
+            )
+        else:
+            critical_backends_gauge = yelp_meteorite.create_gauge(
+                critical_backends_metric, meteorite_dims
+            )
+            critical_backends_gauge.set(num_critical_backends)
+
+        expected_backends_metric = "paasta.service.expected_backends"
+        if dry_run:
+            print(
+                f"Would've sent value {expected_count} for metric '{expected_backends_metric}'"
+            )
+        else:
+            expected_backends_gauge = yelp_meteorite.create_gauge(
+                "paasta.service.expected_backends", meteorite_dims
+            )
+            expected_backends_gauge.set(expected_count)
+
+
+def check_replication_for_instance(
+    instance_config: LongRunningServiceConfig,
+    expected_count: int,
+    replication_checker: ReplicationChecker,
+    dry_run: bool = False,
+) -> bool:
+    """Check a set of namespaces to see if their number of available backends is too low,
+    emitting events to Sensu based on the fraction available and the thresholds defined in
+    the corresponding yelpsoa config.
+
+    :param instance_config: an instance of LongRunningServiceConfig
+    :param replication_checker: an instance of ReplicationChecker
+    :param dry_run: Print Sensu event and metrics instead of emitting them
+    """
+
+    crit_threshold = instance_config.get_replication_crit_percentage()
+
+    log.info(
+        "Checking instance %s in service discovery providers", instance_config.job_id
+    )
+    replication_infos = replication_checker.get_replication_for_instance(
+        instance_config
+    )
+
+    log.debug(f"Got replication info for {instance_config.job_id}: {replication_infos}")
+    if yelp_meteorite is not None:
+        emit_replication_metrics(
+            replication_infos,
+            instance_config,
+            expected_count,
+            dry_run=dry_run,
+        )
+
+    service_is_under_replicated = False
+    failed_service_discovery_providers = set()
+    for service_discovery_provider, replication_info in replication_infos.items():
+        if len(replication_info) == 0:
+            output = (
+                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
+            ) % (instance_config.job_id, service_discovery_provider)
+            log.error(output)
+            service_is_under_replicated = True
+            failed_service_discovery_providers.add(service_discovery_provider)
+        else:
+            expected_count_per_location = int(expected_count / len(replication_info))
+            output_critical = []
+            output_ok = []
+            under_replication_per_location = []
+
+            for location, available_backends in sorted(replication_info.items()):
+                num_available_in_location = available_backends.get(
+                    instance_config.job_id, 0
+                )
+                under_replicated, ratio = is_under_replicated(
+                    num_available_in_location,
+                    expected_count_per_location,
+                    crit_threshold,
+                )
+                if under_replicated:
+                    output_critical.append(
+                        "{} has {}/{} replicas in {} according to {} (CRITICAL: {}%)\n".format(
+                            instance_config.job_id,
+                            num_available_in_location,
+                            expected_count_per_location,
+                            location,
+                            service_discovery_provider,
+                            ratio,
+                        )
+                    )
+                    failed_service_discovery_providers.add(service_discovery_provider)
+                else:
+                    output_ok.append(
+                        "{} has {}/{} replicas in {} according to {} (OK: {}%)\n".format(
+                            instance_config.job_id,
+                            num_available_in_location,
+                            expected_count_per_location,
+                            location,
+                            service_discovery_provider,
+                            ratio,
+                        )
+                    )
+                under_replication_per_location.append(under_replicated)
+
+            output = ", ".join(output_critical)
+            if output_critical and output_ok:
+                output += ". The following locations are OK: "
+                output += ", ".join(output_ok)
+
+            service_is_under_replicated_anywhere = any(under_replication_per_location)
+            service_is_under_replicated |= service_is_under_replicated_anywhere
+            if service_is_under_replicated_anywhere:
+                log.error(output)
+            else:
+                log.info(output)
+
+    if service_is_under_replicated:
+        failed_service_discovery_providers_list = ",".join(
+            failed_service_discovery_providers
+        )
+        description = (
+            "This replication alert means that a {service_discovery_provider} powered loadbalancer\n"
+            "doesn't have enough healthy backends. Not having enough healthy backends\n"
+            "means that clients of that service will get 503s (http) or connection refused\n"
+            "(tcp) when trying to connect to it.\n"
+            "\n"
+            "Reasons this might be happening:\n"
+            "\n"
+            "  The service may simply not have enough copies or it could simply be\n"
+            "  unhealthy in that location. There also may not be enough resources\n"
+            "  in the cluster to support the requested instance count.\n"
+            "\n"
+            "Things you can do:\n"
+            "\n"
+            "  * You can view the logs for the job with:\n"
+            "      paasta logs -s {service} -i {instance} -c {cluster}\n"
+            "\n"
+            "  * Fix the cause of the unhealthy service. Try running:\n"
+            "\n"
+            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
+            "\n"
+            "  * Widen {service_discovery_provider} discovery settings\n"
+            "  * Increase the instance count\n"
+            "\n"
+        ).format(
+            service=instance_config.service,
+            instance=instance_config.instance,
+            cluster=instance_config.cluster,
+            service_discovery_provider=failed_service_discovery_providers_list,
+        )
+        status = pysensu_yelp.Status.CRITICAL
+    else:
+        description = (
+            "{} is well-replicated because it has over {}% of its "
+            "expected replicas up."
+        ).format(instance_config.job_id, crit_threshold)
+        status = pysensu_yelp.Status.OK
+
+    send_replication_event(
+        instance_config=instance_config,
+        status=status,
+        output=output,
+        description=description,
+        dry_run=dry_run,
+    )
+    return not service_is_under_replicated
+
+
+def check_under_replication(
+    instance_config: LongRunningServiceConfig,
+    expected_count: int,
+    num_available: int,
+    sub_component: Optional[str] = None,
+) -> Tuple[bool, str, str]:
+    """Check if a component/sub_component is under-replicated and returns both the result of the check in the form of a
+    boolean and a human-readable text to be used in logging or monitoring events.
+    """
+    crit_threshold = instance_config.get_replication_crit_percentage()
+
+    # Keep output short, with rest of context in description. This is because
+    # by default, Slack-Sensu messages have a 400 char limit, incl. the output.
+    # If it is too long, the runbook and tip won't show up.
+    if sub_component is not None:
+        output = ("{} has {}/{} replicas of {} available (threshold: {}%)").format(
+            instance_config.job_id,
+            num_available,
+            expected_count,
+            sub_component,
+            crit_threshold,
+        )
+    else:
+        output = ("{} has {}/{} replicas available (threshold: {}%)").format(
+            instance_config.job_id, num_available, expected_count, crit_threshold
+        )
+
+    under_replicated, _ = is_under_replicated(
+        num_available, expected_count, crit_threshold
+    )
+    if under_replicated:
+        description = (
+            "This replication alert means that PaaSTA can't keep the\n"
+            "requested number of replicas up and healthy in the cluster for "
+            "the instance {service}.{instance}.\n"
+            "\n"
+            "Reasons this might be happening:\n"
+            "\n"
+            "  The service may simply be unhealthy. There also may not be enough resources\n"
+            "  in the cluster to support the requested instance count.\n"
+            "\n"
+            "Things you can do:\n"
+            "\n"
+            "  * Increase the instance count\n"
+            "  * Fix the cause of the unhealthy service. Try running:\n"
+            "\n"
+            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
+        ).format(
+            service=instance_config.service,
+            instance=instance_config.instance,
+            cluster=instance_config.cluster,
+        )
+    else:
+        description = (
+            "{} is well-replicated because it has over {}% of its "
+            "expected replicas up."
+        ).format(instance_config.job_id, crit_threshold)
+    return under_replicated, output, description
+
+
+def send_replication_event_if_under_replication(
+    instance_config: LongRunningServiceConfig,
+    expected_count: int,
+    num_available: int,
+    sub_component: Optional[str] = None,
+    dry_run: bool = False,
+):
+    under_replicated, output, description = check_under_replication(
+        instance_config, expected_count, num_available, sub_component
+    )
+    if under_replicated:
+        log.error(output)
+        status = pysensu_yelp.Status.CRITICAL
+    else:
+        log.info(output)
+        status = pysensu_yelp.Status.OK
+    send_replication_event(
+        instance_config=instance_config,
+        status=status,
+        output=output,
+        description=description,
+        dry_run=dry_run,
+    )