paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1988 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2016 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""Contains methods used by the paasta client to mark a docker image for
|
|
16
|
+
deployment to a cluster.instance.
|
|
17
|
+
"""
|
|
18
|
+
import argparse
|
|
19
|
+
import asyncio
|
|
20
|
+
import concurrent
|
|
21
|
+
import datetime
|
|
22
|
+
import functools
|
|
23
|
+
import getpass
|
|
24
|
+
import logging
|
|
25
|
+
import math
|
|
26
|
+
import os
|
|
27
|
+
import socket
|
|
28
|
+
import sys
|
|
29
|
+
import time
|
|
30
|
+
import traceback
|
|
31
|
+
from threading import Thread
|
|
32
|
+
from typing import Any
|
|
33
|
+
from typing import Callable
|
|
34
|
+
from typing import Collection
|
|
35
|
+
from typing import Dict
|
|
36
|
+
from typing import Iterator
|
|
37
|
+
from typing import List
|
|
38
|
+
from typing import Mapping
|
|
39
|
+
from typing import Optional
|
|
40
|
+
from typing import Set
|
|
41
|
+
from typing import Tuple
|
|
42
|
+
|
|
43
|
+
import a_sync
|
|
44
|
+
import humanize
|
|
45
|
+
import progressbar
|
|
46
|
+
from service_configuration_lib import read_deploy
|
|
47
|
+
from slackclient import SlackClient
|
|
48
|
+
from sticht import state_machine
|
|
49
|
+
from sticht.rollbacks.base import RollbackSlackDeploymentProcess
|
|
50
|
+
from sticht.rollbacks.slo import SLOWatcher
|
|
51
|
+
from sticht.rollbacks.types import MetricWatcher
|
|
52
|
+
from sticht.rollbacks.types import SplunkAuth
|
|
53
|
+
|
|
54
|
+
from paasta_tools import remote_git
|
|
55
|
+
from paasta_tools.api import client
|
|
56
|
+
from paasta_tools.cassandracluster_tools import CassandraClusterDeploymentConfig
|
|
57
|
+
from paasta_tools.cli.cmds.push_to_registry import is_docker_image_already_in_registry
|
|
58
|
+
from paasta_tools.cli.cmds.status import get_main_container
|
|
59
|
+
from paasta_tools.cli.cmds.status import get_version_table_entry
|
|
60
|
+
from paasta_tools.cli.cmds.status import recent_container_restart
|
|
61
|
+
from paasta_tools.cli.utils import get_jenkins_build_output_url
|
|
62
|
+
from paasta_tools.cli.utils import get_paasta_oapi_api_clustername
|
|
63
|
+
from paasta_tools.cli.utils import lazy_choices_completer
|
|
64
|
+
from paasta_tools.cli.utils import list_deploy_groups
|
|
65
|
+
from paasta_tools.cli.utils import trigger_deploys
|
|
66
|
+
from paasta_tools.cli.utils import validate_git_sha
|
|
67
|
+
from paasta_tools.cli.utils import validate_given_deploy_groups
|
|
68
|
+
from paasta_tools.cli.utils import validate_service_name
|
|
69
|
+
from paasta_tools.cli.utils import validate_short_git_sha
|
|
70
|
+
from paasta_tools.deployment_utils import get_currently_deployed_sha
|
|
71
|
+
from paasta_tools.deployment_utils import get_currently_deployed_version
|
|
72
|
+
from paasta_tools.eks_tools import EksDeploymentConfig
|
|
73
|
+
from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
|
|
74
|
+
from paasta_tools.long_running_service_tools import LongRunningServiceConfig
|
|
75
|
+
from paasta_tools.metrics import metrics_lib
|
|
76
|
+
from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
|
|
77
|
+
from paasta_tools.paastaapi.models import InstanceStatusKubernetesV2
|
|
78
|
+
from paasta_tools.paastaapi.models import KubernetesPodV2
|
|
79
|
+
from paasta_tools.slack import get_slack_client
|
|
80
|
+
from paasta_tools.utils import _log
|
|
81
|
+
from paasta_tools.utils import _log_audit
|
|
82
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
83
|
+
from paasta_tools.utils import DeploymentVersion
|
|
84
|
+
from paasta_tools.utils import format_tag
|
|
85
|
+
from paasta_tools.utils import get_files_of_type_in_dir
|
|
86
|
+
from paasta_tools.utils import get_git_url
|
|
87
|
+
from paasta_tools.utils import get_paasta_tag_from_deploy_group
|
|
88
|
+
from paasta_tools.utils import get_username
|
|
89
|
+
from paasta_tools.utils import ldap_user_search
|
|
90
|
+
from paasta_tools.utils import list_services
|
|
91
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
92
|
+
from paasta_tools.utils import PaastaColors
|
|
93
|
+
from paasta_tools.utils import RollbackTypes
|
|
94
|
+
from paasta_tools.utils import TimeoutError
|
|
95
|
+
|
|
96
|
+
# How long `--wait-for-deployment` polls before giving up (overridable via --timeout).
DEFAULT_DEPLOYMENT_TIMEOUT = 3 * 3600  # seconds
# Percent of the timeout at which we emit a "still not done" warning.
DEFAULT_WARN_PERCENT = 17  # ~30min for default timeout
# Wait after a successful deploy before auto-certifying (only when --auto-rollback is on;
# see MarkForDeploymentProcess.get_auto_certify_delay).
DEFAULT_AUTO_CERTIFY_DELAY = 600  # seconds
# Slack channel used for deploy notifications when none is configured.
DEFAULT_SLACK_CHANNEL = "#deploy"
# Runbook link surfaced when a bounce appears stuck.
DEFAULT_STUCK_BOUNCE_RUNBOOK = "y/stuckbounce"


log = logging.getLogger(__name__)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def add_subparser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``mark-for-deployment`` subcommand on the paasta CLI parser.

    Wires up all flags and sets ``paasta_mark_for_deployment`` as the command
    entry point via ``set_defaults``.

    :param subparsers: the subparser action from the top-level paasta argparser.
    """
    list_parser = subparsers.add_parser(
        "mark-for-deployment",
        help="Mark a docker image for deployment in git",
        description=(
            "'paasta mark-for-deployment' uses Git as the control-plane, to "
            "signal to other PaaSTA components that a particular docker image "
            "is ready to be deployed."
        ),
        epilog=(
            "Note: Access and credentials to the Git repo of a service are required "
            "for this command to work."
        ),
    )
    list_parser.add_argument(
        "-u",
        "--git-url",
        help=(
            "Git url for service -- where magic mark-for-deployment tags are pushed. "
            "Defaults to the normal git URL for the service."
        ),
        default=None,
    )
    list_parser.add_argument(
        "-c",
        "-k",
        "--commit",
        help="Git sha to mark for deployment",
        required=True,
        type=validate_short_git_sha,
    )
    list_parser.add_argument(
        "-i",
        "--image-version",
        help="Extra version metadata to mark for deployment",
        required=False,
        default=None,
    )
    # Tab-completion for deploy groups and services is provided lazily so the
    # CLI doesn't pay the soa-configs read cost unless completion is requested.
    arg_deploy_group = list_parser.add_argument(
        "-l",
        "--deploy-group",
        "--clusterinstance",
        help="Mark the service ready for deployment in this deploy group (e.g. "
        "cluster1.canary, cluster2.main). --clusterinstance is deprecated and "
        "should be replaced with --deploy-group",
        required=True,
    )
    arg_deploy_group.completer = lazy_choices_completer(list_deploy_groups)  # type: ignore
    arg_service = list_parser.add_argument(
        "-s",
        "--service",
        help="Name of the service which you wish to mark for deployment. Leading "
        '"services-" will be stripped.',
        required=True,
    )
    arg_service.completer = lazy_choices_completer(list_services)  # type: ignore
    list_parser.add_argument(
        "--verify-image-exists",
        help="Check the docker registry and verify the image has been pushed",
        dest="verify_image",
        action="store_true",
        default=False,
    )
    # dest="block": when set, the command polls the paasta API until the bounce
    # completes (or times out) instead of returning immediately.
    list_parser.add_argument(
        "--wait-for-deployment",
        help="Set to poll paasta and wait for the deployment to finish, "
        "the default strategy is to mark for deployment and exit straightaway",
        dest="block",
        action="store_true",
        default=False,
    )
    list_parser.add_argument(
        "-t",
        "--timeout",
        dest="timeout",
        type=int,
        default=DEFAULT_DEPLOYMENT_TIMEOUT,
        help=(
            "Time in seconds to wait for paasta to deploy the service. "
            "If the timeout is exceeded we return 1. "
            "Default is %(default)s seconds."
        ),
    )
    list_parser.add_argument(
        "-w",
        "--warn",
        dest="warn",
        type=int,
        default=DEFAULT_WARN_PERCENT,
        help=(
            "Percent of timeout to warn at if the deployment hasn't finished. "
            "For example, --warn=75 will warn at 75%% of the timeout. "
            "Defaults to %(default)s."
        ),
    )
    list_parser.add_argument(
        "--auto-rollback",
        help="Automatically roll back to the previously deployed sha if the deployment "
        "times out or is canceled (ctrl-c). Only applicable with --wait-for-deployment. "
        "Defaults to false.",
        dest="auto_rollback",
        action="store_true",
        default=False,
    )
    list_parser.add_argument(
        "-d",
        "--soa-dir",
        dest="soa_dir",
        metavar="SOA_DIR",
        default=DEFAULT_SOA_DIR,
        help="define a different soa config directory",
    )
    list_parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        dest="verbose",
        default=0,
        help="Print out more output.",
    )
    list_parser.add_argument(
        "--auto-certify-delay",
        dest="auto_certify_delay",
        type=int,
        default=None,  # the logic for this is complicated. See MarkForDeploymentProcess.get_auto_certify_delay.
        help="After a deploy finishes, wait this many seconds before automatically certifying."
        f"Default {DEFAULT_AUTO_CERTIFY_DELAY} when --auto-rollback is enabled",
    )
    list_parser.add_argument(
        "--auto-abandon-delay",
        dest="auto_abandon_delay",
        type=int,
        default=600,
        help="After a rollback finishes, wait this many seconds before automatically abandoning.",
    )
    list_parser.add_argument(
        "--auto-rollback-delay",
        dest="auto_rollback_delay",
        type=int,
        default=30,
        help="After noticing an SLO failure, wait this many seconds before automatically rolling back.",
    )
    # action="append" so --author can be given multiple times; all are pinged in Slack.
    list_parser.add_argument(
        "--author",
        dest="authors",
        default=None,
        action="append",
        help="Additional author(s) of the deploy, who will be pinged in Slack",
    )
    list_parser.add_argument(
        "--polling-interval",
        dest="polling_interval",
        type=float,
        default=None,
        help="How long to wait between each time we check to see if an instance is done deploying.",
    )
    list_parser.add_argument(
        "--diagnosis-interval",
        dest="diagnosis_interval",
        type=float,
        default=None,
        help="How long to wait between diagnoses of why the bounce isn't done.",
    )
    list_parser.add_argument(
        "--time-before-first-diagnosis",
        dest="time_before_first_diagnosis",
        type=float,
        default=None,
        help="Wait this long before trying to diagnose why the bounce isn't done.",
    )

    list_parser.set_defaults(command=paasta_mark_for_deployment)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def mark_for_deployment(
    git_url: str,
    deploy_group: str,
    service: str,
    commit: str,
    image_version: Optional[str] = None,
) -> int:
    """Mark a docker image for deployment by force-pushing a magic git tag.

    Retries the push up to three times with linear backoff. On success, logs
    the event and an audit entry.

    :param git_url: git remote to push the deployment tag to.
    :param deploy_group: deploy group whose tag should be updated.
    :param service: service name (used for event/audit logging).
    :param commit: full git sha to mark for deployment.
    :param image_version: optional extra version metadata baked into the tag.
    :returns: 0 on success, 1 if all attempts failed.
    """
    tag = get_paasta_tag_from_deploy_group(
        identifier=deploy_group, desired_state="deploy", image_version=image_version
    )
    remote_tag = format_tag(tag)
    ref_mutator = remote_git.make_force_push_mutate_refs_func(
        targets=[remote_tag], sha=commit
    )

    deployment_version = DeploymentVersion(commit, image_version)
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            remote_git.create_remote_refs(
                git_url=git_url, ref_mutator=ref_mutator, force=True
            )
            # NOTE(review): trigger_deploys failing here causes a retry of the
            # (already successful) push; the push is a force-update of the same
            # ref, so retrying it is idempotent.
            if "yelpcorp.com" in git_url:
                trigger_deploys(service)
        except Exception as e:
            logline = f"Failed to mark {deployment_version} for deployment in deploy group {deploy_group}! (attempt \
{attempt}/{max_attempts}, error: {e}) \n Have you pushed your commit?"
            _log(service=service, line=logline, component="deploy", level="event")
            # Back off only when another attempt remains; previously we also
            # slept (15s) after the final failure, delaying the error exit
            # for no benefit.
            if attempt < max_attempts:
                time.sleep(5 * attempt)
        else:
            logline = f"Marked {deployment_version} for deployment in deploy group {deploy_group}"
            _log(service=service, line=logline, component="deploy", level="event")

            audit_action_details = {
                "deploy_group": deploy_group,
                "commit": commit,
                "image_version": image_version,
            }
            _log_audit(
                action="mark-for-deployment",
                action_details=audit_action_details,
                service=service,
            )

            return 0
    return 1
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def can_user_deploy_service(deploy_info: Dict[str, Any], service: str) -> bool:
    """Check whether the current user is authorized to deploy ``service``.

    Authorization rules, in order:
      * a ``batch`` user SSHing in from a host whose name contains "tron" is
        always allowed (so scheduled Tron jobs can run paasta stop/start/restart);
      * if the service's ``allowed_push_groups`` (or, failing that, the
        system-wide default push groups) is set, the user must appear in at
        least one of those LDAP groups;
      * if no push groups are configured at all, everyone is allowed.

    A denial is written to the deploy log and echoed to stderr.
    """
    deploy_username = get_username()

    # Tronjobs can run paasta stop/start/restart
    ssh_client_env = os.environ.get("SSH_CLIENT")
    if ssh_client_env and deploy_username == "batch":
        # SSH_CLIENT's first whitespace-separated field is the client IP;
        # reverse-resolve it to decide whether the caller is a Tron box.
        ssh_client = ssh_client_env.split()[0]
        hostname = socket.gethostbyaddr(ssh_client)[0]

        if "tron" in hostname:
            return True

    system_paasta_config = load_system_paasta_config()
    # A service-level allowed_push_groups setting wins over the system default.
    allowed_groups = (
        deploy_info["allowed_push_groups"]
        if deploy_info.get("allowed_push_groups") is not None
        else system_paasta_config.get_default_push_groups()
    )
    if allowed_groups is not None:
        search_base = system_paasta_config.get_ldap_search_base()
        search_ou = system_paasta_config.get_ldap_search_ou()
        host = system_paasta_config.get_ldap_host()
        ldap_username = system_paasta_config.get_ldap_reader_username()
        ldap_password = system_paasta_config.get_ldap_reader_password()
        # Membership in any single group is sufficient.
        if not any(
            [
                deploy_username
                in ldap_user_search(
                    group, search_base, search_ou, host, ldap_username, ldap_password
                )
                for group in allowed_groups
            ]
        ):
            logline = f"current user is not authorized to perform this action (should be in one of {allowed_groups})"
            _log(service=service, line=logline, component="deploy", level="event")
            print(logline, file=sys.stderr)
            return False
    return True
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def can_run_metric_watcher_threads(
    service: str,
    soa_dir: str,
) -> bool:
    """Decide whether metric watcher threads may be started.

    SLO and metric watcher threads cannot run together for now, and SLO
    watchers take precedence: metric watchers are only allowed when the
    service has rollback config files but no SLO files.
    """
    has_slo_files = bool(
        get_files_of_type_in_dir(file_type="slo", service=service, soa_dir=soa_dir)
    )
    has_rollback_files = bool(
        get_files_of_type_in_dir(file_type="rollback", service=service, soa_dir=soa_dir)
    )
    return has_rollback_files and not has_slo_files
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def report_waiting_aborted(service: str, deploy_group: str) -> None:
    """Tell the operator that we stopped waiting; the deploy itself continues."""
    aborted_banner = PaastaColors.red(
        "Waiting for deployment aborted."
        " PaaSTA will continue trying to deploy this code."
    )
    for output_line in (
        aborted_banner,
        "If you wish to see the status, run:",
        "",
        f"    paasta status -s {service} -l {deploy_group} -v",
        "",
    ):
        print(output_line)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def get_authors_to_be_notified(
    git_url: str, from_sha: str, to_sha: str, authors: Optional[Collection[str]]
) -> str:
    """Build the Slack @-mention string for the authors of a push.

    If ``authors`` is provided it is used verbatim. Otherwise, for gitolite
    (git.yelpcorp.com) repos, authors are computed from the commit range
    ``from_sha..to_sha``. Returns "" when there is no base SHA or authors
    cannot be determined, and an inline error string when the remote author
    lookup itself fails.
    """
    if from_sha is None:
        return ""

    if authors:
        authors_to_notify = authors
    elif "git.yelpcorp.com" in git_url:
        ret, git_authors = remote_git.get_authors(
            git_url=git_url, from_sha=from_sha, to_sha=to_sha
        )
        if ret == 0:
            authors_to_notify = git_authors.split()
        else:
            # Non-zero return: git_authors holds the error output instead.
            return f"(Could not get authors: {git_authors})"
    else:
        # We have no way of getting authors on the fly if the repository is not on gitolite
        return ""

    # Set-comprehension dedupes authors before joining into one mention string.
    slacky_authors = ", ".join({f"<@{a}>" for a in authors_to_notify})
    log.debug(f"Authors: {slacky_authors}")
    return f"^ {slacky_authors}"
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def deploy_group_is_set_to_notify(
    deploy_info: Dict[str, Any], deploy_group: str, notify_type: str
) -> bool:
    """Check whether the pipeline step for ``deploy_group`` opts into notifications.

    The step's specific ``notify_type`` flag wins; otherwise its generic
    ``slack_notify`` flag is used. Only the first matching step counts, and
    an unknown deploy group never notifies.
    """
    matching_step = next(
        (
            step
            for step in deploy_info.get("pipeline", [])
            if step.get("step", "") == deploy_group
        ),
        None,
    )
    if matching_step is None:
        return False
    return matching_step.get(notify_type, matching_step.get("slack_notify", False))
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def get_deploy_info(service: str, soa_dir: str) -> Dict[str, Any]:
    """Load and return the parsed ``deploy.yaml`` for ``service``."""
    deploy_yaml_path = os.path.join(soa_dir, service, "deploy.yaml")
    return read_deploy(deploy_yaml_path)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def paasta_mark_for_deployment(args: argparse.Namespace) -> int:
    """CLI entry point wrapping :func:`mark_for_deployment`.

    Validates the service/deploy-group/SHA arguments, checks authorization,
    optionally verifies the docker image exists in the registry, then runs
    the interactive :class:`MarkForDeploymentProcess` state machine, timing
    the deploy when we will actually wait on it.

    :returns: process exit status (0 on success)
    """
    if args.verbose:
        log.setLevel(level=logging.DEBUG)
    else:
        log.setLevel(level=logging.INFO)

    # Accept "services-foo" (Jenkins-style job names) as an alias for "foo".
    service = args.service
    if service and service.startswith("services-"):
        service = service.split("services-", 1)[1]
    validate_service_name(service, soa_dir=args.soa_dir)

    deploy_group = args.deploy_group
    in_use_deploy_groups = list_deploy_groups(service=service, soa_dir=args.soa_dir)
    _, invalid_deploy_groups = validate_given_deploy_groups(
        in_use_deploy_groups, [deploy_group]
    )

    # Warn (but don't fail) when the deploy group isn't referenced anywhere —
    # marking before the group exists is legal, but usually a typo.
    if len(invalid_deploy_groups) == 1:
        print(
            PaastaColors.red(
                "ERROR: These deploy groups are not currently used anywhere: %s.\n"
                % (",").join(invalid_deploy_groups)
            )
        )
        print(
            PaastaColors.red(
                "This isn't technically wrong because you can mark-for-deployment before deploying there"
            )
        )
        print(
            PaastaColors.red(
                "but this is probably a typo. Did you mean one of these in-use deploy groups?:"
            )
        )
        print(PaastaColors.red("  %s" % (",").join(in_use_deploy_groups)))
        print()
        print(PaastaColors.red("Continuing regardless..."))

    if args.git_url is None:
        args.git_url = get_git_url(service=service, soa_dir=args.soa_dir)

    commit = validate_git_sha(sha=args.commit, git_url=args.git_url)
    deployment_version = DeploymentVersion(commit, args.image_version)

    old_deployment_version = get_currently_deployed_version(
        service=service, deploy_group=deploy_group
    )
    if deployment_version == old_deployment_version:
        print(
            "Warning: The image asked to be deployed already matches what is set to be deployed:"
        )
        print(deployment_version)
        print("Continuing anyway.")

    if args.verify_image:
        if not is_docker_image_already_in_registry(
            service, args.soa_dir, commit, deployment_version.image_version
        ):
            raise ValueError(
                f"Failed to find image in the registry for the following version {deployment_version}"
            )

    deploy_info = get_deploy_info(service=service, soa_dir=args.soa_dir)
    if not can_user_deploy_service(deploy_info, service):
        sys.exit(1)

    metrics_factory: Callable[[str], metrics_lib.BaseMetrics] = metrics_lib.NoMetrics
    # only time if wait for deployment and we are actually deploying a new image
    if args.block and deployment_version != old_deployment_version:
        metrics_factory = metrics_lib.get_metrics_interface
    metrics = metrics_factory("paasta.mark_for_deployment")
    deploy_timer = metrics.create_timer(
        name="deploy_duration",
        default_dimensions=dict(
            paasta_service=service,
            deploy_group=deploy_group,
            old_version=str(old_deployment_version),
            new_version=str(deployment_version),
            deploy_timeout=args.timeout,
        ),
    )

    # meteorite deploy timers can be used as context managers; however, they
    # won't emit if the context is exited with an exception, so we need to use
    # a try/finally.
    deploy_timer.start()
    ret = 1  # assume exc, since if success will be set to 0 anyway
    try:
        deploy_process = MarkForDeploymentProcess(
            service=service,
            deploy_info=deploy_info,
            deploy_group=deploy_group,
            commit=commit,
            old_git_sha=old_deployment_version.sha if old_deployment_version else None,
            git_url=args.git_url,
            auto_rollback=args.auto_rollback,
            block=args.block,
            soa_dir=args.soa_dir,
            timeout=args.timeout,
            warn_pct=args.warn,
            auto_certify_delay=args.auto_certify_delay,
            auto_abandon_delay=args.auto_abandon_delay,
            auto_rollback_delay=args.auto_rollback_delay,
            image_version=deployment_version.image_version,
            old_image_version=old_deployment_version.image_version
            if old_deployment_version
            else None,
            authors=args.authors,
            polling_interval=args.polling_interval,
            diagnosis_interval=args.diagnosis_interval,
            time_before_first_diagnosis=args.time_before_first_diagnosis,
            metrics_interface=metrics,
        )
        ret = deploy_process.run()
        return ret
    finally:
        deploy_timer.stop(tmp_dimensions={"exit_status": ret})
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
class Progress:
    """Tracks how far along a deploy is and which instances it still waits on."""

    waiting_on: Mapping[str, Collection[str]]
    percent: float

    def __init__(
        self, percent: float = 0, waiting_on: Mapping[str, Collection[str]] = None
    ) -> None:
        self.percent = percent
        self.waiting_on = waiting_on

    def human_readable(self, summary: bool) -> str:
        """Render progress; details are only shown mid-deploy in verbose mode."""
        rounded = round(self.percent)
        show_details = not summary and self.percent != 0 and self.percent != 100
        if show_details:
            return f"{rounded}% (Waiting on {self.human_waiting_on()})"
        return f"{rounded}%"

    def human_waiting_on(self) -> str:
        """Summarize per-cluster outstanding instances, or "N/A" if unknown."""
        if self.waiting_on is None:
            return "N/A"
        descriptions = []
        for cluster, instances in self.waiting_on.items():
            count = len(instances)
            if count == 1:
                (lone_instance,) = instances
                descriptions.append(f"`{cluster}`: `{lone_instance}`")
            elif count > 1:
                descriptions.append(f"`{cluster}`: {count} instances")
            # clusters with nothing outstanding are omitted entirely
        return ", ".join(descriptions)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
class MarkForDeploymentProcess(RollbackSlackDeploymentProcess):
|
|
594
|
+
rollback_states = ["start_rollback", "rolling_back", "rolled_back"]
|
|
595
|
+
rollforward_states = ["start_deploy", "deploying", "deployed"]
|
|
596
|
+
default_slack_channel = DEFAULT_SLACK_CHANNEL
|
|
597
|
+
|
|
598
|
+
paasta_status_reminder_handle: asyncio.TimerHandle
|
|
599
|
+
|
|
600
|
+
    def __init__(
        self,
        service: str,
        deploy_info: Dict,
        deploy_group: str,
        commit: str,
        old_git_sha: str,
        git_url: str,
        auto_rollback: bool,
        block: bool,
        soa_dir: str,
        timeout: float,
        warn_pct: float,
        auto_certify_delay: float,
        auto_abandon_delay: float,
        auto_rollback_delay: float,
        image_version: Optional[str] = None,
        old_image_version: Optional[str] = None,
        authors: Optional[List[str]] = None,
        polling_interval: float = None,
        diagnosis_interval: float = None,
        time_before_first_diagnosis: float = None,
        metrics_interface: metrics_lib.BaseMetrics = metrics_lib.NoMetrics(
            "paasta.mark_for_deployment"
        ),
    ) -> None:
        """Set up the deploy state machine, watchers, and the Slack thread.

        NOTE(review): the ``metrics_interface`` default is a single shared
        ``NoMetrics`` instance evaluated at import time — assumed stateless,
        so sharing is harmless; confirm if NoMetrics ever grows state.
        """
        self.service = service
        self.deploy_info = deploy_info
        self.deploy_group = deploy_group
        self.commit = commit
        self.old_git_sha = old_git_sha
        self.image_version = image_version
        self.old_image_version = old_image_version
        # New and previously-deployed versions, used to decide whether a
        # rollback is meaningful and what to compare progress against.
        self.deployment_version = DeploymentVersion(commit, image_version)
        self.old_deployment_version = DeploymentVersion(old_git_sha, old_image_version)
        self.git_url = git_url
        # Auto-rollback only makes sense if there is a distinct older version
        # to roll back to.
        self.auto_rollback = (
            auto_rollback
            and old_git_sha is not None
            and self.deployment_version != self.old_deployment_version
        )
        self.auto_rollbacks_ever_enabled = self.auto_rollback
        self.block = block
        self.soa_dir = soa_dir
        self.timeout = timeout
        self.warn_pct = warn_pct
        # -1 means mark_for_deployment() has not run yet.
        self.mark_for_deployment_return_code = -1
        self.auto_certify_delay = auto_certify_delay
        self.auto_abandon_delay = auto_abandon_delay
        self.auto_rollback_delay = auto_rollback_delay
        self.authors = authors
        self.polling_interval = polling_interval
        self.diagnosis_interval = diagnosis_interval
        self.time_before_first_diagnosis = time_before_first_diagnosis
        self.metrics_interface = metrics_interface
        self.instance_configs_per_cluster: Dict[
            str, List[LongRunningServiceConfig]
        ] = get_instance_configs_for_service_in_deploy_group_all_clusters(
            service, deploy_group, soa_dir
        )

        # Keep track of each wait_for_deployment task so we can cancel it.
        self.wait_for_deployment_tasks: Dict[DeploymentVersion, asyncio.Task] = {}

        self.human_readable_status = "Waiting on mark-for-deployment to initialize..."
        self.progress = Progress()
        self.last_action = None
        self.slo_watchers: List[SLOWatcher] = []
        self.metric_watchers: List[MetricWatcher] = []
        self.start_slo_watcher_threads(self.service, self.soa_dir)

        # TODO: Allow both metric and slo watcher threads to run together in the future
        if can_run_metric_watcher_threads(service=self.service, soa_dir=self.soa_dir):
            self.start_metric_watcher_threads(self.service, self.soa_dir)

        # Initialize Slack threads and send the first message
        super().__init__()
        self.print_who_is_running_this()
|
|
678
|
+
|
|
679
|
+
def get_progress(self, summary: bool = False) -> str:
|
|
680
|
+
if not self.block:
|
|
681
|
+
return "Deploying in background, progress not tracked."
|
|
682
|
+
return self.progress.human_readable(summary)
|
|
683
|
+
|
|
684
|
+
def print_who_is_running_this(self) -> None:
|
|
685
|
+
build_url = get_jenkins_build_output_url()
|
|
686
|
+
if build_url is not None:
|
|
687
|
+
message = f"(<{build_url}|Jenkins Job>)"
|
|
688
|
+
else:
|
|
689
|
+
message = f"(Run by `{getpass.getuser()}` on {socket.getfqdn()})"
|
|
690
|
+
self.update_slack_thread(message)
|
|
691
|
+
|
|
692
|
+
    def get_authors(self) -> str:
        """Return the Slack mention string for the authors of this push."""
        # In order to avoid notifying people who aren't part of the current
        # service push, we calculate authors based on commits different since
        # the current production SHA, as opposed to the old SHA on this deploy
        # group.
        #
        # This avoids situations such as:
        # * Notifying people from a previous push which went through stagef,
        #   if the new push goes through stageg.
        # * Notifying everybody who has committed to a repo in the past year
        #   when updating a "legacy" deploy group (e.g. for yelp-main).
        prod_deploy_group = self.deploy_info.get("production_deploy_group")
        from_sha = None
        if prod_deploy_group is not None:
            from_sha = get_currently_deployed_sha(
                service=self.service, deploy_group=prod_deploy_group
            )
        # If there's no production deploy group, or the production deploy group
        # has never been deployed to, just use the old SHA from this deploy group.
        if from_sha is None:
            from_sha = self.old_git_sha
        return get_authors_to_be_notified(
            git_url=self.git_url,
            from_sha=from_sha,
            to_sha=self.commit,
            authors=self.authors,
        )
|
|
719
|
+
|
|
720
|
+
def ping_authors(self, message: str = None) -> None:
|
|
721
|
+
if message:
|
|
722
|
+
self.update_slack_thread(f"{message}\n{self.get_authors()}")
|
|
723
|
+
else:
|
|
724
|
+
self.update_slack_thread(self.get_authors())
|
|
725
|
+
|
|
726
|
+
def get_slack_client(self) -> SlackClient:
|
|
727
|
+
return get_slack_client().sc
|
|
728
|
+
|
|
729
|
+
    def get_slack_channel(self) -> str:
        """Safely get some slack channel to post to. Defaults to ``DEFAULT_SLACK_CHANNEL``.
        Currently only uses the first slack channel available, and doesn't support
        multi-channel notifications."""
        if self.deploy_info.get("slack_notify", True):
            try:
                channel = self.deploy_info.get("slack_channels")[0]
                # Nightly jenkins builds will often re-deploy master. This causes Slack noise that wasn't present before
                # the auto-rollbacks work.
                if self.deployment_version == self.old_deployment_version:
                    print(
                        f"Rollback image matches rollforward image: {self.deployment_version}, "
                        f"Sending slack notifications to {DEFAULT_SLACK_CHANNEL} instead of {channel}."
                    )
                    return DEFAULT_SLACK_CHANNEL
                else:
                    return channel
            # Covers slack_channels being missing (TypeError on None[0]),
            # empty (IndexError), or of an unexpected type (AttributeError).
            except (IndexError, AttributeError, TypeError):
                return DEFAULT_SLACK_CHANNEL
        else:
            return DEFAULT_SLACK_CHANNEL
|
|
750
|
+
|
|
751
|
+
def get_deployment_name(self) -> str:
|
|
752
|
+
return f"Deploy of `{self.deployment_version.short_sha_repr()}` of `{self.service}` to `{self.deploy_group}`:"
|
|
753
|
+
|
|
754
|
+
    def on_enter_start_deploy(self) -> None:
        """State-machine hook: push the deploy tag, then fire mfd_succeeded/mfd_failed."""
        self.update_slack_status(
            f"Marking `{self.deployment_version.short_sha_repr()}` for deployment for {self.deploy_group}..."
        )
        self.mark_for_deployment_return_code = mark_for_deployment(
            git_url=self.git_url,
            deploy_group=self.deploy_group,
            service=self.service,
            commit=self.commit,
            image_version=self.image_version,
        )
        if self.mark_for_deployment_return_code != 0:
            self.trigger("mfd_failed")
        else:
            # Mention the authors only if this step's deploy.yaml opts in.
            self.update_slack_thread(
                f"Marked `{self.deployment_version.short_sha_repr()}` for {self.deploy_group}."
                + (
                    "\n" + self.get_authors()
                    if self.deploy_group_is_set_to_notify("notify_after_mark")
                    else ""
                )
            )
            log.debug("triggering mfd_succeeded")
            self.trigger("mfd_succeeded")
|
|
778
|
+
|
|
779
|
+
    def schedule_paasta_status_reminder(self) -> None:
        """Schedule a one-shot "deploy may be stuck" Slack reminder.

        Fires after ``warn_pct`` percent of the deploy timeout has elapsed,
        and only if the deploy is still in the "deploying" state at that
        point. All failures here are logged and swallowed — the reminder is
        best-effort and must never break the deploy itself.
        """

        def waiting_on_to_status(
            waiting_on: Mapping[str, Collection[str]]
        ) -> List[str]:
            # Turn the per-cluster "still waiting" map into copy-pastable
            # `paasta status` commands; fall back to one deploy-group-wide
            # command when we don't know what we're waiting on.
            if waiting_on is None:
                return [
                    f"`paasta status --service {self.service} --deploy-group {self.deploy_group} -vv`"
                ]
            commands = []
            for cluster, instances in waiting_on.items():
                num_instances = len(instances)
                if num_instances == 0:
                    continue
                else:
                    commands.append(
                        f"`paasta status --service {self.service} --cluster {cluster} --instance {','.join(instances)} -vv`"
                    )
            return commands

        def times_up() -> None:
            # Runs on the event loop when the timer expires.
            try:
                if self.state == "deploying":
                    human_max_deploy_time = humanize.naturaldelta(
                        datetime.timedelta(seconds=self.timeout)
                    )
                    stuck_bounce_runbook = os.environ.get(
                        "STUCK_BOUNCE_RUNBOOK",
                        DEFAULT_STUCK_BOUNCE_RUNBOOK,
                    )
                    status_commands = "\n".join(
                        waiting_on_to_status(self.progress.waiting_on)
                    )

                    self.notify_users(
                        (
                            f"It has been {self.warn_pct}% of the "
                            f"maximum deploy time ({human_max_deploy_time}), "
                            "which means the deployment may be stuck. "
                            "Here are some things you can try:\n\n"
                            f"* See {stuck_bounce_runbook} for debugging help\n"
                            f"* Run these commands to see the status of instances that "
                            "have not yet finished deploying:\n\n"
                            f"{status_commands}"
                        )
                    )
            except Exception as e:
                log.error(
                    f"Non-fatal exception encountered when processing the status reminder: {e}"
                )

        def schedule_callback() -> None:
            # Must run on the event-loop thread; keep the handle so the
            # reminder can be cancelled when deploying finishes early.
            time_to_notify = self.timeout * self.warn_pct / 100
            self.paasta_status_reminder_handle = self.event_loop.call_later(
                time_to_notify, times_up
            )

        try:
            # We may be called from a non-event-loop thread, hence threadsafe.
            self.event_loop.call_soon_threadsafe(schedule_callback)
        except Exception as e:
            log.error(
                f"Non-fatal error encountered scheduling the status reminder callback: {e}"
            )
|
|
841
|
+
|
|
842
|
+
def cancel_paasta_status_reminder(self) -> None:
|
|
843
|
+
try:
|
|
844
|
+
handle = self.get_paasta_status_reminder_handle()
|
|
845
|
+
if handle is not None:
|
|
846
|
+
handle.cancel()
|
|
847
|
+
self.paasta_status_reminder_handle = None
|
|
848
|
+
except Exception as e:
|
|
849
|
+
log.error(
|
|
850
|
+
f"Non-fatal error encountered when canceling the paasta status reminder: {e}"
|
|
851
|
+
)
|
|
852
|
+
|
|
853
|
+
def get_paasta_status_reminder_handle(self) -> Optional[asyncio.TimerHandle]:
|
|
854
|
+
try:
|
|
855
|
+
return self.paasta_status_reminder_handle
|
|
856
|
+
except AttributeError:
|
|
857
|
+
return None
|
|
858
|
+
|
|
859
|
+
    def states(self) -> Collection[str]:
        """All states of the deploy state machine (see valid_transitions for edges)."""
        return [
            "_begin",
            "start_deploy",
            "deploying",
            "deployed",
            "mfd_failed",
            "deploy_errored",
            "deploy_cancelled",
            "start_rollback",
            "rolling_back",
            "rolled_back",
            "abandon",
            "complete",
        ]
|
|
874
|
+
|
|
875
|
+
    def start_state(self) -> str:
        """Initial state of the deploy state machine."""
        return "_begin"
|
|
877
|
+
|
|
878
|
+
    def start_transition(self) -> str:
        """Trigger fired to leave the initial state and begin deploying."""
        return "start_deploy"
|
|
880
|
+
|
|
881
|
+
    def valid_transitions(self) -> Iterator[state_machine.TransitionDefinition]:
        """Yield every transition of the deploy state machine.

        Rollback-related transitions (and the auto-rollback button/SLO/metric
        triggers) are only emitted when there is a distinct older version to
        roll back to. Transitions with ``dest=None`` are internal: they leave
        the state unchanged and only run their ``before`` callback (if any).
        """
        rollback_is_possible = (
            self.old_git_sha is not None
            and self.deployment_version != self.old_deployment_version
        )

        yield {"source": "_begin", "dest": "start_deploy", "trigger": "start_deploy"}
        yield {
            "source": "start_deploy",
            "dest": "deploying",
            "trigger": "mfd_succeeded",
        }
        yield {"source": "deploying", "dest": "deployed", "trigger": "deploy_finished"}

        yield {
            "source": ["start_deploy", "start_rollback"],
            "dest": "mfd_failed",
            "trigger": "mfd_failed",
        }
        # Errors and cancellations can happen from any non-terminal state.
        yield {
            "source": [s for s in self.states() if not self.is_terminal_state(s)],
            "dest": "deploy_errored",
            "trigger": "deploy_errored",
        }
        yield {
            "source": [s for s in self.states() if not self.is_terminal_state(s)],
            "dest": "deploy_cancelled",
            "trigger": "deploy_cancelled",
        }

        if rollback_is_possible:
            yield {
                "source": self.rollforward_states,
                "dest": "start_rollback",
                "trigger": "rollback_button_clicked",
                "before": self.log_user_rollback,
            }
            yield {
                "source": self.rollback_states,
                "dest": None,  # this makes it an "internal transition", effectively a noop.
                "trigger": "rollback_button_clicked",
            }
            yield {
                "source": self.rollforward_states,
                "dest": "start_rollback",
                "trigger": "rollback_slo_failure",
                "before": self.log_slo_rollback,
            }
            yield {
                "source": self.rollback_states,
                "dest": None,  # this makes it an "internal transition", effectively a noop.
                "trigger": "rollback_slo_failure",
            }
            yield {
                "source": self.rollforward_states,
                "dest": "start_rollback",
                "trigger": "rollback_metric_failure",
                "before": self.log_metric_rollback,
            }
            yield {
                "source": self.rollback_states,
                "dest": "start_deploy",
                "trigger": "forward_button_clicked",
            }
            yield {
                "source": self.rollforward_states,
                "dest": None,  # this makes it an "internal transition", effectively a noop.
                "trigger": "forward_button_clicked",
            }
            yield {
                "source": "start_rollback",
                "dest": "rolling_back",
                "trigger": "mfd_succeeded",
            }
            yield {
                "source": "rolling_back",
                "dest": "rolled_back",
                "trigger": "deploy_finished",
            }

        yield {
            "source": "deployed",
            "dest": "complete",
            "trigger": "complete_button_clicked",
        }
        yield {"source": "deployed", "dest": "complete", "trigger": "auto_certify"}
        yield {
            "source": ["rolled_back", "rolling_back"],
            "dest": "abandon",
            "trigger": "abandon_button_clicked",
        }
        yield {"source": "rolled_back", "dest": "abandon", "trigger": "auto_abandon"}

        if rollback_is_possible:
            # Suppress these buttons if it doesn't make sense to roll back.
            yield {
                "source": "*",
                "dest": None,  # Don't actually change state, just call the before function.
                "trigger": "enable_auto_rollbacks_button_clicked",
                "unless": [self.auto_rollbacks_enabled],
                "before": self.enable_auto_rollbacks,
            }
            yield {
                "source": "*",
                "dest": None,  # Don't actually change state, just call the before function.
                "trigger": "disable_auto_rollbacks_button_clicked",
                "conditions": [
                    self.any_rollback_condition_failing,
                    self.auto_rollbacks_enabled,
                ],
                "before": self.disable_auto_rollbacks,
            }
            yield {
                "source": "*",
                "dest": None,
                "trigger": "slos_started_failing",
                "conditions": [self.auto_rollbacks_enabled],
                "unless": [self.already_rolling_back],
                "before": functools.partial(
                    self.start_auto_rollback_countdown, "rollback_slo_failure"
                ),
            }
            yield {
                "source": "*",
                "dest": None,
                "trigger": "slos_stopped_failing",
                "before": functools.partial(
                    self.cancel_auto_rollback_countdown, "rollback_slo_failure"
                ),
            }
            yield {
                "source": "*",
                "dest": None,
                "trigger": "metrics_started_failing",
                "conditions": [self.auto_rollbacks_enabled],
                "unless": [self.already_rolling_back],
                "before": functools.partial(
                    self.start_auto_rollback_countdown, "rollback_metric_failure"
                ),
            }
            yield {
                "source": "*",
                "dest": None,
                "trigger": "metrics_stopped_failing",
                "before": functools.partial(
                    self.cancel_auto_rollback_countdown, "rollback_metric_failure"
                ),
            }
            yield {
                "source": "*",
                "dest": None,
                "trigger": "snooze_button_clicked",
                "before": self.restart_timer,
                "conditions": [self.is_timer_running],
            }
|
|
1036
|
+
|
|
1037
|
+
    def disable_auto_rollbacks(self, trigger: str) -> None:
        """Turn off auto-rollback for this deploy and announce it on Slack.

        Also cancels any in-flight rollback countdown for ``trigger``.
        """
        self.cancel_auto_rollback_countdown(trigger=trigger)
        self.auto_rollback = False
        self.update_slack_status(
            f"Automatic rollback disabled for this deploy. To disable this permanently for this step, edit `deploy.yaml` and set `auto_rollback: false` for the `{self.deploy_group}` step."
        )
|
|
1043
|
+
|
|
1044
|
+
def enable_auto_rollbacks(self) -> None:
|
|
1045
|
+
self.auto_rollback = True
|
|
1046
|
+
self.auto_rollbacks_ever_enabled = True
|
|
1047
|
+
self.update_slack_status(
|
|
1048
|
+
f"Automatic rollback enabled for this deploy. Will watch for failures and rollback when necessary. To set this permanently, edit `deploy.yaml` and set `auto_rollback: false` for the `{self.deploy_group}` step."
|
|
1049
|
+
)
|
|
1050
|
+
|
|
1051
|
+
    def auto_rollbacks_enabled(self) -> bool:
        """This getter exists so it can be a condition on transitions, since those need to be callables."""
        return self.auto_rollback
|
|
1054
|
+
|
|
1055
|
+
    def get_auto_rollback_delay(self) -> float:
        """Seconds to wait before an automatic rollback actually fires."""
        return self.auto_rollback_delay
|
|
1057
|
+
|
|
1058
|
+
def get_auto_certify_delay(self) -> float:
|
|
1059
|
+
if self.auto_certify_delay is not None:
|
|
1060
|
+
return self.auto_certify_delay
|
|
1061
|
+
else:
|
|
1062
|
+
if self.auto_rollbacks_ever_enabled:
|
|
1063
|
+
return DEFAULT_AUTO_CERTIFY_DELAY
|
|
1064
|
+
else:
|
|
1065
|
+
return 0
|
|
1066
|
+
|
|
1067
|
+
    def already_rolling_back(self) -> bool:
        """True when the state machine is in any rollback-side state."""
        return self.state in self.rollback_states
|
|
1069
|
+
|
|
1070
|
+
    def status_code_by_state(self) -> Mapping[str, int]:
        """Map states to process exit codes; states present here end the process."""
        codes = {
            "deploy_errored": 2,
            "deploy_cancelled": 1,
            "mfd_failed": self.mark_for_deployment_return_code,
            "abandon": 1,
            "complete": 0,
        }

        if not self.block:
            # If we don't pass --wait-for-deployment, then exit immediately after mark-for-deployment succeeds.
            codes["deploying"] = 0
        if self.get_auto_certify_delay() <= 0:
            # Instead of setting a 0-second timer to move to certify, just exit 0 when the deploy finishes.
            codes["deployed"] = 0

        return codes
|
|
1087
|
+
|
|
1088
|
+
def get_active_button(self) -> Optional[str]:
|
|
1089
|
+
return {
|
|
1090
|
+
"start_deploy": "forward",
|
|
1091
|
+
"deploying": "forward",
|
|
1092
|
+
"deployed": None,
|
|
1093
|
+
"start_rollback": "rollback",
|
|
1094
|
+
"rolling_back": "rollback",
|
|
1095
|
+
"rolled_back": None,
|
|
1096
|
+
}.get(self.state)
|
|
1097
|
+
|
|
1098
|
+
def on_enter_mfd_failed(self) -> None:
    """Update the Slack status when mark-for-deployment itself failed."""
    short_version = self.deployment_version.short_sha_repr()
    message = f"Marking `{short_version}` for deployment for {self.deploy_group} failed. Please see Jenkins for more output."  # noqa: E501
    self.update_slack_status(message)
|
|
1102
|
+
|
|
1103
|
+
def on_enter_deploying(self) -> None:
    """Start the background deployment watcher and (re)arm the status reminder.

    The watcher thread runs do_wait_for_deployment for the new commit/image
    version; it is a daemon so it never blocks process exit.
    """
    # if self.block is False, then deploying is a terminal state so we will promptly exit.
    # Don't bother starting the background thread in this case.
    if self.block:
        thread = Thread(
            target=self.do_wait_for_deployment,
            args=(self.commit, self.image_version),
            daemon=True,
        )
        thread.start()
        # Cancel-then-schedule resets any reminder left over from a previous
        # pass through this state before arming a fresh one.
        self.cancel_paasta_status_reminder()
        self.schedule_paasta_status_reminder()
|
|
1115
|
+
|
|
1116
|
+
def on_exit_deploying(self) -> None:
    """Stop the deployment watcher and cancel the status reminder when leaving ``deploying``.

    Fix: pass ``self.image_version`` along with the commit. The watcher was
    started with ``args=(self.commit, self.image_version)`` and
    stop_waiting_for_deployment keys its task table on
    DeploymentVersion(sha, image_version); omitting the image version meant
    the lookup used ``None`` and silently failed (swallowed KeyError) to
    cancel the task whenever an image_version was in use.
    """
    self.stop_waiting_for_deployment(self.commit, self.image_version)
    self.cancel_paasta_status_reminder()
|
|
1119
|
+
|
|
1120
|
+
def on_enter_start_rollback(self) -> None:
    """Mark the old version for deployment to begin a rollback.

    Announces the rollback in Slack, re-runs mark_for_deployment pointing at
    the old sha/image version, then fires either ``mfd_failed`` or
    ``mfd_succeeded`` on the state machine depending on the return code.
    """
    self.update_slack_status(
        f"Rolling back ({self.deploy_group}) to {self.old_deployment_version}"
    )
    self.mark_for_deployment_return_code = mark_for_deployment(
        git_url=self.git_url,
        deploy_group=self.deploy_group,
        service=self.service,
        commit=self.old_git_sha,
        image_version=self.old_image_version,
    )

    if self.mark_for_deployment_return_code != 0:
        self.trigger("mfd_failed")
    else:
        # Optionally append author pings when this deploy group wants
        # notifications after marking.
        self.update_slack_thread(
            f"Marked `{self.old_git_sha[:8]}` for {self.deploy_group}."
            + (
                "\n" + self.get_authors()
                if self.deploy_group_is_set_to_notify("notify_after_mark")
                else ""
            )
        )

        self.trigger("mfd_succeeded")
|
|
1145
|
+
|
|
1146
|
+
def on_enter_rolling_back(self) -> None:
    """Start a background watcher for the rollback bounce (only when blocking)."""
    if not self.block:
        return
    watcher = Thread(
        target=self.do_wait_for_deployment,
        args=(self.old_git_sha, self.old_image_version),
        daemon=True,
    )
    watcher.start()
|
|
1154
|
+
|
|
1155
|
+
def on_exit_rolling_back(self) -> None:
    """Cancel the rollback watcher task when leaving the rolling_back state."""
    self.stop_waiting_for_deployment(self.old_git_sha, self.old_image_version)
|
|
1157
|
+
|
|
1158
|
+
def on_enter_deploy_errored(self) -> None:
    """Report an aborted deploy and give manual-rollback guidance.

    Fix: dropped the extraneous ``f`` prefix from the status string — it had
    no placeholders (flake8 F541); the runtime text is unchanged.
    """
    report_waiting_aborted(self.service, self.deploy_group)
    self.update_slack_status("Deploy aborted, but it will still try to converge.")
    self.send_manual_rollback_instructions()
    if self.deploy_group_is_set_to_notify("notify_after_abort"):
        self.ping_authors("Deploy errored")
|
|
1164
|
+
|
|
1165
|
+
def on_enter_deploy_cancelled(self) -> None:
    """Ping the authors on cancellation, if this deploy group opted in."""
    if not self.deploy_group_is_set_to_notify("notify_after_abort"):
        return
    self.ping_authors("Deploy cancelled")
|
|
1168
|
+
|
|
1169
|
+
def stop_waiting_for_deployment(
    self, target_commit: str, target_image_version: Optional[str] = None
) -> None:
    """Cancel and forget the wait_for_deployment task for the given version.

    Silently a no-op when there is no task registered for that
    DeploymentVersion or the task is already done — that is intentional, so
    callers can always call this defensively.
    """
    try:
        target_version = DeploymentVersion(
            sha=target_commit, image_version=target_image_version
        )
        self.wait_for_deployment_tasks[target_version].cancel()
        del self.wait_for_deployment_tasks[target_version]
    except (KeyError, asyncio.InvalidStateError):
        # No such task, or it already completed; nothing to stop.
        pass
|
|
1180
|
+
|
|
1181
|
+
@a_sync.to_blocking
async def do_wait_for_deployment(
    self, target_commit: str, target_image_version: Optional[str] = None
) -> None:
    """Run wait_for_deployment for one version and route its outcome to state-machine triggers.

    Runs as a blocking call (``@a_sync.to_blocking``) on a background thread.
    Outcomes: success -> ``deploy_finished``; timeout/Ctrl-C ->
    ``deploy_cancelled``; unknown cluster or unexpected error ->
    ``deploy_errored``; task cancellation (via stop_waiting_for_deployment)
    -> no trigger at all.
    """
    try:
        target_version = DeploymentVersion(
            sha=target_commit, image_version=target_image_version
        )
        # Replace any stale watcher for the same version before starting a new one.
        self.stop_waiting_for_deployment(target_commit, target_image_version)
        wait_for_deployment_task = asyncio.create_task(
            wait_for_deployment(
                service=self.service,
                deploy_group=self.deploy_group,
                instance_configs_per_cluster=self.instance_configs_per_cluster,
                git_sha=target_commit,
                image_version=target_image_version,
                soa_dir=self.soa_dir,
                timeout=self.timeout,
                progress=self.progress,
                polling_interval=self.polling_interval,
                diagnosis_interval=self.diagnosis_interval,
                time_before_first_diagnosis=self.time_before_first_diagnosis,
                notify_fn=self.ping_authors,
            )
        )
        # Register the task so stop_waiting_for_deployment can cancel it later.
        self.wait_for_deployment_tasks[target_version] = wait_for_deployment_task
        await wait_for_deployment_task
        if self.deploy_group_is_set_to_notify("notify_after_wait"):
            self.ping_authors(
                f"Finished waiting for deployment of {target_version}"
            )
        else:
            self.update_slack_thread(
                f"Finished waiting for deployment of {target_version}"
            )
        self.trigger("deploy_finished")

    except (KeyboardInterrupt, TimeoutError):
        self.trigger("deploy_cancelled")
    except NoSuchCluster:
        self.trigger("deploy_errored")
    except asyncio.CancelledError:
        # Don't trigger deploy_errored when someone calls stop_waiting_for_deployment.
        pass
    except Exception:
        log.error("Caught exception in wait_for_deployment:")
        log.error(traceback.format_exc())
        self.trigger("deploy_errored")
|
|
1229
|
+
|
|
1230
|
+
def on_enter_rolled_back(self) -> None:
    """Announce the completed rollback and arm the auto-abandon countdown."""
    old_short = self.old_deployment_version.short_sha_repr()
    self.update_slack_status(
        f"Finished rolling back to `{old_short}` in {self.deploy_group}"
    )
    line = f"Rollback to {old_short} for {self.deploy_group} complete"
    _log(service=self.service, component="deploy", line=line, level="event")
    self.start_timer(self.auto_abandon_delay, "auto_abandon", "abandon")
|
|
1237
|
+
|
|
1238
|
+
def on_enter_deployed(self) -> None:
    """Announce a finished deployment and decide whether/when to auto-certify.

    If an SLO or rollback-triggering metric is failing while auto-rollbacks
    are on, no certify timer is started — the run waits for a human button
    press instead.
    """
    self.update_slack_status(
        f"Finished deployment of `{self.deployment_version.short_sha_repr()}` to {self.deploy_group}"
    )
    line = f"Deployment of {self.deployment_version.short_sha_repr()} for {self.deploy_group} complete"
    _log(service=self.service, component="deploy", line=line, level="event")
    self.send_manual_rollback_instructions()

    if self.any_slo_failing() and self.auto_rollbacks_enabled():
        self.ping_authors(
            "Because an SLO is currently failing, we will not automatically certify. Instead, we will wait indefinitely until you click one of the buttons above."
        )
    elif self.any_metric_failing() and self.auto_rollbacks_enabled():
        self.ping_authors(
            "Because a rollback-triggering metric for this service is currently failing, we will not automatically certify. Instead, we will wait indefinitely until you click one of the buttons above."
        )
    else:
        # Start the certify countdown only when a positive delay is configured;
        # with delay <= 0 the "deployed" state itself is a success exit (see
        # status_code_by_state).
        if self.get_auto_certify_delay() > 0:
            self.start_timer(
                self.get_auto_certify_delay(), "auto_certify", "certify"
            )
            if self.deploy_group_is_set_to_notify("notify_after_good_deploy"):
                self.ping_authors()
|
|
1261
|
+
|
|
1262
|
+
def on_enter_complete(self) -> None:
    """Ping the authors on completion, if this deploy group opted in."""
    if not self.deploy_group_is_set_to_notify("notify_after_good_deploy"):
        return
    self.ping_authors()
|
|
1265
|
+
|
|
1266
|
+
def send_manual_rollback_instructions(self) -> None:
    """Post (and print) the manual `paasta rollback` command, unless the old and new versions match."""
    if self.deployment_version == self.old_deployment_version:
        return
    extra_rollback_args = ""
    if self.old_deployment_version.image_version:
        extra_rollback_args = (
            f" --image-version {self.old_deployment_version.image_version}"
        )
    message = (
        "If you need to roll back manually, run: "
        f"`paasta rollback --service {self.service} --deploy-group {self.deploy_group} "
        f"--commit {self.old_git_sha}{extra_rollback_args}`"
    )
    self.update_slack_thread(message)
    print(message)
|
|
1280
|
+
|
|
1281
|
+
def after_state_change(self) -> None:
    """Refresh the Slack message after every state transition, then run the base-class hook."""
    self.update_slack()
    super().after_state_change()
|
|
1284
|
+
|
|
1285
|
+
def get_signalfx_api_token(self) -> Optional[str]:
    """Fetch the SignalFx API key from the system paasta monitoring config.

    Returns None when ``signalfx_api_key`` is absent (the ``.get`` default) —
    annotation widened from ``str`` to reflect that; callers should handle
    the missing-key case.
    """
    return (
        load_system_paasta_config()
        .get_monitoring_config()
        .get("signalfx_api_key", None)
    )
|
|
1291
|
+
|
|
1292
|
+
def get_splunk_api_token(self) -> SplunkAuth:
    """Build Splunk credentials from system paasta config plus the SPLUNK_MFD_TOKEN env var.

    Raises KeyError when SPLUNK_MFD_TOKEN is unset, and raises (KeyError or
    TypeError) when ``splunk_mfd_authentication`` is missing or lacks
    host/port/username — there is deliberately no fallback here.
    """
    auth_token = os.environ["SPLUNK_MFD_TOKEN"]
    auth_data = (
        load_system_paasta_config()
        .get_monitoring_config()
        .get("splunk_mfd_authentication")
    )

    return SplunkAuth(
        host=auth_data["host"],
        port=auth_data["port"],
        username=auth_data["username"],
        password=auth_token,
    )
|
|
1306
|
+
|
|
1307
|
+
def get_button_text(self, button: str, is_active: bool) -> str:
    """Return the Slack button label for ``button`` in its active/inactive form.

    Slack button text is capped at 75 characters; the current templates
    leave room for 36 characters of new-version text and 43 of old-version
    text, so over-long versions are replaced with generic placeholders and
    later Slack messages carry the full identifier.
    """
    new_version = self.deployment_version.short_sha_repr()
    if len(new_version) > 36:
        # Too long for the template; rely on subsequent messages for the full version.
        new_version = "new version"

    active_texts = {"forward": f"Rolling Forward to {new_version} :zombocom:"}
    inactive_texts = {
        "forward": f"Continue Forward to {new_version} :arrow_forward:",
        "complete": f"Complete deploy to {new_version} :white_check_mark:",
        "snooze": "Reset countdown",
        "enable_auto_rollbacks": "Enable auto rollbacks :eyes:",
        "disable_auto_rollbacks": "Disable auto rollbacks :close_eyes_monkey:",
    }

    if self.old_deployment_version is not None:
        old_version = self.old_deployment_version.short_sha_repr()
        if len(old_version) > 43:
            old_version = "old version"
        active_texts["rollback"] = f"Rolling Back to {old_version} :zombocom:"
        inactive_texts["rollback"] = f"Roll Back to {old_version} :arrow_backward:"
        inactive_texts["abandon"] = f"Abandon deploy, staying on {old_version} :x:"

    return (active_texts if is_active else inactive_texts)[button]
|
|
1341
|
+
|
|
1342
|
+
def start_auto_rollback_countdown(self, trigger: str, extra_text: str = "") -> None:
    """Start the base-class rollback countdown with Slack-specific cancel instructions.

    The ``extra_text`` parameter is accepted for signature compatibility with
    the base class but is replaced by the cancel-button hint.
    """
    cancel_button_text = self.get_button_text(
        button="disable_auto_rollbacks",
        is_active=False,
    )
    super().start_auto_rollback_countdown(
        trigger=trigger,
        extra_text=f'Click "{cancel_button_text}" to cancel this!',
    )
    if self.deploy_group_is_set_to_notify("notify_after_auto_rollback"):
        self.ping_authors()
|
|
1352
|
+
|
|
1353
|
+
def deploy_group_is_set_to_notify(self, notify_type: str) -> bool:
    """Check whether this process's deploy group opted into ``notify_type`` notifications.

    Delegates to the module-level helper of the same name (defined elsewhere
    in this file), passing this run's deploy_info and deploy_group.
    """
    return deploy_group_is_set_to_notify(
        self.deploy_info, self.deploy_group, notify_type
    )
|
|
1357
|
+
|
|
1358
|
+
def __build_rollback_audit_details(
    self, rollback_type: RollbackTypes
) -> Dict[str, str]:
    """Assemble the audit/metric dimensions describing this rollback."""
    details: Dict[str, str] = {}
    details["rolled_back_from"] = str(self.deployment_version)
    details["rolled_back_to"] = str(self.old_deployment_version)
    details["rollback_type"] = rollback_type.value
    details["deploy_group"] = self.deploy_group
    return details
|
|
1367
|
+
|
|
1368
|
+
def log_slo_rollback(self) -> None:
    """Record an automatic, SLO-triggered rollback in metrics and the audit log."""
    self._log_rollback(
        self.__build_rollback_audit_details(RollbackTypes.AUTOMATIC_SLO_ROLLBACK)
    )
|
|
1373
|
+
|
|
1374
|
+
def log_metric_rollback(self) -> None:
    """Record an automatic, metric-triggered rollback in metrics and the audit log."""
    self._log_rollback(
        self.__build_rollback_audit_details(RollbackTypes.AUTOMATIC_METRIC_ROLLBACK)
    )
|
|
1379
|
+
|
|
1380
|
+
def log_user_rollback(self) -> None:
    """Record a user-initiated rollback in metrics and the audit log."""
    self._log_rollback(
        self.__build_rollback_audit_details(RollbackTypes.USER_INITIATED_ROLLBACK)
    )
|
|
1385
|
+
|
|
1386
|
+
def _log_rollback(self, rollback_details: Dict[str, str]) -> None:
    """Emit one rollback metric event per cluster, then write a single audit-log entry."""
    base_dimensions = dict(rollback_details, paasta_service=self.service)
    # Emit one event per cluster to sfx
    for cluster in self.instance_configs_per_cluster.keys():
        self.metrics_interface.emit_event(
            name="rollback",
            dimensions=dict(base_dimensions, paasta_cluster=cluster),
        )
    _log_audit(
        action="rollback",
        action_details=rollback_details,
        service=self.service,
    )
|
|
1402
|
+
|
|
1403
|
+
|
|
1404
|
+
async def wait_until_instance_is_done(
    executor: concurrent.futures.Executor,
    service: str,
    instance: str,
    cluster: str,
    version: DeploymentVersion,
    instance_config: LongRunningServiceConfig,
    polling_interval: float,
    diagnosis_interval: float,
    time_before_first_diagnosis: float,
    should_ping_for_unhealthy_pods: bool,
    notify_fn: Optional[Callable[[str], None]] = None,
) -> Tuple[str, str]:
    """Poll one instance until check_if_instance_is_done reports it finished.

    Spawns a side task that periodically prints diagnosis output while
    waiting; that task is always cancelled on exit (success or cancellation).
    Returns (cluster, instance) so callers awaiting many of these can tell
    which one completed.
    """
    loop = asyncio.get_running_loop()
    diagnosis_task = asyncio.create_task(
        periodically_diagnose_instance(
            executor,
            service,
            instance,
            cluster,
            version,
            instance_config,
            diagnosis_interval,
            time_before_first_diagnosis,
            should_ping_for_unhealthy_pods,
            notify_fn,
        )
    )
    try:
        # The status check does blocking API calls, so run it on the thread
        # pool and sleep between polls.
        while not await loop.run_in_executor(
            executor,
            functools.partial(
                check_if_instance_is_done,
                service,
                instance,
                cluster,
                version,
                instance_config,
            ),
        ):
            await asyncio.sleep(polling_interval)
        return (
            cluster,
            instance,
        )  # for the convenience of the caller, to know which future is finishing.
    finally:
        diagnosis_task.cancel()
|
|
1451
|
+
|
|
1452
|
+
|
|
1453
|
+
async def periodically_diagnose_instance(
    executor: concurrent.futures.Executor,
    service: str,
    instance: str,
    cluster: str,
    version: DeploymentVersion,
    instance_config: LongRunningServiceConfig,
    diagnosis_interval: float,
    time_before_first_diagnosis: float,
    should_ping_for_unhealthy_pods: bool,
    notify_fn: Optional[Callable[[str], None]] = None,
) -> None:
    """Print "why is this instance stuck" diagnostics forever, on an interval.

    Runs until cancelled by the caller (wait_until_instance_is_done). The
    first diagnosis is delayed by ``time_before_first_diagnosis``; each
    subsequent one by ``diagnosis_interval``. Any failure to fetch status is
    printed and swallowed so one bad poll doesn't kill the loop —
    CancelledError is re-raised so cancellation still works.
    """
    await asyncio.sleep(time_before_first_diagnosis)
    loop = asyncio.get_running_loop()
    while True:
        try:
            # The diagnosis does blocking API calls; run it on the thread pool.
            await loop.run_in_executor(
                executor,
                functools.partial(
                    diagnose_why_instance_is_stuck,
                    service,
                    instance,
                    cluster,
                    version,
                    instance_config,
                    should_ping_for_unhealthy_pods,
                    notify_fn,
                ),
            )
        except asyncio.CancelledError:
            raise
        except Exception:
            print(f"Couldn't get status of {service}.{instance}:")
            traceback.print_exc()
        await asyncio.sleep(diagnosis_interval)
|
|
1488
|
+
|
|
1489
|
+
|
|
1490
|
+
def diagnose_why_instance_is_stuck(
    service: str,
    instance: str,
    cluster: str,
    version: DeploymentVersion,
    instance_config: LongRunningServiceConfig,
    should_ping_for_unhealthy_pods: bool,
    notify_fn: Optional[Callable[[str], None]] = None,
) -> None:
    """Print a per-version status table for an instance and optionally ping about unhealthy pods.

    Fetches the v2 Kubernetes status from the PaaSTA API; API errors are
    logged and the function returns early (best-effort diagnosis only).
    """
    api = client.get_paasta_oapi_client(
        cluster=get_paasta_oapi_api_clustername(
            cluster=cluster,
            is_eks=(instance_config.get_instance_type() == "eks"),
        ),
    )
    try:
        status = api.service.status_instance(
            service=service,
            instance=instance,
            include_envoy=False,
            include_mesos=False,
            new=True,
        )
    except api.api_error as e:
        log.warning(
            "Error getting service status from PaaSTA API for "
            f"{cluster}: {e.status} {e.reason}"
        )
        return

    print(f" Status for {service}.{instance} in {cluster}:")
    for active_version in status.kubernetes_v2.versions:
        # We call get_version_table_entry directly so that we can set version_name_suffix based on git_sha instead of
        # creation time of the version (which is what get_versions_table does.)
        # Without this, we'd call the old version "new" until the new version is actually created, which would be confusing.
        for line in get_version_table_entry(
            active_version,
            service,
            instance,
            cluster,
            version_name_suffix="new"
            if active_version.git_sha == version.sha
            and active_version.image_version == version.image_version
            else "old",
            show_config_sha=True,
            verbose=0,
        ):
            print(f" {line}")
    print("")

    if should_ping_for_unhealthy_pods and notify_fn:
        maybe_ping_for_unhealthy_pods(
            service, instance, cluster, version, status, notify_fn
        )
|
|
1544
|
+
|
|
1545
|
+
|
|
1546
|
+
# Process-wide latch so maybe_ping_for_unhealthy_pods notifies at most once
# per process; it is flipped to True after the first ping.
already_pinged = False
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
def maybe_ping_for_unhealthy_pods(
    service: str,
    instance: str,
    cluster: str,
    version: DeploymentVersion,
    status: InstanceStatusKubernetesV2,
    notify_fn: Callable[[str], None],
) -> None:
    """Ping (at most once per process) when pods of the new version look unhealthy.

    Uses the module-level ``already_pinged`` latch so repeated diagnosis
    passes — across all instances/clusters in this process — produce only
    one notification.
    """
    global already_pinged

    if not already_pinged:
        # there can be multiple current versions, e.g. if someone changes yelpsoa-configs during a bounce.
        current_versions = [
            v
            for v in status.kubernetes_v2.versions
            if v.git_sha == version.sha and v.image_version == version.image_version
        ]
        pingable_pods = [
            pod
            for current_version in current_versions
            for pod in current_version.pods
            if should_ping_for_pod(pod)
        ]
        if pingable_pods:
            # Set the latch before notifying so we never double-ping.
            already_pinged = True
            ping_for_pods(service, instance, cluster, pingable_pods, notify_fn)
|
|
1575
|
+
|
|
1576
|
+
|
|
1577
|
+
def should_ping_for_pod(pod: KubernetesPodV2) -> bool:
    """Ping only for pods whose main container restarted recently.

    Delegates to recent_container_restart/get_main_container (defined
    elsewhere in this module) for the actual heuristic.
    """
    return recent_container_restart(get_main_container(pod))
|
|
1579
|
+
|
|
1580
|
+
|
|
1581
|
+
def ping_for_pods(
    service: str,
    instance: str,
    cluster: str,
    pods: List[KubernetesPodV2],
    notify_fn: Callable[[str], None],
) -> None:
    """Notify authors about unhealthy pods, grouped by failure reason, with a tip per reason.

    Fix: the OOMKilled tip previously passed a one-element tuple of
    implicitly concatenated string literals to ``" ".join`` (missing commas),
    so no separators were inserted and the sentences ran together
    ("...old version.You may want...roll back.Ask #paasta..."). The commas
    below make the join actually insert the spaces.
    """
    # Bucket pods by the main container's termination reason.
    pods_by_reason: Dict[str, List[KubernetesPodV2]] = {}
    for pod in pods:
        pods_by_reason.setdefault(get_main_container(pod).reason, []).append(pod)

    for reason, pods_with_reason in pods_by_reason.items():
        explanation = {
            "Error": "crashed on startup",
            "OOMKilled": "run out of memory",
            "CrashLoopBackOff": "crashed on startup several times, and Kubernetes is backing off restarting them",
        }.get(reason, f"restarted ({reason})")

        status_tip = f"Take a look at the output of your unhealthy pods with `paasta status -s {service} -i {instance} -c {cluster} -vv` (more -v for more output.)"

        tip = {
            "Error": (
                f"This may indicate a bug in your code, a misconfiguration in yelpsoa-configs, or missing srv-configs. {status_tip}"
            ),
            "CrashLoopBackOff": f"This may indicate a bug in your code, a misconfiguration in yelpsoa-configs, or missing srv-configs. {status_tip}",
            "OOMKilled": " ".join(
                (
                    "This probably means your new version of code requires more memory than the old version.",
                    "You may want to increase memory in yelpsoa-configs or roll back.",
                    "Ask #paasta if you need help with this.",
                )
            ),
        }.get(reason, "")

        notify_fn(
            f"Some of the replicas of your new version have {explanation}: {', '.join(f'`{p.name}`' for p in pods_with_reason)}\n{tip}"
        )
|
|
1618
|
+
|
|
1619
|
+
|
|
1620
|
+
def check_if_instance_is_done(
    service: str,
    instance: str,
    cluster: str,
    version: DeploymentVersion,
    instance_config: LongRunningServiceConfig,
    api: Optional[client.PaastaOApiClient] = None,
) -> bool:
    """Decide whether one instance has finished bouncing to ``version``.

    Returns True when the instance is done (fully bounced, or stopped, or
    not a bounceable instance type) and False when we should keep polling
    (still bouncing, not started, not enough replicas, or the API was
    unreachable — treated as "not deployed yet").
    """
    if api is None:
        api = client.get_paasta_oapi_client(
            cluster=get_paasta_oapi_api_clustername(
                cluster=cluster,
                is_eks=(instance_config.get_instance_type() == "eks"),
            ),
        )
        if not api:
            log.warning(
                "Couldn't reach the PaaSTA api for {}! Assuming it is not "
                "deployed there yet.".format(cluster)
            )
            return False

    inst_str = f"{service}.{instance} in {cluster}"
    log.debug(f"Inspecting the deployment status of {inst_str}")

    status = None
    try:
        status = api.service.bounce_status_instance(service=service, instance=instance)
    except api.api_error as e:
        if e.status == 404:  # non-existent instance
            # TODO(PAASTA-17290): just print the error message so that we
            # can distinguish between sources of 404s
            log.warning(
                "Can't get status for instance {}, service {} in "
                "cluster {}. This is normally because it is a new "
                "service that hasn't been deployed by PaaSTA yet.".format(
                    instance, service, cluster
                )
            )
        elif e.status == 599:  # Temporary issue
            log.warning(
                f"Temporary issue fetching service status from PaaSTA API for {cluster}. Will retry on next poll interval."
            )
        else:  # 500 - error talking to api
            log.warning(
                "Error getting service status from PaaSTA API for "
                f"{cluster}: {e.status} {e.reason}"
            )

        # All API errors land here: keep polling rather than failing the deploy.
        log.debug(f"No status for {inst_str}. Not deployed yet.")
        return False

    if not status:  # 204 - instance is not bounceable
        log.debug(
            f"{inst_str} is not a supported bounceable instance. "
            "Only long-running instances running on Kubernetes are currently "
            "supported. Continuing without watching."
        )
        return True

    # Case: instance is stopped
    if status.expected_instance_count == 0 or status.desired_state == "stop":
        log.debug(f"{inst_str} is marked as stopped. Ignoring it.")
        return True

    # active_versions entries are (git_sha, image_version, config_sha) triples;
    # the config sha is irrelevant to "did the version land".
    active_deploy_versions = {
        DeploymentVersion(sha=g, image_version=i) for g, i, c in status.active_versions
    }
    if version in active_deploy_versions:
        non_desired_versions = active_deploy_versions.difference({version})
        # Case: bounce in-progress
        if len(non_desired_versions) == 1:
            (other_version,) = non_desired_versions
            print(f" {inst_str} is still bouncing, from {other_version} to {version}")
            return False

        # Case: previous bounces not yet finished when this one was triggered
        elif len(non_desired_versions) > 1:
            print(
                f" {inst_str} is still bouncing to {version}, but there are "
                f"multiple other bouncing versions running: {non_desired_versions}"
            )
            return False
        # Exactly our version running: fall through to health/replica checks.
    else:
        # Case: bounce not yet started
        print(
            f" {inst_str} hasn't started bouncing to {version}; "
            f"only the following versions are running: {active_deploy_versions}"
        )
        return False

    # Case: instance is in not running
    if status.deploy_status not in {"Running", "Deploying", "Waiting"}:
        print(
            f" {inst_str} isn't running yet; it is in the state: {status.deploy_status}"
        )
        return False

    # Case: not enough replicas are up for the instance to be considered bounced
    # The bounce margin factor defines what proportion of instances we need to be "safe",
    # so consider it scaled up "enough" if we have that proportion of instances ready.
    required_instance_count = int(
        math.ceil(
            instance_config.get_bounce_margin_factor() * status.expected_instance_count
        )
    )
    if required_instance_count > status.running_instance_count:
        print(
            f" {inst_str} has only {status.running_instance_count} replicas up, "
            f"below the required minimum of {required_instance_count}"
        )
        return False

    # Case: completed
    print(
        f"Complete: {service}.{instance} on {cluster} is 100% deployed at "
        f"{status.running_instance_count} replicas on {DeploymentVersion(sha=status.active_versions[0][0], image_version=status.active_versions[0][1])}"
    )
    return True
|
|
1739
|
+
|
|
1740
|
+
|
|
1741
|
+
# Long-running instance-type classes that wait_for_deployment knows how to
# watch for bounce completion.
WAIT_FOR_INSTANCE_CLASSES = [
    KubernetesDeploymentConfig,
    EksDeploymentConfig,
    CassandraClusterDeploymentConfig,
]
|
|
1746
|
+
|
|
1747
|
+
|
|
1748
|
+
def get_instance_configs_for_service_in_cluster_and_deploy_group(
    service_configs: PaastaServiceConfigLoader, cluster: str, deploy_group: str
) -> Iterator[LongRunningServiceConfig]:
    """Yield the instance configs in ``cluster`` that belong to ``deploy_group``.

    Only instance types listed in WAIT_FOR_INSTANCE_CLASSES are considered.
    """
    for instance_class in WAIT_FOR_INSTANCE_CLASSES:
        yield from (
            config
            for config in service_configs.instance_configs(
                cluster=cluster, instance_type_class=instance_class
            )
            if config.get_deploy_group() == deploy_group
        )
|
|
1757
|
+
|
|
1758
|
+
|
|
1759
|
+
def get_instance_configs_for_service_in_deploy_group_all_clusters(
    service: str, deploy_group: str, soa_dir: str
) -> Dict[str, List[LongRunningServiceConfig]]:
    """Collect per-cluster instance configs for ``deploy_group`` across every cluster the service runs in.

    Raises NoSuchCluster when any of the service's clusters has no entry in
    the system paasta-api endpoints config — we can't watch a cluster we
    can't reach.
    """
    service_configs = PaastaServiceConfigLoader(
        service=service, soa_dir=soa_dir, load_deployments=False
    )

    instance_configs_per_cluster = {}

    api_endpoints = load_system_paasta_config().get_api_endpoints()
    for cluster in service_configs.clusters:
        if cluster not in api_endpoints:
            print(
                PaastaColors.red(
                    "Cluster %s is NOT in paasta-api endpoints config." % cluster
                )
            )
            raise NoSuchCluster

        instance_configs_per_cluster[cluster] = list(
            get_instance_configs_for_service_in_cluster_and_deploy_group(
                service_configs, cluster, deploy_group
            )
        )

    return instance_configs_per_cluster
|
|
1785
|
+
|
|
1786
|
+
|
|
1787
|
+
async def wait_for_deployment(
    service: str,
    deploy_group: str,
    git_sha: str,
    soa_dir: str,
    timeout: float,
    image_version: Optional[str] = None,
    instance_configs_per_cluster: Optional[
        Dict[str, List[LongRunningServiceConfig]]
    ] = None,
    progress: Optional[Progress] = None,
    # Annotations below widened from bare `float` — these genuinely default to
    # None and are resolved from system config further down.
    polling_interval: Optional[float] = None,
    diagnosis_interval: Optional[float] = None,
    time_before_first_diagnosis: Optional[float] = None,
    notify_fn: Optional[Callable[[str], None]] = None,
) -> Optional[int]:
    """Wait until every matching instance in every cluster has bounced to (git_sha, image_version).

    Returns 0 on success, None when there is nothing to wait for; raises
    TimeoutError on timeout and re-raises CancelledError after cancelling
    all per-instance watcher tasks.
    """
    if not instance_configs_per_cluster:
        instance_configs_per_cluster = (
            get_instance_configs_for_service_in_deploy_group_all_clusters(
                service, deploy_group, soa_dir
            )
        )
    total_instances = sum(len(ics) for ics in instance_configs_per_cluster.values())

    target_version = DeploymentVersion(sha=git_sha, image_version=image_version)
    # Still empty after the lookup above: nothing long-running to watch.
    if not instance_configs_per_cluster:
        _log(
            service=service,
            component="deploy",
            line=(
                "Couldn't find any long-running instances for service {} in deploy group {}. Exiting.".format(
                    service, deploy_group
                )
            ),
            level="event",
        )
        return None

    print(
        "Waiting for deployment of {} for '{}' to complete...".format(
            target_version, deploy_group
        )
    )

    system_paasta_config = load_system_paasta_config()
    max_workers = system_paasta_config.get_mark_for_deployment_max_polling_threads()
    # Fill in any intervals the caller didn't specify from system config defaults.
    if polling_interval is None:
        polling_interval = (
            system_paasta_config.get_mark_for_deployment_default_polling_interval()
        )
    if diagnosis_interval is None:
        diagnosis_interval = (
            system_paasta_config.get_mark_for_deployment_default_diagnosis_interval()
        )
    if time_before_first_diagnosis is None:
        time_before_first_diagnosis = (
            system_paasta_config.get_mark_for_deployment_default_time_before_first_diagnosis()
        )

    with progressbar.ProgressBar(maxval=total_instances) as bar:
        instance_done_futures = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # One watcher task per (cluster, instance); each polls on the
            # shared thread pool.
            for cluster, instance_configs in instance_configs_per_cluster.items():
                for instance_config in instance_configs:
                    instance_done_futures.append(
                        asyncio.ensure_future(
                            wait_until_instance_is_done(
                                executor,
                                service,
                                instance_config.get_instance(),
                                cluster,
                                target_version,
                                instance_config,
                                polling_interval=polling_interval,
                                diagnosis_interval=diagnosis_interval,
                                time_before_first_diagnosis=time_before_first_diagnosis,
                                should_ping_for_unhealthy_pods=instance_config.get_should_ping_for_unhealthy_pods(
                                    system_paasta_config.get_mark_for_deployment_should_ping_for_unhealthy_pods()
                                ),
                                notify_fn=notify_fn,
                            ),
                        )
                    )

            remaining_instances: Dict[str, Set[str]] = {
                cluster: {ic.get_instance() for ic in instance_configs}
                for cluster, instance_configs in instance_configs_per_cluster.items()
            }
            finished_instances = 0

            async def periodically_update_progressbar() -> None:
                # Keep the progress bar fresh even when no instance finishes
                # for a while (e.g. in CI logs).
                while True:
                    await asyncio.sleep(60)
                    bar.update(finished_instances)
                    print()

            periodically_update_progressbar_task = asyncio.create_task(
                periodically_update_progressbar()
            )

            try:
                for coro in asyncio.as_completed(
                    instance_done_futures, timeout=timeout
                ):
                    cluster, instance = await coro
                    finished_instances += 1
                    bar.update(finished_instances)
                    # NOTE(review): remaining_instances is only pruned when a
                    # progress object is supplied — it is only consumed via
                    # progress.waiting_on (and the timeout message) below.
                    if progress is not None:
                        progress.percent = bar.percentage
                        remaining_instances[cluster].remove(instance)
                        progress.waiting_on = remaining_instances
            except asyncio.TimeoutError:
                _log(
                    service=service,
                    component="deploy",
                    line=compose_timeout_message(
                        remaining_instances,
                        timeout,
                        deploy_group,
                        service,
                        target_version,
                    ),
                    level="event",
                )
                raise TimeoutError
            except asyncio.CancelledError:
                # Wait for all the tasks to finish before closing out the ThreadPoolExecutor, to avoid RuntimeError('cannot schedule new futures after shutdown')
                for coro in instance_done_futures:
                    coro.cancel()
                    try:
                        await coro
                    except asyncio.CancelledError:
                        pass
                raise
            else:
                sys.stdout.flush()
                if progress is not None:
                    progress.percent = 100.0
                    progress.waiting_on = None
                return 0
            finally:
                periodically_update_progressbar_task.cancel()
|
|
1929
|
+
|
|
1930
|
+
|
|
1931
|
+
def compose_timeout_message(
    remaining_instances: Mapping[str, Collection[str]],
    timeout: float,
    deploy_group: str,
    service: str,
    version: DeploymentVersion,
) -> str:
    """Build the operator-facing message printed when wait_for_deployment times out.

    Includes ready-to-copy ``paasta status`` and ``paasta logs`` commands for
    every cluster that still has undeployed instances, a pointer to the stuck
    bounce runbook, and the exact ``paasta wait-for-deployment`` invocation to
    keep waiting.

    :param remaining_instances: cluster -> instances still not done deploying
    :param timeout: the timeout (seconds) that was exceeded
    :param deploy_group: deploy group being waited on
    :param service: service being deployed
    :param version: target deployment version (sha + optional image version)
    :returns: multi-line human-readable message
    """
    status_cmds = []
    log_cmds = []
    # Iterate clusters in a deterministic order; clusters with nothing left
    # to wait on are skipped entirely.
    for cluster, instances in sorted(remaining_instances.items()):
        if not instances:
            continue
        instance_csv = ",".join(instances)
        status_cmds.append(
            f"paasta status -c {cluster} -s {service} -i {instance_csv}"
        )
        log_cmds.append(
            f"paasta logs -c {cluster} -s {service} -i {instance_csv} -C deploy -l 1000"
        )

    # Only mention --image-version when the target version actually has one.
    image_arg = (
        f" --image-version {version.image_version}" if version.image_version else ""
    )
    runbook = os.environ.get("STUCK_BOUNCE_RUNBOOK", DEFAULT_STUCK_BOUNCE_RUNBOOK)

    return (
        "\n\nTimed out after {timeout} seconds, waiting for {service} "
        "in {deploy_group} to be deployed by PaaSTA.\n"
        "This probably means the deploy hasn't succeeded. The new service "
        "might not be healthy or one or more clusters could be having issues.\n\n"
        "To debug, follow steps in {stuck_bounce_runbook}, "
        "or try running the following to see the status of instances we tried to deploy:\n\n"
        "  {status_commands}\n\n  {logs_commands}"
        "\n\nIf the service is known to be slow to start you may wish to "
        "increase the timeout on this step.\n"
        "To wait a little longer run:\n\n"
        "  paasta wait-for-deployment -s {service} -l {deploy_group} -c {git_sha}{image_arg}".format(
            timeout=timeout,
            deploy_group=deploy_group,
            service=service,
            git_sha=version.sha,
            image_arg=image_arg,
            status_commands="\n  ".join(status_cmds),
            logs_commands="\n  ".join(log_cmds),
            stuck_bounce_runbook=runbook,
        )
    )
|
|
1981
|
+
|
|
1982
|
+
|
|
1983
|
+
class NoSuchCluster(Exception):
    """Raised by wait_for_deployment() when a service carries a kubernetes
    config for a cluster that does not appear in /etc/paasta/api_endpoints.json.
    """
|