paasta_tools-1.21.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348)
  1. k8s_itests/__init__.py +0 -0
  2. k8s_itests/test_autoscaling.py +23 -0
  3. k8s_itests/utils.py +38 -0
  4. paasta_tools/__init__.py +20 -0
  5. paasta_tools/adhoc_tools.py +142 -0
  6. paasta_tools/api/__init__.py +13 -0
  7. paasta_tools/api/api.py +330 -0
  8. paasta_tools/api/api_docs/swagger.json +2323 -0
  9. paasta_tools/api/client.py +106 -0
  10. paasta_tools/api/settings.py +33 -0
  11. paasta_tools/api/tweens/__init__.py +6 -0
  12. paasta_tools/api/tweens/auth.py +125 -0
  13. paasta_tools/api/tweens/profiling.py +108 -0
  14. paasta_tools/api/tweens/request_logger.py +124 -0
  15. paasta_tools/api/views/__init__.py +13 -0
  16. paasta_tools/api/views/autoscaler.py +100 -0
  17. paasta_tools/api/views/exception.py +45 -0
  18. paasta_tools/api/views/flink.py +73 -0
  19. paasta_tools/api/views/instance.py +395 -0
  20. paasta_tools/api/views/pause_autoscaler.py +71 -0
  21. paasta_tools/api/views/remote_run.py +113 -0
  22. paasta_tools/api/views/resources.py +76 -0
  23. paasta_tools/api/views/service.py +35 -0
  24. paasta_tools/api/views/version.py +25 -0
  25. paasta_tools/apply_external_resources.py +79 -0
  26. paasta_tools/async_utils.py +109 -0
  27. paasta_tools/autoscaling/__init__.py +0 -0
  28. paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
  29. paasta_tools/autoscaling/forecasting.py +106 -0
  30. paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
  31. paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
  32. paasta_tools/autoscaling/utils.py +52 -0
  33. paasta_tools/bounce_lib.py +184 -0
  34. paasta_tools/broadcast_log_to_services.py +62 -0
  35. paasta_tools/cassandracluster_tools.py +210 -0
  36. paasta_tools/check_autoscaler_max_instances.py +212 -0
  37. paasta_tools/check_cassandracluster_services_replication.py +35 -0
  38. paasta_tools/check_flink_services_health.py +203 -0
  39. paasta_tools/check_kubernetes_api.py +57 -0
  40. paasta_tools/check_kubernetes_services_replication.py +141 -0
  41. paasta_tools/check_oom_events.py +244 -0
  42. paasta_tools/check_services_replication_tools.py +324 -0
  43. paasta_tools/check_spark_jobs.py +234 -0
  44. paasta_tools/cleanup_kubernetes_cr.py +138 -0
  45. paasta_tools/cleanup_kubernetes_crd.py +145 -0
  46. paasta_tools/cleanup_kubernetes_jobs.py +344 -0
  47. paasta_tools/cleanup_tron_namespaces.py +96 -0
  48. paasta_tools/cli/__init__.py +13 -0
  49. paasta_tools/cli/authentication.py +85 -0
  50. paasta_tools/cli/cli.py +260 -0
  51. paasta_tools/cli/cmds/__init__.py +13 -0
  52. paasta_tools/cli/cmds/autoscale.py +143 -0
  53. paasta_tools/cli/cmds/check.py +334 -0
  54. paasta_tools/cli/cmds/cook_image.py +147 -0
  55. paasta_tools/cli/cmds/get_docker_image.py +76 -0
  56. paasta_tools/cli/cmds/get_image_version.py +172 -0
  57. paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
  58. paasta_tools/cli/cmds/info.py +155 -0
  59. paasta_tools/cli/cmds/itest.py +117 -0
  60. paasta_tools/cli/cmds/list.py +66 -0
  61. paasta_tools/cli/cmds/list_clusters.py +42 -0
  62. paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
  63. paasta_tools/cli/cmds/list_namespaces.py +84 -0
  64. paasta_tools/cli/cmds/local_run.py +1396 -0
  65. paasta_tools/cli/cmds/logs.py +1601 -0
  66. paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
  67. paasta_tools/cli/cmds/mesh_status.py +174 -0
  68. paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
  69. paasta_tools/cli/cmds/push_to_registry.py +275 -0
  70. paasta_tools/cli/cmds/remote_run.py +252 -0
  71. paasta_tools/cli/cmds/rollback.py +347 -0
  72. paasta_tools/cli/cmds/secret.py +549 -0
  73. paasta_tools/cli/cmds/security_check.py +59 -0
  74. paasta_tools/cli/cmds/spark_run.py +1400 -0
  75. paasta_tools/cli/cmds/start_stop_restart.py +401 -0
  76. paasta_tools/cli/cmds/status.py +2302 -0
  77. paasta_tools/cli/cmds/validate.py +1012 -0
  78. paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
  79. paasta_tools/cli/fsm/__init__.py +13 -0
  80. paasta_tools/cli/fsm/autosuggest.py +82 -0
  81. paasta_tools/cli/fsm/template/README.md +8 -0
  82. paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
  83. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
  84. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
  85. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
  86. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
  87. paasta_tools/cli/fsm_cmd.py +121 -0
  88. paasta_tools/cli/paasta_tabcomplete.sh +23 -0
  89. paasta_tools/cli/schemas/adhoc_schema.json +199 -0
  90. paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
  91. paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
  92. paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
  93. paasta_tools/cli/schemas/deploy_schema.json +173 -0
  94. paasta_tools/cli/schemas/eks_schema.json +970 -0
  95. paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
  96. paasta_tools/cli/schemas/rollback_schema.json +160 -0
  97. paasta_tools/cli/schemas/service_schema.json +25 -0
  98. paasta_tools/cli/schemas/smartstack_schema.json +322 -0
  99. paasta_tools/cli/schemas/tron_schema.json +699 -0
  100. paasta_tools/cli/utils.py +1118 -0
  101. paasta_tools/clusterman.py +21 -0
  102. paasta_tools/config_utils.py +385 -0
  103. paasta_tools/contrib/__init__.py +0 -0
  104. paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
  105. paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
  106. paasta_tools/contrib/check_orphans.py +306 -0
  107. paasta_tools/contrib/create_dynamodb_table.py +35 -0
  108. paasta_tools/contrib/create_paasta_playground.py +105 -0
  109. paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
  110. paasta_tools/contrib/get_running_task_allocation.py +346 -0
  111. paasta_tools/contrib/habitat_fixer.py +86 -0
  112. paasta_tools/contrib/ide_helper.py +316 -0
  113. paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
  114. paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
  115. paasta_tools/contrib/kill_bad_containers.py +109 -0
  116. paasta_tools/contrib/mass-deploy-tag.sh +44 -0
  117. paasta_tools/contrib/mock_patch_checker.py +86 -0
  118. paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
  119. paasta_tools/contrib/render_template.py +129 -0
  120. paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
  121. paasta_tools/contrib/service_shard_remove.py +157 -0
  122. paasta_tools/contrib/service_shard_update.py +373 -0
  123. paasta_tools/contrib/shared_ip_check.py +77 -0
  124. paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
  125. paasta_tools/delete_kubernetes_deployments.py +89 -0
  126. paasta_tools/deployment_utils.py +44 -0
  127. paasta_tools/docker_wrapper.py +234 -0
  128. paasta_tools/docker_wrapper_imports.py +13 -0
  129. paasta_tools/drain_lib.py +351 -0
  130. paasta_tools/dump_locally_running_services.py +71 -0
  131. paasta_tools/eks_tools.py +119 -0
  132. paasta_tools/envoy_tools.py +373 -0
  133. paasta_tools/firewall.py +504 -0
  134. paasta_tools/firewall_logging.py +154 -0
  135. paasta_tools/firewall_update.py +172 -0
  136. paasta_tools/flink_tools.py +345 -0
  137. paasta_tools/flinkeks_tools.py +90 -0
  138. paasta_tools/frameworks/__init__.py +0 -0
  139. paasta_tools/frameworks/adhoc_scheduler.py +71 -0
  140. paasta_tools/frameworks/constraints.py +87 -0
  141. paasta_tools/frameworks/native_scheduler.py +652 -0
  142. paasta_tools/frameworks/native_service_config.py +301 -0
  143. paasta_tools/frameworks/task_store.py +245 -0
  144. paasta_tools/generate_all_deployments +9 -0
  145. paasta_tools/generate_authenticating_services.py +94 -0
  146. paasta_tools/generate_deployments_for_service.py +255 -0
  147. paasta_tools/generate_services_file.py +114 -0
  148. paasta_tools/generate_services_yaml.py +30 -0
  149. paasta_tools/hacheck.py +76 -0
  150. paasta_tools/instance/__init__.py +0 -0
  151. paasta_tools/instance/hpa_metrics_parser.py +122 -0
  152. paasta_tools/instance/kubernetes.py +1362 -0
  153. paasta_tools/iptables.py +240 -0
  154. paasta_tools/kafkacluster_tools.py +143 -0
  155. paasta_tools/kubernetes/__init__.py +0 -0
  156. paasta_tools/kubernetes/application/__init__.py +0 -0
  157. paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
  158. paasta_tools/kubernetes/application/tools.py +90 -0
  159. paasta_tools/kubernetes/bin/__init__.py +0 -0
  160. paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
  161. paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
  162. paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
  163. paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
  164. paasta_tools/kubernetes/remote_run.py +558 -0
  165. paasta_tools/kubernetes_tools.py +4679 -0
  166. paasta_tools/list_kubernetes_service_instances.py +128 -0
  167. paasta_tools/list_tron_namespaces.py +60 -0
  168. paasta_tools/long_running_service_tools.py +678 -0
  169. paasta_tools/mac_address.py +44 -0
  170. paasta_tools/marathon_dashboard.py +0 -0
  171. paasta_tools/mesos/__init__.py +0 -0
  172. paasta_tools/mesos/cfg.py +46 -0
  173. paasta_tools/mesos/cluster.py +60 -0
  174. paasta_tools/mesos/exceptions.py +59 -0
  175. paasta_tools/mesos/framework.py +77 -0
  176. paasta_tools/mesos/log.py +48 -0
  177. paasta_tools/mesos/master.py +306 -0
  178. paasta_tools/mesos/mesos_file.py +169 -0
  179. paasta_tools/mesos/parallel.py +52 -0
  180. paasta_tools/mesos/slave.py +115 -0
  181. paasta_tools/mesos/task.py +94 -0
  182. paasta_tools/mesos/util.py +69 -0
  183. paasta_tools/mesos/zookeeper.py +37 -0
  184. paasta_tools/mesos_maintenance.py +848 -0
  185. paasta_tools/mesos_tools.py +1051 -0
  186. paasta_tools/metrics/__init__.py +0 -0
  187. paasta_tools/metrics/metastatus_lib.py +1110 -0
  188. paasta_tools/metrics/metrics_lib.py +217 -0
  189. paasta_tools/monitoring/__init__.py +13 -0
  190. paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
  191. paasta_tools/monitoring_tools.py +652 -0
  192. paasta_tools/monkrelaycluster_tools.py +146 -0
  193. paasta_tools/nrtsearchservice_tools.py +143 -0
  194. paasta_tools/nrtsearchserviceeks_tools.py +68 -0
  195. paasta_tools/oom_logger.py +321 -0
  196. paasta_tools/paasta_deploy_tron_jobs +3 -0
  197. paasta_tools/paasta_execute_docker_command.py +123 -0
  198. paasta_tools/paasta_native_serviceinit.py +21 -0
  199. paasta_tools/paasta_service_config_loader.py +201 -0
  200. paasta_tools/paastaapi/__init__.py +29 -0
  201. paasta_tools/paastaapi/api/__init__.py +3 -0
  202. paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
  203. paasta_tools/paastaapi/api/default_api.py +569 -0
  204. paasta_tools/paastaapi/api/remote_run_api.py +604 -0
  205. paasta_tools/paastaapi/api/resources_api.py +157 -0
  206. paasta_tools/paastaapi/api/service_api.py +1736 -0
  207. paasta_tools/paastaapi/api_client.py +818 -0
  208. paasta_tools/paastaapi/apis/__init__.py +22 -0
  209. paasta_tools/paastaapi/configuration.py +455 -0
  210. paasta_tools/paastaapi/exceptions.py +137 -0
  211. paasta_tools/paastaapi/model/__init__.py +5 -0
  212. paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
  213. paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
  214. paasta_tools/paastaapi/model/deploy_queue.py +178 -0
  215. paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
  216. paasta_tools/paastaapi/model/envoy_backend.py +185 -0
  217. paasta_tools/paastaapi/model/envoy_location.py +184 -0
  218. paasta_tools/paastaapi/model/envoy_status.py +181 -0
  219. paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
  220. paasta_tools/paastaapi/model/flink_config.py +173 -0
  221. paasta_tools/paastaapi/model/flink_job.py +186 -0
  222. paasta_tools/paastaapi/model/flink_job_details.py +192 -0
  223. paasta_tools/paastaapi/model/flink_jobs.py +175 -0
  224. paasta_tools/paastaapi/model/float_and_error.py +173 -0
  225. paasta_tools/paastaapi/model/hpa_metric.py +176 -0
  226. paasta_tools/paastaapi/model/inline_object.py +170 -0
  227. paasta_tools/paastaapi/model/inline_response200.py +170 -0
  228. paasta_tools/paastaapi/model/inline_response2001.py +170 -0
  229. paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
  230. paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
  231. paasta_tools/paastaapi/model/instance_status.py +220 -0
  232. paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
  233. paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
  234. paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
  235. paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
  236. paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
  237. paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
  238. paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
  239. paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
  240. paasta_tools/paastaapi/model/instance_tasks.py +182 -0
  241. paasta_tools/paastaapi/model/integer_and_error.py +173 -0
  242. paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
  243. paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
  244. paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
  245. paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
  246. paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
  247. paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
  248. paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
  249. paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
  250. paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
  251. paasta_tools/paastaapi/model/remote_run_start.py +185 -0
  252. paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
  253. paasta_tools/paastaapi/model/remote_run_token.py +173 -0
  254. paasta_tools/paastaapi/model/resource.py +187 -0
  255. paasta_tools/paastaapi/model/resource_item.py +187 -0
  256. paasta_tools/paastaapi/model/resource_value.py +176 -0
  257. paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
  258. paasta_tools/paastaapi/model/smartstack_location.py +181 -0
  259. paasta_tools/paastaapi/model/smartstack_status.py +181 -0
  260. paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
  261. paasta_tools/paastaapi/model_utils.py +1879 -0
  262. paasta_tools/paastaapi/models/__init__.py +62 -0
  263. paasta_tools/paastaapi/rest.py +287 -0
  264. paasta_tools/prune_completed_pods.py +220 -0
  265. paasta_tools/puppet_service_tools.py +59 -0
  266. paasta_tools/py.typed +1 -0
  267. paasta_tools/remote_git.py +127 -0
  268. paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
  269. paasta_tools/run-paasta-api-playground.py +51 -0
  270. paasta_tools/secret_providers/__init__.py +66 -0
  271. paasta_tools/secret_providers/vault.py +214 -0
  272. paasta_tools/secret_tools.py +277 -0
  273. paasta_tools/setup_istio_mesh.py +353 -0
  274. paasta_tools/setup_kubernetes_cr.py +412 -0
  275. paasta_tools/setup_kubernetes_crd.py +138 -0
  276. paasta_tools/setup_kubernetes_internal_crd.py +154 -0
  277. paasta_tools/setup_kubernetes_job.py +353 -0
  278. paasta_tools/setup_prometheus_adapter_config.py +1028 -0
  279. paasta_tools/setup_tron_namespace.py +248 -0
  280. paasta_tools/slack.py +75 -0
  281. paasta_tools/smartstack_tools.py +676 -0
  282. paasta_tools/spark_tools.py +283 -0
  283. paasta_tools/synapse_srv_namespaces_fact.py +42 -0
  284. paasta_tools/tron/__init__.py +0 -0
  285. paasta_tools/tron/client.py +158 -0
  286. paasta_tools/tron/tron_command_context.py +194 -0
  287. paasta_tools/tron/tron_timeutils.py +101 -0
  288. paasta_tools/tron_tools.py +1448 -0
  289. paasta_tools/utils.py +4307 -0
  290. paasta_tools/yaml_tools.py +44 -0
  291. paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
  292. paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
  293. paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
  294. paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
  295. paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
  296. paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
  297. paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
  298. paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
  299. paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
  300. paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
  301. paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
  302. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
  303. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
  304. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
  305. paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
  306. paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
  307. paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
  308. paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
  309. paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
  310. paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
  311. paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
  312. paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
  313. paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
  314. paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
  315. paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
  316. paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
  317. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
  318. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
  319. paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
  320. paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
  321. paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
  322. paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
  323. paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
  324. paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
  325. paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
  326. paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
  327. paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
  328. paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
  329. paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
  330. paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
  331. paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
  332. paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
  333. paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
  334. paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
  335. paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
  336. paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
  337. paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
  338. paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
  339. paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
  340. paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
  341. paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
  342. paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
  343. paasta_tools-1.21.3.dist-info/LICENSE +201 -0
  344. paasta_tools-1.21.3.dist-info/METADATA +74 -0
  345. paasta_tools-1.21.3.dist-info/RECORD +348 -0
  346. paasta_tools-1.21.3.dist-info/WHEEL +5 -0
  347. paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
  348. paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1988 @@
+ #!/usr/bin/env python
+ # Copyright 2015-2016 Yelp Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains methods used by the paasta client to mark a docker image for
+ deployment to a cluster.instance.
+ """
+ import argparse
+ import asyncio
+ import concurrent
+ import datetime
+ import functools
+ import getpass
+ import logging
+ import math
+ import os
+ import socket
+ import sys
+ import time
+ import traceback
+ from threading import Thread
+ from typing import Any
+ from typing import Callable
+ from typing import Collection
+ from typing import Dict
+ from typing import Iterator
+ from typing import List
+ from typing import Mapping
+ from typing import Optional
+ from typing import Set
+ from typing import Tuple
+
+ import a_sync
+ import humanize
+ import progressbar
+ from service_configuration_lib import read_deploy
+ from slackclient import SlackClient
+ from sticht import state_machine
+ from sticht.rollbacks.base import RollbackSlackDeploymentProcess
+ from sticht.rollbacks.slo import SLOWatcher
+ from sticht.rollbacks.types import MetricWatcher
+ from sticht.rollbacks.types import SplunkAuth
+
+ from paasta_tools import remote_git
+ from paasta_tools.api import client
+ from paasta_tools.cassandracluster_tools import CassandraClusterDeploymentConfig
+ from paasta_tools.cli.cmds.push_to_registry import is_docker_image_already_in_registry
+ from paasta_tools.cli.cmds.status import get_main_container
+ from paasta_tools.cli.cmds.status import get_version_table_entry
+ from paasta_tools.cli.cmds.status import recent_container_restart
+ from paasta_tools.cli.utils import get_jenkins_build_output_url
+ from paasta_tools.cli.utils import get_paasta_oapi_api_clustername
+ from paasta_tools.cli.utils import lazy_choices_completer
+ from paasta_tools.cli.utils import list_deploy_groups
+ from paasta_tools.cli.utils import trigger_deploys
+ from paasta_tools.cli.utils import validate_git_sha
+ from paasta_tools.cli.utils import validate_given_deploy_groups
+ from paasta_tools.cli.utils import validate_service_name
+ from paasta_tools.cli.utils import validate_short_git_sha
+ from paasta_tools.deployment_utils import get_currently_deployed_sha
+ from paasta_tools.deployment_utils import get_currently_deployed_version
+ from paasta_tools.eks_tools import EksDeploymentConfig
+ from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
+ from paasta_tools.long_running_service_tools import LongRunningServiceConfig
+ from paasta_tools.metrics import metrics_lib
+ from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
+ from paasta_tools.paastaapi.models import InstanceStatusKubernetesV2
+ from paasta_tools.paastaapi.models import KubernetesPodV2
+ from paasta_tools.slack import get_slack_client
+ from paasta_tools.utils import _log
+ from paasta_tools.utils import _log_audit
+ from paasta_tools.utils import DEFAULT_SOA_DIR
+ from paasta_tools.utils import DeploymentVersion
+ from paasta_tools.utils import format_tag
+ from paasta_tools.utils import get_files_of_type_in_dir
+ from paasta_tools.utils import get_git_url
+ from paasta_tools.utils import get_paasta_tag_from_deploy_group
+ from paasta_tools.utils import get_username
+ from paasta_tools.utils import ldap_user_search
+ from paasta_tools.utils import list_services
+ from paasta_tools.utils import load_system_paasta_config
+ from paasta_tools.utils import PaastaColors
+ from paasta_tools.utils import RollbackTypes
+ from paasta_tools.utils import TimeoutError
+
+ DEFAULT_DEPLOYMENT_TIMEOUT = 3 * 3600 # seconds
+ DEFAULT_WARN_PERCENT = 17 # ~30min for default timeout
+ DEFAULT_AUTO_CERTIFY_DELAY = 600 # seconds
+ DEFAULT_SLACK_CHANNEL = "#deploy"
+ DEFAULT_STUCK_BOUNCE_RUNBOOK = "y/stuckbounce"
+
+
+ log = logging.getLogger(__name__)
+
+
+ def add_subparser(subparsers: argparse._SubParsersAction) -> None:
+     list_parser = subparsers.add_parser(
+         "mark-for-deployment",
+         help="Mark a docker image for deployment in git",
+         description=(
+             "'paasta mark-for-deployment' uses Git as the control-plane, to "
+             "signal to other PaaSTA components that a particular docker image "
+             "is ready to be deployed."
+         ),
+         epilog=(
+             "Note: Access and credentials to the Git repo of a service are required "
+             "for this command to work."
+         ),
+     )
+     list_parser.add_argument(
+         "-u",
+         "--git-url",
+         help=(
+             "Git url for service -- where magic mark-for-deployment tags are pushed. "
+             "Defaults to the normal git URL for the service."
+         ),
+         default=None,
+     )
+     list_parser.add_argument(
+         "-c",
+         "-k",
+         "--commit",
+         help="Git sha to mark for deployment",
+         required=True,
+         type=validate_short_git_sha,
+     )
+     list_parser.add_argument(
+         "-i",
+         "--image-version",
+         help="Extra version metadata to mark for deployment",
+         required=False,
+         default=None,
+     )
+     arg_deploy_group = list_parser.add_argument(
+         "-l",
+         "--deploy-group",
+         "--clusterinstance",
+         help="Mark the service ready for deployment in this deploy group (e.g. "
+         "cluster1.canary, cluster2.main). --clusterinstance is deprecated and "
+         "should be replaced with --deploy-group",
+         required=True,
+     )
+     arg_deploy_group.completer = lazy_choices_completer(list_deploy_groups) # type: ignore
+     arg_service = list_parser.add_argument(
+         "-s",
+         "--service",
+         help="Name of the service which you wish to mark for deployment. Leading "
+         '"services-" will be stripped.',
+         required=True,
+     )
+     arg_service.completer = lazy_choices_completer(list_services) # type: ignore
+     list_parser.add_argument(
+         "--verify-image-exists",
+         help="Check the docker registry and verify the image has been pushed",
+         dest="verify_image",
+         action="store_true",
+         default=False,
+     )
+     list_parser.add_argument(
+         "--wait-for-deployment",
+         help="Set to poll paasta and wait for the deployment to finish, "
+         "the default strategy is to mark for deployment and exit straightaway",
+         dest="block",
+         action="store_true",
+         default=False,
+     )
+     list_parser.add_argument(
+         "-t",
+         "--timeout",
+         dest="timeout",
+         type=int,
+         default=DEFAULT_DEPLOYMENT_TIMEOUT,
+         help=(
+             "Time in seconds to wait for paasta to deploy the service. "
+             "If the timeout is exceeded we return 1. "
+             "Default is %(default)s seconds."
+         ),
+     )
+     list_parser.add_argument(
+         "-w",
+         "--warn",
+         dest="warn",
+         type=int,
+         default=DEFAULT_WARN_PERCENT,
+         help=(
+             "Percent of timeout to warn at if the deployment hasn't finished. "
+             "For example, --warn=75 will warn at 75%% of the timeout. "
+             "Defaults to %(default)s."
+         ),
+     )
+     list_parser.add_argument(
+         "--auto-rollback",
+         help="Automatically roll back to the previously deployed sha if the deployment "
+         "times out or is canceled (ctrl-c). Only applicable with --wait-for-deployment. "
+         "Defaults to false.",
+         dest="auto_rollback",
+         action="store_true",
+         default=False,
+     )
+     list_parser.add_argument(
+         "-d",
+         "--soa-dir",
+         dest="soa_dir",
+         metavar="SOA_DIR",
+         default=DEFAULT_SOA_DIR,
+         help="define a different soa config directory",
+     )
+     list_parser.add_argument(
+         "-v",
+         "--verbose",
+         action="count",
+         dest="verbose",
+         default=0,
+         help="Print out more output.",
+     )
+     list_parser.add_argument(
+         "--auto-certify-delay",
+         dest="auto_certify_delay",
+         type=int,
+         default=None, # the logic for this is complicated. See MarkForDeploymentProcess.get_auto_certify_delay.
+         help="After a deploy finishes, wait this many seconds before automatically certifying."
+         f"Default {DEFAULT_AUTO_CERTIFY_DELAY} when --auto-rollback is enabled",
+     )
+     list_parser.add_argument(
+         "--auto-abandon-delay",
+         dest="auto_abandon_delay",
+         type=int,
+         default=600,
+         help="After a rollback finishes, wait this many seconds before automatically abandoning.",
+     )
+     list_parser.add_argument(
+         "--auto-rollback-delay",
+         dest="auto_rollback_delay",
+         type=int,
+         default=30,
+         help="After noticing an SLO failure, wait this many seconds before automatically rolling back.",
+     )
+     list_parser.add_argument(
+         "--author",
+         dest="authors",
+         default=None,
+         action="append",
+         help="Additional author(s) of the deploy, who will be pinged in Slack",
+     )
+     list_parser.add_argument(
+         "--polling-interval",
+         dest="polling_interval",
+         type=float,
+         default=None,
+         help="How long to wait between each time we check to see if an instance is done deploying.",
+     )
+     list_parser.add_argument(
+         "--diagnosis-interval",
+         dest="diagnosis_interval",
+         type=float,
+         default=None,
+         help="How long to wait between diagnoses of why the bounce isn't done.",
+     )
+     list_parser.add_argument(
+         "--time-before-first-diagnosis",
+         dest="time_before_first_diagnosis",
+         type=float,
+         default=None,
+         help="Wait this long before trying to diagnose why the bounce isn't done.",
+     )
+
+     list_parser.set_defaults(command=paasta_mark_for_deployment)
+
+
+ def mark_for_deployment(
+     git_url: str,
+     deploy_group: str,
+     service: str,
+     commit: str,
+     image_version: Optional[str] = None,
+ ) -> int:
+     """Mark a docker image for deployment"""
+     tag = get_paasta_tag_from_deploy_group(
+         identifier=deploy_group, desired_state="deploy", image_version=image_version
+     )
+     remote_tag = format_tag(tag)
+     ref_mutator = remote_git.make_force_push_mutate_refs_func(
+         targets=[remote_tag], sha=commit
+     )
+
+     deployment_version = DeploymentVersion(commit, image_version)
+     max_attempts = 3
+     for attempt in range(1, max_attempts + 1):
+         try:
+             remote_git.create_remote_refs(
+                 git_url=git_url, ref_mutator=ref_mutator, force=True
+             )
+             if "yelpcorp.com" in git_url:
+                 trigger_deploys(service)
+         except Exception as e:
+             logline = f"Failed to mark {deployment_version} for deployment in deploy group {deploy_group}! (attempt \
+ {attempt}/{max_attempts}, error: {e}) \n Have you pushed your commit?"
+             _log(service=service, line=logline, component="deploy", level="event")
+             time.sleep(5 * attempt)
+         else:
+             logline = f"Marked {deployment_version} for deployment in deploy group {deploy_group}"
+             _log(service=service, line=logline, component="deploy", level="event")
+
+             audit_action_details = {
+                 "deploy_group": deploy_group,
+                 "commit": commit,
+                 "image_version": image_version,
+             }
+             _log_audit(
+                 action="mark-for-deployment",
+                 action_details=audit_action_details,
+                 service=service,
+             )
+
+             return 0
+     return 1
+
+
+ def can_user_deploy_service(deploy_info: Dict[str, Any], service: str) -> bool:
+     deploy_username = get_username()
+
+     # Tronjobs can run paasta stop/start/restart
+     ssh_client_env = os.environ.get("SSH_CLIENT")
+     if ssh_client_env and deploy_username == "batch":
+         ssh_client = ssh_client_env.split()[0]
+         hostname = socket.gethostbyaddr(ssh_client)[0]
+
+         if "tron" in hostname:
+             return True
+
+     system_paasta_config = load_system_paasta_config()
+     allowed_groups = (
+         deploy_info["allowed_push_groups"]
+         if deploy_info.get("allowed_push_groups") is not None
+         else system_paasta_config.get_default_push_groups()
+     )
+     if allowed_groups is not None:
+         search_base = system_paasta_config.get_ldap_search_base()
+         search_ou = system_paasta_config.get_ldap_search_ou()
+         host = system_paasta_config.get_ldap_host()
+         ldap_username = system_paasta_config.get_ldap_reader_username()
+         ldap_password = system_paasta_config.get_ldap_reader_password()
+         if not any(
+             [
+                 deploy_username
+                 in ldap_user_search(
+                     group, search_base, search_ou, host, ldap_username, ldap_password
+                 )
+                 for group in allowed_groups
+             ]
+         ):
+             logline = f"current user is not authorized to perform this action (should be in one of {allowed_groups})"
+             _log(service=service, line=logline, component="deploy", level="event")
+             print(logline, file=sys.stderr)
+             return False
+     return True
+
+
+ def can_run_metric_watcher_threads(
+     service: str,
+     soa_dir: str,
+ ) -> bool:
+     """
+     Cannot run slo and metric watcher threads together for now.
+     SLO Watcher Threads take precedence over metric watcher threads.
+     Metric Watcher Threads can run if there are no SLOs available.
+     """
+     slo_files = get_files_of_type_in_dir(
+         file_type="slo", service=service, soa_dir=soa_dir
+     )
+     rollback_files = get_files_of_type_in_dir(
+         file_type="rollback", service=service, soa_dir=soa_dir
+     )
+     return bool(not slo_files and rollback_files)
+
+
+ def report_waiting_aborted(service: str, deploy_group: str) -> None:
+     print(
+         PaastaColors.red(
+             "Waiting for deployment aborted."
+             " PaaSTA will continue trying to deploy this code."
+         )
+     )
+     print("If you wish to see the status, run:")
+     print()
+     print(f" paasta status -s {service} -l {deploy_group} -v")
+     print()
+
+
+ def get_authors_to_be_notified(
+     git_url: str, from_sha: str, to_sha: str, authors: Optional[Collection[str]]
+ ) -> str:
+     if from_sha is None:
+         return ""
+
+     if authors:
+         authors_to_notify = authors
+     elif "git.yelpcorp.com" in git_url:
+         ret, git_authors = remote_git.get_authors(
+             git_url=git_url, from_sha=from_sha, to_sha=to_sha
+         )
+         if ret == 0:
+             authors_to_notify = git_authors.split()
+         else:
+             return f"(Could not get authors: {git_authors})"
+     else:
+         # We have no way of getting authors on the fly if the repository is not on gitolite
+         return ""
+
+     slacky_authors = ", ".join({f"<@{a}>" for a in authors_to_notify})
+     log.debug(f"Authors: {slacky_authors}")
+     return f"^ {slacky_authors}"
+
+
+ def deploy_group_is_set_to_notify(
+     deploy_info: Dict[str, Any], deploy_group: str, notify_type: str
+ ) -> bool:
+     for step in deploy_info.get("pipeline", []):
+         if step.get("step", "") == deploy_group:
+             # Use the specific notify_type if available else use slack_notify
+             return step.get(notify_type, step.get("slack_notify", False))
+     return False
+
+
+ def get_deploy_info(service: str, soa_dir: str) -> Dict[str, Any]:
+     file_path = os.path.join(soa_dir, service, "deploy.yaml")
+     return read_deploy(file_path)
+
+
+ def paasta_mark_for_deployment(args: argparse.Namespace) -> int:
+     """Wrapping mark_for_deployment"""
+     if args.verbose:
+         log.setLevel(level=logging.DEBUG)
+     else:
+         log.setLevel(level=logging.INFO)
+
+     service = args.service
+     if service and service.startswith("services-"):
+         service = service.split("services-", 1)[1]
+     validate_service_name(service, soa_dir=args.soa_dir)
+
+     deploy_group = args.deploy_group
+     in_use_deploy_groups = list_deploy_groups(service=service, soa_dir=args.soa_dir)
+     _, invalid_deploy_groups = validate_given_deploy_groups(
+         in_use_deploy_groups, [deploy_group]
+     )
+
+     if len(invalid_deploy_groups) == 1:
+         print(
+             PaastaColors.red(
+                 "ERROR: These deploy groups are not currently used anywhere: %s.\n"
+                 % (",").join(invalid_deploy_groups)
+             )
+         )
+         print(
+             PaastaColors.red(
+                 "This isn't technically wrong because you can mark-for-deployment before deploying there"
+             )
+         )
+         print(
+             PaastaColors.red(
+                 "but this is probably a typo. Did you mean one of these in-use deploy groups?:"
+             )
+         )
+         print(PaastaColors.red(" %s" % (",").join(in_use_deploy_groups)))
+         print()
+         print(PaastaColors.red("Continuing regardless..."))
+
+     if args.git_url is None:
+         args.git_url = get_git_url(service=service, soa_dir=args.soa_dir)
+
+     commit = validate_git_sha(sha=args.commit, git_url=args.git_url)
+     deployment_version = DeploymentVersion(commit, args.image_version)
+
+     old_deployment_version = get_currently_deployed_version(
+         service=service, deploy_group=deploy_group
+     )
+     if deployment_version == old_deployment_version:
+         print(
+             "Warning: The image asked to be deployed already matches what is set to be deployed:"
+         )
+         print(deployment_version)
+         print("Continuing anyway.")
+
+     if args.verify_image:
+         if not is_docker_image_already_in_registry(
+             service, args.soa_dir, commit, deployment_version.image_version
+         ):
+             raise ValueError(
+                 f"Failed to find image in the registry for the following version {deployment_version}"
+             )
+
+     deploy_info = get_deploy_info(service=service, soa_dir=args.soa_dir)
+     if not can_user_deploy_service(deploy_info, service):
+         sys.exit(1)
+
+     metrics_factory: Callable[[str], metrics_lib.BaseMetrics] = metrics_lib.NoMetrics
+     # only time if wait for deployment and we are actually deploying a new image
+     if args.block and deployment_version != old_deployment_version:
+         metrics_factory = metrics_lib.get_metrics_interface
+     metrics = metrics_factory("paasta.mark_for_deployment")
+     deploy_timer = metrics.create_timer(
+         name="deploy_duration",
+         default_dimensions=dict(
+             paasta_service=service,
+             deploy_group=deploy_group,
+             old_version=str(old_deployment_version),
+             new_version=str(deployment_version),
+             deploy_timeout=args.timeout,
+         ),
+     )
+
+     # meteorite deploy timers can be used as context managers; however, they
+     # won't emit if the context is exited with an exception, so we need to use
+     # a try/finally.
+     deploy_timer.start()
+     ret = 1 # assume exc, since if success will be set to 0 anyway
+     try:
+         deploy_process = MarkForDeploymentProcess(
+             service=service,
+             deploy_info=deploy_info,
+             deploy_group=deploy_group,
+             commit=commit,
+             old_git_sha=old_deployment_version.sha if old_deployment_version else None,
+             git_url=args.git_url,
+             auto_rollback=args.auto_rollback,
+             block=args.block,
+             soa_dir=args.soa_dir,
+             timeout=args.timeout,
+             warn_pct=args.warn,
+             auto_certify_delay=args.auto_certify_delay,
+             auto_abandon_delay=args.auto_abandon_delay,
+             auto_rollback_delay=args.auto_rollback_delay,
+             image_version=deployment_version.image_version,
+             old_image_version=old_deployment_version.image_version
+             if old_deployment_version
+             else None,
+             authors=args.authors,
+             polling_interval=args.polling_interval,
+             diagnosis_interval=args.diagnosis_interval,
+             time_before_first_diagnosis=args.time_before_first_diagnosis,
+             metrics_interface=metrics,
+         )
+         ret = deploy_process.run()
+         return ret
+     finally:
+         deploy_timer.stop(tmp_dimensions={"exit_status": ret})
+
+
+ class Progress:
+     waiting_on: Mapping[str, Collection[str]]
+     percent: float
+
+     def __init__(
+         self, percent: float = 0, waiting_on: Mapping[str, Collection[str]] = None
+     ) -> None:
+         self.percent = percent
+         self.waiting_on = waiting_on
+
+     def human_readable(self, summary: bool) -> str:
+         if self.percent != 0 and self.percent != 100 and not summary:
+             s = f"{round(self.percent)}% (Waiting on {self.human_waiting_on()})"
+         else:
+             s = f"{round(self.percent)}%"
+         return s
+
+     def human_waiting_on(self) -> str:
+         if self.waiting_on is None:
+             return "N/A"
+         things = []
+         for cluster, instances in self.waiting_on.items():
+             num_instances = len(instances)
+             if num_instances == 0:
+                 continue
+             elif num_instances == 1:
+                 (one_instance,) = instances
+                 things.append(f"`{cluster}`: `{one_instance}`")
+             else:
+                 things.append(f"`{cluster}`: {len(instances)} instances")
+         return ", ".join(things)
+
+
+ class MarkForDeploymentProcess(RollbackSlackDeploymentProcess):
+     rollback_states = ["start_rollback", "rolling_back", "rolled_back"]
+     rollforward_states = ["start_deploy", "deploying", "deployed"]
+     default_slack_channel = DEFAULT_SLACK_CHANNEL
+
+     paasta_status_reminder_handle: asyncio.TimerHandle
+
+     def __init__(
+         self,
+         service: str,
+         deploy_info: Dict,
+         deploy_group: str,
+         commit: str,
+         old_git_sha: str,
+         git_url: str,
+         auto_rollback: bool,
+         block: bool,
+         soa_dir: str,
+         timeout: float,
+         warn_pct: float,
+         auto_certify_delay: float,
+         auto_abandon_delay: float,
+         auto_rollback_delay: float,
+         image_version: Optional[str] = None,
+         old_image_version: Optional[str] = None,
+         authors: Optional[List[str]] = None,
+         polling_interval: float = None,
+         diagnosis_interval: float = None,
+         time_before_first_diagnosis: float = None,
+         metrics_interface: metrics_lib.BaseMetrics = metrics_lib.NoMetrics(
+             "paasta.mark_for_deployment"
+         ),
+     ) -> None:
+         self.service = service
+         self.deploy_info = deploy_info
+         self.deploy_group = deploy_group
+         self.commit = commit
+         self.old_git_sha = old_git_sha
+         self.image_version = image_version
+         self.old_image_version = old_image_version
+         self.deployment_version = DeploymentVersion(commit, image_version)
+         self.old_deployment_version = DeploymentVersion(old_git_sha, old_image_version)
+         self.git_url = git_url
+         self.auto_rollback = (
+             auto_rollback
+             and old_git_sha is not None
+             and self.deployment_version != self.old_deployment_version
+         )
+         self.auto_rollbacks_ever_enabled = self.auto_rollback
+         self.block = block
+         self.soa_dir = soa_dir
+         self.timeout = timeout
+         self.warn_pct = warn_pct
+         self.mark_for_deployment_return_code = -1
+         self.auto_certify_delay = auto_certify_delay
+         self.auto_abandon_delay = auto_abandon_delay
+         self.auto_rollback_delay = auto_rollback_delay
+         self.authors = authors
+         self.polling_interval = polling_interval
+         self.diagnosis_interval = diagnosis_interval
+         self.time_before_first_diagnosis = time_before_first_diagnosis
+         self.metrics_interface = metrics_interface
+         self.instance_configs_per_cluster: Dict[
+             str, List[LongRunningServiceConfig]
+         ] = get_instance_configs_for_service_in_deploy_group_all_clusters(
+             service, deploy_group, soa_dir
+         )
+
+         # Keep track of each wait_for_deployment task so we can cancel it.
+         self.wait_for_deployment_tasks: Dict[DeploymentVersion, asyncio.Task] = {}
+
+         self.human_readable_status = "Waiting on mark-for-deployment to initialize..."
+         self.progress = Progress()
+         self.last_action = None
+         self.slo_watchers: List[SLOWatcher] = []
+         self.metric_watchers: List[MetricWatcher] = []
+         self.start_slo_watcher_threads(self.service, self.soa_dir)
+
+         # TODO: Allow both metric and slo watcher threads to run together in the future
+         if can_run_metric_watcher_threads(service=self.service, soa_dir=self.soa_dir):
+             self.start_metric_watcher_threads(self.service, self.soa_dir)
+
+         # Initialize Slack threads and send the first message
+         super().__init__()
+         self.print_who_is_running_this()
+
+     def get_progress(self, summary: bool = False) -> str:
+         if not self.block:
+             return "Deploying in background, progress not tracked."
+         return self.progress.human_readable(summary)
+
+     def print_who_is_running_this(self) -> None:
+         build_url = get_jenkins_build_output_url()
+         if build_url is not None:
+             message = f"(<{build_url}|Jenkins Job>)"
+         else:
+             message = f"(Run by `{getpass.getuser()}` on {socket.getfqdn()})"
+         self.update_slack_thread(message)
+
+     def get_authors(self) -> str:
+         # In order to avoid notifying people who aren't part of the current
+         # service push, we calculate authors based on commits different since
+         # the current production SHA, as opposed to the old SHA on this deploy
+         # group.
+         #
+         # This avoids situations such as:
+         # * Notifying people from a previous push which went through stagef,
+         # if the new push goes through stageg.
+         # * Notifying everybody who has committed to a repo in the past year
+         # when updating a "legacy" deploy group (e.g. for yelp-main).
+         prod_deploy_group = self.deploy_info.get("production_deploy_group")
+         from_sha = None
+         if prod_deploy_group is not None:
+             from_sha = get_currently_deployed_sha(
+                 service=self.service, deploy_group=prod_deploy_group
+             )
+         # If there's no production deploy group, or the production deploy group
+         # has never been deployed to, just use the old SHA from this deploy group.
+         if from_sha is None:
+             from_sha = self.old_git_sha
+         return get_authors_to_be_notified(
+             git_url=self.git_url,
+             from_sha=from_sha,
+             to_sha=self.commit,
+             authors=self.authors,
+         )
+
+     def ping_authors(self, message: str = None) -> None:
+         if message:
+             self.update_slack_thread(f"{message}\n{self.get_authors()}")
+         else:
+             self.update_slack_thread(self.get_authors())
+
+     def get_slack_client(self) -> SlackClient:
+         return get_slack_client().sc
+
+     def get_slack_channel(self) -> str:
+         """Safely get some slack channel to post to. Defaults to ``DEFAULT_SLACK_CHANNEL``.
+         Currently only uses the first slack channel available, and doesn't support
+         multi-channel notifications."""
+         if self.deploy_info.get("slack_notify", True):
+             try:
+                 channel = self.deploy_info.get("slack_channels")[0]
+                 # Nightly jenkins builds will often re-deploy master. This causes Slack noise that wasn't present before
+                 # the auto-rollbacks work.
+                 if self.deployment_version == self.old_deployment_version:
+                     print(
+                         f"Rollback image matches rollforward image: {self.deployment_version}, "
+                         f"Sending slack notifications to {DEFAULT_SLACK_CHANNEL} instead of {channel}."
+                     )
+                     return DEFAULT_SLACK_CHANNEL
+                 else:
+                     return channel
+             except (IndexError, AttributeError, TypeError):
+                 return DEFAULT_SLACK_CHANNEL
+         else:
+             return DEFAULT_SLACK_CHANNEL
+
+     def get_deployment_name(self) -> str:
+         return f"Deploy of `{self.deployment_version.short_sha_repr()}` of `{self.service}` to `{self.deploy_group}`:"
+
+     def on_enter_start_deploy(self) -> None:
+         self.update_slack_status(
+             f"Marking `{self.deployment_version.short_sha_repr()}` for deployment for {self.deploy_group}..."
+         )
+         self.mark_for_deployment_return_code = mark_for_deployment(
+             git_url=self.git_url,
+             deploy_group=self.deploy_group,
+             service=self.service,
+             commit=self.commit,
+             image_version=self.image_version,
+         )
+         if self.mark_for_deployment_return_code != 0:
+             self.trigger("mfd_failed")
+         else:
+             self.update_slack_thread(
+                 f"Marked `{self.deployment_version.short_sha_repr()}` for {self.deploy_group}."
+                 + (
+                     "\n" + self.get_authors()
+                     if self.deploy_group_is_set_to_notify("notify_after_mark")
+                     else ""
+                 )
+             )
+             log.debug("triggering mfd_succeeded")
+             self.trigger("mfd_succeeded")
+
+     def schedule_paasta_status_reminder(self) -> None:
+         def waiting_on_to_status(
+             waiting_on: Mapping[str, Collection[str]]
+         ) -> List[str]:
+             if waiting_on is None:
+                 return [
+                     f"`paasta status --service {self.service} --deploy-group {self.deploy_group} -vv`"
+                 ]
+             commands = []
+             for cluster, instances in waiting_on.items():
+                 num_instances = len(instances)
+                 if num_instances == 0:
+                     continue
+                 else:
+                     commands.append(
+                         f"`paasta status --service {self.service} --cluster {cluster} --instance {','.join(instances)} -vv`"
+                     )
+             return commands
+
+         def times_up() -> None:
+             try:
+                 if self.state == "deploying":
+                     human_max_deploy_time = humanize.naturaldelta(
+                         datetime.timedelta(seconds=self.timeout)
+                     )
+                     stuck_bounce_runbook = os.environ.get(
+                         "STUCK_BOUNCE_RUNBOOK",
+                         DEFAULT_STUCK_BOUNCE_RUNBOOK,
+                     )
+                     status_commands = "\n".join(
+                         waiting_on_to_status(self.progress.waiting_on)
+                     )
+
+                     self.notify_users(
+                         (
+                             f"It has been {self.warn_pct}% of the "
+                             f"maximum deploy time ({human_max_deploy_time}), "
+                             "which means the deployment may be stuck. "
+                             "Here are some things you can try:\n\n"
+                             f"* See {stuck_bounce_runbook} for debugging help\n"
+                             f"* Run these commands to see the status of instances that "
+                             "have not yet finished deploying:\n\n"
+                             f"{status_commands}"
+                         )
+                     )
+             except Exception as e:
+                 log.error(
+                     f"Non-fatal exception encountered when processing the status reminder: {e}"
+                 )
+
+         def schedule_callback() -> None:
+             time_to_notify = self.timeout * self.warn_pct / 100
+             self.paasta_status_reminder_handle = self.event_loop.call_later(
+                 time_to_notify, times_up
+             )
+
+         try:
+             self.event_loop.call_soon_threadsafe(schedule_callback)
+         except Exception as e:
+             log.error(
+                 f"Non-fatal error encountered scheduling the status reminder callback: {e}"
+             )
+
+     def cancel_paasta_status_reminder(self) -> None:
+         try:
+             handle = self.get_paasta_status_reminder_handle()
+             if handle is not None:
+                 handle.cancel()
+                 self.paasta_status_reminder_handle = None
+         except Exception as e:
+             log.error(
+                 f"Non-fatal error encountered when canceling the paasta status reminder: {e}"
+             )
+
+     def get_paasta_status_reminder_handle(self) -> Optional[asyncio.TimerHandle]:
+         try:
+             return self.paasta_status_reminder_handle
+         except AttributeError:
+             return None
+
+     def states(self) -> Collection[str]:
+         return [
+             "_begin",
+             "start_deploy",
+             "deploying",
+             "deployed",
+             "mfd_failed",
+             "deploy_errored",
+             "deploy_cancelled",
+             "start_rollback",
+             "rolling_back",
+             "rolled_back",
+             "abandon",
+             "complete",
+         ]
+
+     def start_state(self) -> str:
+         return "_begin"
+
+     def start_transition(self) -> str:
+         return "start_deploy"
+
881
+ def valid_transitions(self) -> Iterator[state_machine.TransitionDefinition]:
882
+ rollback_is_possible = (
883
+ self.old_git_sha is not None
884
+ and self.deployment_version != self.old_deployment_version
885
+ )
886
+
887
+ yield {"source": "_begin", "dest": "start_deploy", "trigger": "start_deploy"}
888
+ yield {
889
+ "source": "start_deploy",
890
+ "dest": "deploying",
891
+ "trigger": "mfd_succeeded",
892
+ }
893
+ yield {"source": "deploying", "dest": "deployed", "trigger": "deploy_finished"}
894
+
895
+ yield {
896
+ "source": ["start_deploy", "start_rollback"],
897
+ "dest": "mfd_failed",
898
+ "trigger": "mfd_failed",
899
+ }
900
+ yield {
901
+ "source": [s for s in self.states() if not self.is_terminal_state(s)],
902
+ "dest": "deploy_errored",
903
+ "trigger": "deploy_errored",
904
+ }
905
+ yield {
906
+ "source": [s for s in self.states() if not self.is_terminal_state(s)],
907
+ "dest": "deploy_cancelled",
908
+ "trigger": "deploy_cancelled",
909
+ }
910
+
911
+ if rollback_is_possible:
912
+ yield {
913
+ "source": self.rollforward_states,
914
+ "dest": "start_rollback",
915
+ "trigger": "rollback_button_clicked",
916
+ "before": self.log_user_rollback,
917
+ }
918
+ yield {
919
+ "source": self.rollback_states,
920
+ "dest": None, # this makes it an "internal transition", effectively a noop.
921
+ "trigger": "rollback_button_clicked",
922
+ }
923
+ yield {
924
+ "source": self.rollforward_states,
925
+ "dest": "start_rollback",
926
+ "trigger": "rollback_slo_failure",
927
+ "before": self.log_slo_rollback,
928
+ }
929
+ yield {
930
+ "source": self.rollback_states,
931
+ "dest": None, # this makes it an "internal transition", effectively a noop.
932
+ "trigger": "rollback_slo_failure",
933
+ }
934
+ yield {
935
+ "source": self.rollforward_states,
936
+ "dest": "start_rollback",
937
+ "trigger": "rollback_metric_failure",
938
+ "before": self.log_metric_rollback,
939
+ }
940
+ yield {
941
+ "source": self.rollback_states,
942
+ "dest": "start_deploy",
943
+ "trigger": "forward_button_clicked",
944
+ }
945
+ yield {
946
+ "source": self.rollforward_states,
947
+ "dest": None, # this makes it an "internal transition", effectively a noop.
948
+ "trigger": "forward_button_clicked",
949
+ }
950
+ yield {
951
+ "source": "start_rollback",
952
+ "dest": "rolling_back",
953
+ "trigger": "mfd_succeeded",
954
+ }
955
+ yield {
956
+ "source": "rolling_back",
957
+ "dest": "rolled_back",
958
+ "trigger": "deploy_finished",
959
+ }
960
+
961
+ yield {
962
+ "source": "deployed",
963
+ "dest": "complete",
964
+ "trigger": "complete_button_clicked",
965
+ }
966
+ yield {"source": "deployed", "dest": "complete", "trigger": "auto_certify"}
967
+ yield {
968
+ "source": ["rolled_back", "rolling_back"],
969
+ "dest": "abandon",
970
+ "trigger": "abandon_button_clicked",
971
+ }
972
+ yield {"source": "rolled_back", "dest": "abandon", "trigger": "auto_abandon"}
973
+
974
+ if rollback_is_possible:
975
+ # Suppress these buttons if it doesn't make sense to roll back.
976
+ yield {
977
+ "source": "*",
978
+ "dest": None, # Don't actually change state, just call the before function.
979
+ "trigger": "enable_auto_rollbacks_button_clicked",
980
+ "unless": [self.auto_rollbacks_enabled],
981
+ "before": self.enable_auto_rollbacks,
982
+ }
983
+ yield {
984
+ "source": "*",
985
+ "dest": None, # Don't actually change state, just call the before function.
986
+ "trigger": "disable_auto_rollbacks_button_clicked",
987
+ "conditions": [
988
+ self.any_rollback_condition_failing,
989
+ self.auto_rollbacks_enabled,
990
+ ],
991
+ "before": self.disable_auto_rollbacks,
992
+ }
993
+ yield {
994
+ "source": "*",
995
+ "dest": None,
996
+ "trigger": "slos_started_failing",
997
+ "conditions": [self.auto_rollbacks_enabled],
998
+ "unless": [self.already_rolling_back],
999
+ "before": functools.partial(
1000
+ self.start_auto_rollback_countdown, "rollback_slo_failure"
1001
+ ),
1002
+ }
1003
+ yield {
1004
+ "source": "*",
1005
+ "dest": None,
1006
+ "trigger": "slos_stopped_failing",
1007
+ "before": functools.partial(
1008
+ self.cancel_auto_rollback_countdown, "rollback_slo_failure"
1009
+ ),
1010
+ }
1011
+ yield {
1012
+ "source": "*",
1013
+ "dest": None,
1014
+ "trigger": "metrics_started_failing",
1015
+ "conditions": [self.auto_rollbacks_enabled],
1016
+ "unless": [self.already_rolling_back],
1017
+ "before": functools.partial(
1018
+ self.start_auto_rollback_countdown, "rollback_metric_failure"
1019
+ ),
1020
+ }
1021
+ yield {
1022
+ "source": "*",
1023
+ "dest": None,
1024
+ "trigger": "metrics_stopped_failing",
1025
+ "before": functools.partial(
1026
+ self.cancel_auto_rollback_countdown, "rollback_metric_failure"
1027
+ ),
1028
+ }
1029
+ yield {
1030
+ "source": "*",
1031
+ "dest": None,
1032
+ "trigger": "snooze_button_clicked",
1033
+ "before": self.restart_timer,
1034
+ "conditions": [self.is_timer_running],
1035
+ }
1036
+
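The dicts yielded above follow the transition schema used by the `transitions` state-machine library ("trigger"/"source"/"dest" plus "conditions", "unless" and "before" hooks, where `dest=None` is an internal transition that runs callbacks without changing state). As a rough, self-contained sketch of how such dicts can drive a machine -- the states, triggers and the `ToyDeploy` class below are invented for illustration and are not the real deployment process wiring:

    from transitions import Machine

    class ToyDeploy:
        def log_user_rollback(self):
            print("user asked for a rollback")

    toy = ToyDeploy()
    machine = Machine(
        model=toy,
        states=["deploying", "deployed", "start_rollback", "rolled_back"],
        initial="deploying",
        transitions=[
            {"trigger": "deploy_finished", "source": "deploying", "dest": "deployed"},
            {
                "trigger": "rollback_button_clicked",
                "source": ["deploying", "deployed"],
                "dest": "start_rollback",
                "before": "log_user_rollback",
            },
            # dest=None makes this an internal transition: callbacks run, state stays put.
            {"trigger": "rollback_button_clicked", "source": "start_rollback", "dest": None},
        ],
    )

    toy.deploy_finished()          # deploying -> deployed
    toy.rollback_button_clicked()  # deployed -> start_rollback (runs log_user_rollback first)
    toy.rollback_button_clicked()  # internal transition; state is unchanged
    print(toy.state)               # start_rollback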
1037
+     def disable_auto_rollbacks(self, trigger: str) -> None:
+         self.cancel_auto_rollback_countdown(trigger=trigger)
+         self.auto_rollback = False
+         self.update_slack_status(
+             f"Automatic rollback disabled for this deploy. To disable this permanently for this step, edit `deploy.yaml` and set `auto_rollback: false` for the `{self.deploy_group}` step."
+         )
+
+     def enable_auto_rollbacks(self) -> None:
+         self.auto_rollback = True
+         self.auto_rollbacks_ever_enabled = True
+         self.update_slack_status(
+             f"Automatic rollback enabled for this deploy. Will watch for failures and rollback when necessary. To set this permanently, edit `deploy.yaml` and set `auto_rollback: true` for the `{self.deploy_group}` step."
+         )
1050
+
1051
+     def auto_rollbacks_enabled(self) -> bool:
+         """This getter exists so it can be a condition on transitions, since those need to be callables."""
+         return self.auto_rollback
+
+     def get_auto_rollback_delay(self) -> float:
+         return self.auto_rollback_delay
+
+     def get_auto_certify_delay(self) -> float:
+         if self.auto_certify_delay is not None:
+             return self.auto_certify_delay
+         else:
+             if self.auto_rollbacks_ever_enabled:
+                 return DEFAULT_AUTO_CERTIFY_DELAY
+             else:
+                 return 0
1066
+
1067
+     def already_rolling_back(self) -> bool:
+         return self.state in self.rollback_states
+
+     def status_code_by_state(self) -> Mapping[str, int]:
+         codes = {
+             "deploy_errored": 2,
+             "deploy_cancelled": 1,
+             "mfd_failed": self.mark_for_deployment_return_code,
+             "abandon": 1,
+             "complete": 0,
+         }
+
+         if not self.block:
+             # If we don't pass --wait-for-deployment, then exit immediately after mark-for-deployment succeeds.
+             codes["deploying"] = 0
+         if self.get_auto_certify_delay() <= 0:
+             # Instead of setting a 0-second timer to move to certify, just exit 0 when the deploy finishes.
+             codes["deployed"] = 0
+
+         return codes
1087
+
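The mapping returned by status_code_by_state appears to decide what exit code the command ends with once the state machine reaches one of these states. The helper below is a hypothetical illustration of how a caller might consume such a mapping (exit_code_for_state is an invented name, not part of paasta), falling back to a generic non-zero code for states with no entry:

    from typing import Mapping

    def exit_code_for_state(state: str, codes: Mapping[str, int]) -> int:
        # States without an explicit code are treated as a generic failure here.
        return codes.get(state, 1)

    codes = {"deploy_errored": 2, "deploy_cancelled": 1, "abandon": 1, "complete": 0}
    assert exit_code_for_state("complete", codes) == 0
    assert exit_code_for_state("deploy_errored", codes) == 2
    assert exit_code_for_state("deploying", codes) == 1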
1088
+ def get_active_button(self) -> Optional[str]:
1089
+ return {
1090
+ "start_deploy": "forward",
1091
+ "deploying": "forward",
1092
+ "deployed": None,
1093
+ "start_rollback": "rollback",
1094
+ "rolling_back": "rollback",
1095
+ "rolled_back": None,
1096
+ }.get(self.state)
1097
+
1098
+ def on_enter_mfd_failed(self) -> None:
1099
+ self.update_slack_status(
1100
+ f"Marking `{self.deployment_version.short_sha_repr()}` for deployment for {self.deploy_group} failed. Please see Jenkins for more output."
1101
+ ) # noqa E501
1102
+
1103
+ def on_enter_deploying(self) -> None:
1104
+ # if self.block is False, then deploying is a terminal state so we will promptly exit.
1105
+ # Don't bother starting the background thread in this case.
1106
+ if self.block:
1107
+ thread = Thread(
1108
+ target=self.do_wait_for_deployment,
1109
+ args=(self.commit, self.image_version),
1110
+ daemon=True,
1111
+ )
1112
+ thread.start()
1113
+ self.cancel_paasta_status_reminder()
1114
+ self.schedule_paasta_status_reminder()
1115
+
1116
+ def on_exit_deploying(self) -> None:
1117
+ self.stop_waiting_for_deployment(self.commit)
1118
+ self.cancel_paasta_status_reminder()
1119
+
1120
+ def on_enter_start_rollback(self) -> None:
1121
+ self.update_slack_status(
1122
+ f"Rolling back ({self.deploy_group}) to {self.old_deployment_version}"
1123
+ )
1124
+ self.mark_for_deployment_return_code = mark_for_deployment(
1125
+ git_url=self.git_url,
1126
+ deploy_group=self.deploy_group,
1127
+ service=self.service,
1128
+ commit=self.old_git_sha,
1129
+ image_version=self.old_image_version,
1130
+ )
1131
+
1132
+ if self.mark_for_deployment_return_code != 0:
1133
+ self.trigger("mfd_failed")
1134
+ else:
1135
+ self.update_slack_thread(
1136
+ f"Marked `{self.old_git_sha[:8]}` for {self.deploy_group}."
1137
+ + (
1138
+ "\n" + self.get_authors()
1139
+ if self.deploy_group_is_set_to_notify("notify_after_mark")
1140
+ else ""
1141
+ )
1142
+ )
1143
+
1144
+ self.trigger("mfd_succeeded")
1145
+
1146
+ def on_enter_rolling_back(self) -> None:
1147
+ if self.block:
1148
+ thread = Thread(
1149
+ target=self.do_wait_for_deployment,
1150
+ args=(self.old_git_sha, self.old_image_version),
1151
+ daemon=True,
1152
+ )
1153
+ thread.start()
1154
+
1155
+ def on_exit_rolling_back(self) -> None:
1156
+ self.stop_waiting_for_deployment(self.old_git_sha, self.old_image_version)
1157
+
1158
+     def on_enter_deploy_errored(self) -> None:
+         report_waiting_aborted(self.service, self.deploy_group)
+         self.update_slack_status("Deploy aborted, but it will still try to converge.")
+         self.send_manual_rollback_instructions()
+         if self.deploy_group_is_set_to_notify("notify_after_abort"):
+             self.ping_authors("Deploy errored")
1164
+
1165
+ def on_enter_deploy_cancelled(self) -> None:
1166
+ if self.deploy_group_is_set_to_notify("notify_after_abort"):
1167
+ self.ping_authors("Deploy cancelled")
1168
+
1169
+     def stop_waiting_for_deployment(
+         self, target_commit: str, target_image_version: Optional[str] = None
+     ) -> None:
+         try:
+             target_version = DeploymentVersion(
+                 sha=target_commit, image_version=target_image_version
+             )
+             self.wait_for_deployment_tasks[target_version].cancel()
+             del self.wait_for_deployment_tasks[target_version]
+         except (KeyError, asyncio.InvalidStateError):
+             pass
1180
+
1181
+ @a_sync.to_blocking
1182
+ async def do_wait_for_deployment(
1183
+ self, target_commit: str, target_image_version: Optional[str] = None
1184
+ ) -> None:
1185
+ try:
1186
+ target_version = DeploymentVersion(
1187
+ sha=target_commit, image_version=target_image_version
1188
+ )
1189
+ self.stop_waiting_for_deployment(target_commit, target_image_version)
1190
+ wait_for_deployment_task = asyncio.create_task(
1191
+ wait_for_deployment(
1192
+ service=self.service,
1193
+ deploy_group=self.deploy_group,
1194
+ instance_configs_per_cluster=self.instance_configs_per_cluster,
1195
+ git_sha=target_commit,
1196
+ image_version=target_image_version,
1197
+ soa_dir=self.soa_dir,
1198
+ timeout=self.timeout,
1199
+ progress=self.progress,
1200
+ polling_interval=self.polling_interval,
1201
+ diagnosis_interval=self.diagnosis_interval,
1202
+ time_before_first_diagnosis=self.time_before_first_diagnosis,
1203
+ notify_fn=self.ping_authors,
1204
+ )
1205
+ )
1206
+ self.wait_for_deployment_tasks[target_version] = wait_for_deployment_task
1207
+ await wait_for_deployment_task
1208
+ if self.deploy_group_is_set_to_notify("notify_after_wait"):
1209
+ self.ping_authors(
1210
+ f"Finished waiting for deployment of {target_version}"
1211
+ )
1212
+ else:
1213
+ self.update_slack_thread(
1214
+ f"Finished waiting for deployment of {target_version}"
1215
+ )
1216
+ self.trigger("deploy_finished")
1217
+
1218
+ except (KeyboardInterrupt, TimeoutError):
1219
+ self.trigger("deploy_cancelled")
1220
+ except NoSuchCluster:
1221
+ self.trigger("deploy_errored")
1222
+ except asyncio.CancelledError:
1223
+ # Don't trigger deploy_errored when someone calls stop_waiting_for_deployment.
1224
+ pass
1225
+ except Exception:
1226
+ log.error("Caught exception in wait_for_deployment:")
1227
+ log.error(traceback.format_exc())
1228
+ self.trigger("deploy_errored")
1229
+
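stop_waiting_for_deployment and do_wait_for_deployment keep at most one wait task per DeploymentVersion: any existing task under the same key is cancelled before a new one is registered, and a cancellation is deliberately not treated as a deploy error. A self-contained toy version of that bookkeeping pattern, with invented names (stop_waiting, wait_then_report) standing in for the methods above:

    import asyncio
    from typing import Dict

    tasks: Dict[str, asyncio.Task] = {}

    def stop_waiting(key: str) -> None:
        # Cancel and forget any previous wait registered under this key.
        try:
            tasks[key].cancel()
            del tasks[key]
        except KeyError:
            pass

    async def wait_then_report(key: str, seconds: float) -> None:
        stop_waiting(key)
        task = asyncio.create_task(asyncio.sleep(seconds))
        tasks[key] = task
        try:
            await task
            print(f"{key}: finished waiting")
        except asyncio.CancelledError:
            # A superseded wait is not an error.
            print(f"{key}: superseded by a newer wait")

    async def main() -> None:
        first = asyncio.create_task(wait_then_report("abc123", 10))
        await asyncio.sleep(0)                 # let the first wait register itself
        await wait_then_report("abc123", 0.1)  # cancels and replaces the first wait
        await first

    asyncio.run(main())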
1230
+ def on_enter_rolled_back(self) -> None:
1231
+ self.update_slack_status(
1232
+ f"Finished rolling back to `{self.old_deployment_version.short_sha_repr()}` in {self.deploy_group}"
1233
+ )
1234
+ line = f"Rollback to {self.old_deployment_version.short_sha_repr()} for {self.deploy_group} complete"
1235
+ _log(service=self.service, component="deploy", line=line, level="event")
1236
+ self.start_timer(self.auto_abandon_delay, "auto_abandon", "abandon")
1237
+
1238
+ def on_enter_deployed(self) -> None:
1239
+ self.update_slack_status(
1240
+ f"Finished deployment of `{self.deployment_version.short_sha_repr()}` to {self.deploy_group}"
1241
+ )
1242
+ line = f"Deployment of {self.deployment_version.short_sha_repr()} for {self.deploy_group} complete"
1243
+ _log(service=self.service, component="deploy", line=line, level="event")
1244
+ self.send_manual_rollback_instructions()
1245
+
1246
+ if self.any_slo_failing() and self.auto_rollbacks_enabled():
1247
+ self.ping_authors(
1248
+ "Because an SLO is currently failing, we will not automatically certify. Instead, we will wait indefinitely until you click one of the buttons above."
1249
+ )
1250
+ elif self.any_metric_failing() and self.auto_rollbacks_enabled():
1251
+ self.ping_authors(
1252
+ "Because a rollback-triggering metric for this service is currently failing, we will not automatically certify. Instead, we will wait indefinitely until you click one of the buttons above."
1253
+ )
1254
+ else:
1255
+ if self.get_auto_certify_delay() > 0:
1256
+ self.start_timer(
1257
+ self.get_auto_certify_delay(), "auto_certify", "certify"
1258
+ )
1259
+ if self.deploy_group_is_set_to_notify("notify_after_good_deploy"):
1260
+ self.ping_authors()
1261
+
1262
+ def on_enter_complete(self) -> None:
1263
+ if self.deploy_group_is_set_to_notify("notify_after_good_deploy"):
1264
+ self.ping_authors()
1265
+
1266
+ def send_manual_rollback_instructions(self) -> None:
1267
+ if self.deployment_version != self.old_deployment_version:
1268
+ extra_rollback_args = ""
1269
+ if self.old_deployment_version.image_version:
1270
+ extra_rollback_args = (
1271
+ f" --image-version {self.old_deployment_version.image_version}"
1272
+ )
1273
+ message = (
1274
+ "If you need to roll back manually, run: "
1275
+ f"`paasta rollback --service {self.service} --deploy-group {self.deploy_group} "
1276
+ f"--commit {self.old_git_sha}{extra_rollback_args}`"
1277
+ )
1278
+ self.update_slack_thread(message)
1279
+ print(message)
1280
+
1281
+ def after_state_change(self) -> None:
1282
+ self.update_slack()
1283
+ super().after_state_change()
1284
+
1285
+ def get_signalfx_api_token(self) -> str:
1286
+ return (
1287
+ load_system_paasta_config()
1288
+ .get_monitoring_config()
1289
+ .get("signalfx_api_key", None)
1290
+ )
1291
+
1292
+ def get_splunk_api_token(self) -> SplunkAuth:
1293
+ auth_token = os.environ["SPLUNK_MFD_TOKEN"]
1294
+ auth_data = (
1295
+ load_system_paasta_config()
1296
+ .get_monitoring_config()
1297
+ .get("splunk_mfd_authentication")
1298
+ )
1299
+
1300
+ return SplunkAuth(
1301
+ host=auth_data["host"],
1302
+ port=auth_data["port"],
1303
+ username=auth_data["username"],
1304
+ password=auth_token,
1305
+ )
1306
+
1307
+ def get_button_text(self, button: str, is_active: bool) -> str:
1308
+ # Button text max length 75 characters
1309
+ # Current button templates allow version max length of 36
1310
+ version_short_str = self.deployment_version.short_sha_repr()
1311
+ if len(version_short_str) > 36:
1312
+ # we'll have to depend on subsequent slack messages to show full version
1313
+ version_short_str = "new version"
1314
+ active_button_texts = {
1315
+ "forward": f"Rolling Forward to {version_short_str} :zombocom:"
1316
+ }
1317
+ inactive_button_texts = {
1318
+ "forward": f"Continue Forward to {version_short_str} :arrow_forward:",
1319
+ "complete": f"Complete deploy to {version_short_str} :white_check_mark:",
1320
+ "snooze": f"Reset countdown",
1321
+ "enable_auto_rollbacks": "Enable auto rollbacks :eyes:",
1322
+ "disable_auto_rollbacks": "Disable auto rollbacks :close_eyes_monkey:",
1323
+ }
1324
+
1325
+ if self.old_deployment_version is not None:
1326
+ old_version_short_str = self.old_deployment_version.short_sha_repr()
1327
+ # Current button templates allow old version max length 43
1328
+ if len(old_version_short_str) > 43:
1329
+ old_version_short_str = "old version"
1330
+ active_button_texts.update(
1331
+ {"rollback": f"Rolling Back to {old_version_short_str} :zombocom:"}
1332
+ )
1333
+ inactive_button_texts.update(
1334
+ {
1335
+ "rollback": f"Roll Back to {old_version_short_str} :arrow_backward:",
1336
+ "abandon": f"Abandon deploy, staying on {old_version_short_str} :x:",
1337
+ }
1338
+ )
1339
+
1340
+ return (active_button_texts if is_active else inactive_button_texts)[button]
1341
+
1342
+ def start_auto_rollback_countdown(self, trigger: str, extra_text: str = "") -> None:
1343
+ cancel_button_text = self.get_button_text(
1344
+ button="disable_auto_rollbacks",
1345
+ is_active=False,
1346
+ )
1347
+ super().start_auto_rollback_countdown(
1348
+ trigger=trigger, extra_text=f'Click "{cancel_button_text}" to cancel this!'
1349
+ )
1350
+ if self.deploy_group_is_set_to_notify("notify_after_auto_rollback"):
1351
+ self.ping_authors()
1352
+
1353
+ def deploy_group_is_set_to_notify(self, notify_type: str) -> bool:
1354
+ return deploy_group_is_set_to_notify(
1355
+ self.deploy_info, self.deploy_group, notify_type
1356
+ )
1357
+
1358
+ def __build_rollback_audit_details(
1359
+ self, rollback_type: RollbackTypes
1360
+ ) -> Dict[str, str]:
1361
+ return {
1362
+ "rolled_back_from": str(self.deployment_version),
1363
+ "rolled_back_to": str(self.old_deployment_version),
1364
+ "rollback_type": rollback_type.value,
1365
+ "deploy_group": self.deploy_group,
1366
+ }
1367
+
1368
+ def log_slo_rollback(self) -> None:
1369
+ rollback_details = self.__build_rollback_audit_details(
1370
+ RollbackTypes.AUTOMATIC_SLO_ROLLBACK
1371
+ )
1372
+ self._log_rollback(rollback_details)
1373
+
1374
+ def log_metric_rollback(self) -> None:
1375
+ rollback_details = self.__build_rollback_audit_details(
1376
+ RollbackTypes.AUTOMATIC_METRIC_ROLLBACK
1377
+ )
1378
+ self._log_rollback(rollback_details)
1379
+
1380
+ def log_user_rollback(self) -> None:
1381
+ rollback_details = self.__build_rollback_audit_details(
1382
+ RollbackTypes.USER_INITIATED_ROLLBACK
1383
+ )
1384
+ self._log_rollback(rollback_details)
1385
+
1386
+ def _log_rollback(self, rollback_details: Dict[str, str]) -> None:
1387
+ base_dimensions = dict(rollback_details)
1388
+ base_dimensions["paasta_service"] = self.service
1389
+ # Emit one event per cluster to sfx
1390
+ for cluster in self.instance_configs_per_cluster.keys():
1391
+ dimensions = dict(base_dimensions)
1392
+ dimensions["paasta_cluster"] = cluster
1393
+ self.metrics_interface.emit_event(
1394
+ name="rollback",
1395
+ dimensions=dimensions,
1396
+ )
1397
+ _log_audit(
1398
+ action="rollback",
1399
+ action_details=rollback_details,
1400
+ service=self.service,
1401
+ )
1402
+
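_log_rollback emits one "rollback" event per cluster, tagging each with the audit details plus paasta_service and paasta_cluster dimensions. For a service deployed to two clusters the emitted dimensions would look roughly like this -- every value below is a placeholder for illustration, not real deploy data:

    rollback_details = {
        "rolled_back_from": "DeploymentVersion(sha=bbbbbbbb, image_version=None)",
        "rolled_back_to": "DeploymentVersion(sha=aaaaaaaa, image_version=None)",
        "rollback_type": "user_initiated_rollback",
        "deploy_group": "prod.everything",
    }
    for cluster in ("cluster-a", "cluster-b"):
        dimensions = dict(
            rollback_details, paasta_service="example_service", paasta_cluster=cluster
        )
        # metrics_interface.emit_event(name="rollback", dimensions=dimensions)
        print(dimensions)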
1403
+
1404
+ async def wait_until_instance_is_done(
1405
+ executor: concurrent.futures.Executor,
1406
+ service: str,
1407
+ instance: str,
1408
+ cluster: str,
1409
+ version: DeploymentVersion,
1410
+ instance_config: LongRunningServiceConfig,
1411
+ polling_interval: float,
1412
+ diagnosis_interval: float,
1413
+ time_before_first_diagnosis: float,
1414
+ should_ping_for_unhealthy_pods: bool,
1415
+ notify_fn: Optional[Callable[[str], None]] = None,
1416
+ ) -> Tuple[str, str]:
1417
+ loop = asyncio.get_running_loop()
1418
+ diagnosis_task = asyncio.create_task(
1419
+ periodically_diagnose_instance(
1420
+ executor,
1421
+ service,
1422
+ instance,
1423
+ cluster,
1424
+ version,
1425
+ instance_config,
1426
+ diagnosis_interval,
1427
+ time_before_first_diagnosis,
1428
+ should_ping_for_unhealthy_pods,
1429
+ notify_fn,
1430
+ )
1431
+ )
1432
+ try:
1433
+ while not await loop.run_in_executor(
1434
+ executor,
1435
+ functools.partial(
1436
+ check_if_instance_is_done,
1437
+ service,
1438
+ instance,
1439
+ cluster,
1440
+ version,
1441
+ instance_config,
1442
+ ),
1443
+ ):
1444
+ await asyncio.sleep(polling_interval)
1445
+ return (
1446
+ cluster,
1447
+ instance,
1448
+ ) # for the convenience of the caller, to know which future is finishing.
1449
+ finally:
1450
+ diagnosis_task.cancel()
1451
+
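wait_until_instance_is_done pushes the blocking check_if_instance_is_done call onto a thread-pool executor and sleeps polling_interval between attempts, so the event loop stays free for the other instances being watched. A stripped-down version of that poll-in-executor pattern, with a dummy predicate (blocking_check, an invented name) standing in for the real status call:

    import asyncio
    import concurrent.futures
    import functools
    import time

    def blocking_check(started_at: float, ready_after: float) -> bool:
        # Stand-in for a blocking API call such as check_if_instance_is_done.
        return time.time() - started_at >= ready_after

    async def poll_until_done(
        executor: concurrent.futures.Executor, polling_interval: float
    ) -> None:
        loop = asyncio.get_running_loop()
        started_at = time.time()
        while not await loop.run_in_executor(
            executor, functools.partial(blocking_check, started_at, 0.3)
        ):
            await asyncio.sleep(polling_interval)

    async def main() -> None:
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            await poll_until_done(executor, polling_interval=0.1)
            print("done")

    asyncio.run(main())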
1452
+
1453
+ async def periodically_diagnose_instance(
1454
+ executor: concurrent.futures.Executor,
1455
+ service: str,
1456
+ instance: str,
1457
+ cluster: str,
1458
+ version: DeploymentVersion,
1459
+ instance_config: LongRunningServiceConfig,
1460
+ diagnosis_interval: float,
1461
+ time_before_first_diagnosis: float,
1462
+ should_ping_for_unhealthy_pods: bool,
1463
+ notify_fn: Optional[Callable[[str], None]] = None,
1464
+ ) -> None:
1465
+ await asyncio.sleep(time_before_first_diagnosis)
1466
+ loop = asyncio.get_running_loop()
1467
+ while True:
1468
+ try:
1469
+ await loop.run_in_executor(
1470
+ executor,
1471
+ functools.partial(
1472
+ diagnose_why_instance_is_stuck,
1473
+ service,
1474
+ instance,
1475
+ cluster,
1476
+ version,
1477
+ instance_config,
1478
+ should_ping_for_unhealthy_pods,
1479
+ notify_fn,
1480
+ ),
1481
+ )
1482
+ except asyncio.CancelledError:
1483
+ raise
1484
+ except Exception:
1485
+ print(f"Couldn't get status of {service}.{instance}:")
1486
+ traceback.print_exc()
1487
+ await asyncio.sleep(diagnosis_interval)
1488
+
1489
+
1490
+ def diagnose_why_instance_is_stuck(
1491
+ service: str,
1492
+ instance: str,
1493
+ cluster: str,
1494
+ version: DeploymentVersion,
1495
+ instance_config: LongRunningServiceConfig,
1496
+ should_ping_for_unhealthy_pods: bool,
1497
+ notify_fn: Optional[Callable[[str], None]] = None,
1498
+ ) -> None:
1499
+ api = client.get_paasta_oapi_client(
1500
+ cluster=get_paasta_oapi_api_clustername(
1501
+ cluster=cluster,
1502
+ is_eks=(instance_config.get_instance_type() == "eks"),
1503
+ ),
1504
+ )
1505
+ try:
1506
+ status = api.service.status_instance(
1507
+ service=service,
1508
+ instance=instance,
1509
+ include_envoy=False,
1510
+ include_mesos=False,
1511
+ new=True,
1512
+ )
1513
+ except api.api_error as e:
1514
+ log.warning(
1515
+ "Error getting service status from PaaSTA API for "
1516
+ f"{cluster}: {e.status} {e.reason}"
1517
+ )
1518
+ return
1519
+
1520
+ print(f" Status for {service}.{instance} in {cluster}:")
1521
+ for active_version in status.kubernetes_v2.versions:
1522
+ # We call get_version_table_entry directly so that we can set version_name_suffix based on git_sha instead of
1523
+ # creation time of the version (which is what get_versions_table does.)
1524
+ # Without this, we'd call the old version "new" until the new version is actually created, which would be confusing.
1525
+ for line in get_version_table_entry(
1526
+ active_version,
1527
+ service,
1528
+ instance,
1529
+ cluster,
1530
+ version_name_suffix="new"
1531
+ if active_version.git_sha == version.sha
1532
+ and active_version.image_version == version.image_version
1533
+ else "old",
1534
+ show_config_sha=True,
1535
+ verbose=0,
1536
+ ):
1537
+ print(f" {line}")
1538
+ print("")
1539
+
1540
+ if should_ping_for_unhealthy_pods and notify_fn:
1541
+ maybe_ping_for_unhealthy_pods(
1542
+ service, instance, cluster, version, status, notify_fn
1543
+ )
1544
+
1545
+
1546
+ already_pinged = False
1547
+
1548
+
1549
+ def maybe_ping_for_unhealthy_pods(
1550
+ service: str,
1551
+ instance: str,
1552
+ cluster: str,
1553
+ version: DeploymentVersion,
1554
+ status: InstanceStatusKubernetesV2,
1555
+ notify_fn: Callable[[str], None],
1556
+ ) -> None:
1557
+ global already_pinged
1558
+
1559
+ if not already_pinged:
1560
+ # there can be multiple current versions, e.g. if someone changes yelpsoa-configs during a bounce.
1561
+ current_versions = [
1562
+ v
1563
+ for v in status.kubernetes_v2.versions
1564
+ if v.git_sha == version.sha and v.image_version == version.image_version
1565
+ ]
1566
+ pingable_pods = [
1567
+ pod
1568
+ for current_version in current_versions
1569
+ for pod in current_version.pods
1570
+ if should_ping_for_pod(pod)
1571
+ ]
1572
+ if pingable_pods:
1573
+ already_pinged = True
1574
+ ping_for_pods(service, instance, cluster, pingable_pods, notify_fn)
1575
+
1576
+
1577
+ def should_ping_for_pod(pod: KubernetesPodV2) -> bool:
1578
+ return recent_container_restart(get_main_container(pod))
1579
+
1580
+
1581
+ def ping_for_pods(
1582
+ service: str,
1583
+ instance: str,
1584
+ cluster: str,
1585
+ pods: List[KubernetesPodV2],
1586
+ notify_fn: Callable[[str], None],
1587
+ ) -> None:
1588
+ pods_by_reason: Dict[str, List[KubernetesPodV2]] = {}
1589
+ for pod in pods:
1590
+ pods_by_reason.setdefault(get_main_container(pod).reason, []).append(pod)
1591
+
1592
+ for reason, pods_with_reason in pods_by_reason.items():
1593
+ explanation = {
1594
+ "Error": "crashed on startup",
1595
+ "OOMKilled": "run out of memory",
1596
+ "CrashLoopBackOff": "crashed on startup several times, and Kubernetes is backing off restarting them",
1597
+ }.get(reason, f"restarted ({reason})")
1598
+
1599
+ status_tip = f"Take a look at the output of your unhealthy pods with `paasta status -s {service} -i {instance} -c {cluster} -vv` (more -v for more output.)"
1600
+
1601
+         tip = {
+             "Error": (
+                 f"This may indicate a bug in your code, a misconfiguration in yelpsoa-configs, or missing srv-configs. {status_tip}"
+             ),
+             "CrashLoopBackOff": f"This may indicate a bug in your code, a misconfiguration in yelpsoa-configs, or missing srv-configs. {status_tip}",
+             "OOMKilled": " ".join(
+                 (
+                     "This probably means your new version of code requires more memory than the old version.",
+                     "You may want to increase memory in yelpsoa-configs or roll back.",
+                     "Ask #paasta if you need help with this.",
+                 )
+             ),
+         }.get(reason, "")
1614
+
1615
+ notify_fn(
1616
+ f"Some of the replicas of your new version have {explanation}: {', '.join(f'`{p.name}`' for p in pods_with_reason)}\n{tip}"
1617
+ )
1618
+
1619
+
1620
+ def check_if_instance_is_done(
1621
+ service: str,
1622
+ instance: str,
1623
+ cluster: str,
1624
+ version: DeploymentVersion,
1625
+ instance_config: LongRunningServiceConfig,
1626
+ api: Optional[client.PaastaOApiClient] = None,
1627
+ ) -> bool:
1628
+ if api is None:
1629
+ api = client.get_paasta_oapi_client(
1630
+ cluster=get_paasta_oapi_api_clustername(
1631
+ cluster=cluster,
1632
+ is_eks=(instance_config.get_instance_type() == "eks"),
1633
+ ),
1634
+ )
1635
+ if not api:
1636
+ log.warning(
1637
+ "Couldn't reach the PaaSTA api for {}! Assuming it is not "
1638
+ "deployed there yet.".format(cluster)
1639
+ )
1640
+ return False
1641
+
1642
+ inst_str = f"{service}.{instance} in {cluster}"
1643
+ log.debug(f"Inspecting the deployment status of {inst_str}")
1644
+
1645
+ status = None
1646
+ try:
1647
+ status = api.service.bounce_status_instance(service=service, instance=instance)
1648
+ except api.api_error as e:
1649
+ if e.status == 404: # non-existent instance
1650
+ # TODO(PAASTA-17290): just print the error message so that we
1651
+ # can distinguish between sources of 404s
1652
+ log.warning(
1653
+ "Can't get status for instance {}, service {} in "
1654
+ "cluster {}. This is normally because it is a new "
1655
+ "service that hasn't been deployed by PaaSTA yet.".format(
1656
+ instance, service, cluster
1657
+ )
1658
+ )
1659
+ elif e.status == 599: # Temporary issue
1660
+ log.warning(
1661
+ f"Temporary issue fetching service status from PaaSTA API for {cluster}. Will retry on next poll interval."
1662
+ )
1663
+ else: # 500 - error talking to api
1664
+ log.warning(
1665
+ "Error getting service status from PaaSTA API for "
1666
+ f"{cluster}: {e.status} {e.reason}"
1667
+ )
1668
+
1669
+ log.debug(f"No status for {inst_str}. Not deployed yet.")
1670
+ return False
1671
+
1672
+ if not status: # 204 - instance is not bounceable
1673
+ log.debug(
1674
+ f"{inst_str} is not a supported bounceable instance. "
1675
+ "Only long-running instances running on Kubernetes are currently "
1676
+ "supported. Continuing without watching."
1677
+ )
1678
+ return True
1679
+
1680
+ # Case: instance is stopped
1681
+ if status.expected_instance_count == 0 or status.desired_state == "stop":
1682
+ log.debug(f"{inst_str} is marked as stopped. Ignoring it.")
1683
+ return True
1684
+
1685
+ active_deploy_versions = {
1686
+ DeploymentVersion(sha=g, image_version=i) for g, i, c in status.active_versions
1687
+ }
1688
+ if version in active_deploy_versions:
1689
+ non_desired_versions = active_deploy_versions.difference({version})
1690
+ # Case: bounce in-progress
1691
+ if len(non_desired_versions) == 1:
1692
+ (other_version,) = non_desired_versions
1693
+ print(f" {inst_str} is still bouncing, from {other_version} to {version}")
1694
+ return False
1695
+
1696
+ # Case: previous bounces not yet finished when this one was triggered
1697
+ elif len(non_desired_versions) > 1:
1698
+ print(
1699
+ f" {inst_str} is still bouncing to {version}, but there are "
1700
+ f"multiple other bouncing versions running: {non_desired_versions}"
1701
+ )
1702
+ return False
1703
+ else:
1704
+ # Case: bounce not yet started
1705
+ print(
1706
+ f" {inst_str} hasn't started bouncing to {version}; "
1707
+ f"only the following versions are running: {active_deploy_versions}"
1708
+ )
1709
+ return False
1710
+
1711
+     # Case: instance is not running
+     if status.deploy_status not in {"Running", "Deploying", "Waiting"}:
+         print(
+             f" {inst_str} isn't running yet; it is in the state: {status.deploy_status}"
+         )
+         return False
+
+     # Case: not enough replicas are up for the instance to be considered bounced
+     # The bounce margin factor defines what proportion of instances we need to be "safe",
+     # so consider it scaled up "enough" if we have that proportion of instances ready.
+     required_instance_count = int(
+         math.ceil(
+             instance_config.get_bounce_margin_factor() * status.expected_instance_count
+         )
+     )
+     if required_instance_count > status.running_instance_count:
+         print(
+             f" {inst_str} has only {status.running_instance_count} replicas up, "
+             f"below the required minimum of {required_instance_count}"
+         )
+         return False
+
+     # Case: completed
+     print(
+         f"Complete: {service}.{instance} on {cluster} is 100% deployed at "
+         f"{status.running_instance_count} replicas on {DeploymentVersion(sha=status.active_versions[0][0], image_version=status.active_versions[0][1])}"
+     )
+     return True
1739
+
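To make the bounce-margin arithmetic above concrete: with 10 expected replicas and a bounce_margin_factor of 0.95 the check requires ceil(9.5) = 10 running replicas, while a factor of 0.8 is satisfied by ceil(8.0) = 8. A tiny illustration (required_replicas is an ad-hoc helper name, not a paasta function):

    import math

    def required_replicas(bounce_margin_factor: float, expected_instance_count: int) -> int:
        return int(math.ceil(bounce_margin_factor * expected_instance_count))

    assert required_replicas(0.95, 10) == 10
    assert required_replicas(0.8, 10) == 8
    assert required_replicas(1.0, 3) == 3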
1740
+
1741
+ WAIT_FOR_INSTANCE_CLASSES = [
1742
+ KubernetesDeploymentConfig,
1743
+ EksDeploymentConfig,
1744
+ CassandraClusterDeploymentConfig,
1745
+ ]
1746
+
1747
+
1748
+ def get_instance_configs_for_service_in_cluster_and_deploy_group(
1749
+ service_configs: PaastaServiceConfigLoader, cluster: str, deploy_group: str
1750
+ ) -> Iterator[LongRunningServiceConfig]:
1751
+ for instance_class in WAIT_FOR_INSTANCE_CLASSES:
1752
+ for instance_config in service_configs.instance_configs(
1753
+ cluster=cluster, instance_type_class=instance_class
1754
+ ):
1755
+ if instance_config.get_deploy_group() == deploy_group:
1756
+ yield instance_config
1757
+
1758
+
1759
+ def get_instance_configs_for_service_in_deploy_group_all_clusters(
1760
+ service: str, deploy_group: str, soa_dir: str
1761
+ ) -> Dict[str, List[LongRunningServiceConfig]]:
1762
+ service_configs = PaastaServiceConfigLoader(
1763
+ service=service, soa_dir=soa_dir, load_deployments=False
1764
+ )
1765
+
1766
+ instance_configs_per_cluster = {}
1767
+
1768
+ api_endpoints = load_system_paasta_config().get_api_endpoints()
1769
+ for cluster in service_configs.clusters:
1770
+ if cluster not in api_endpoints:
1771
+ print(
1772
+ PaastaColors.red(
1773
+ "Cluster %s is NOT in paasta-api endpoints config." % cluster
1774
+ )
1775
+ )
1776
+ raise NoSuchCluster
1777
+
1778
+ instance_configs_per_cluster[cluster] = list(
1779
+ get_instance_configs_for_service_in_cluster_and_deploy_group(
1780
+ service_configs, cluster, deploy_group
1781
+ )
1782
+ )
1783
+
1784
+ return instance_configs_per_cluster
1785
+
1786
+
1787
+ async def wait_for_deployment(
+     service: str,
+     deploy_group: str,
+     git_sha: str,
+     soa_dir: str,
+     timeout: float,
+     image_version: Optional[str] = None,
+     instance_configs_per_cluster: Optional[
+         Dict[str, List[LongRunningServiceConfig]]
+     ] = None,
+     progress: Optional[Progress] = None,
+     polling_interval: Optional[float] = None,
+     diagnosis_interval: Optional[float] = None,
+     time_before_first_diagnosis: Optional[float] = None,
+     notify_fn: Optional[Callable[[str], None]] = None,
+ ) -> Optional[int]:
1803
+ if not instance_configs_per_cluster:
1804
+ instance_configs_per_cluster = (
1805
+ get_instance_configs_for_service_in_deploy_group_all_clusters(
1806
+ service, deploy_group, soa_dir
1807
+ )
1808
+ )
1809
+ total_instances = sum(len(ics) for ics in instance_configs_per_cluster.values())
1810
+
1811
+ target_version = DeploymentVersion(sha=git_sha, image_version=image_version)
1812
+ if not instance_configs_per_cluster:
1813
+ _log(
1814
+ service=service,
1815
+ component="deploy",
1816
+ line=(
1817
+ "Couldn't find any long-running instances for service {} in deploy group {}. Exiting.".format(
1818
+ service, deploy_group
1819
+ )
1820
+ ),
1821
+ level="event",
1822
+ )
1823
+ return None
1824
+
1825
+ print(
1826
+ "Waiting for deployment of {} for '{}' to complete...".format(
1827
+ target_version, deploy_group
1828
+ )
1829
+ )
1830
+
1831
+ system_paasta_config = load_system_paasta_config()
1832
+ max_workers = system_paasta_config.get_mark_for_deployment_max_polling_threads()
1833
+ if polling_interval is None:
1834
+ polling_interval = (
1835
+ system_paasta_config.get_mark_for_deployment_default_polling_interval()
1836
+ )
1837
+ if diagnosis_interval is None:
1838
+ diagnosis_interval = (
1839
+ system_paasta_config.get_mark_for_deployment_default_diagnosis_interval()
1840
+ )
1841
+ if time_before_first_diagnosis is None:
1842
+ time_before_first_diagnosis = (
1843
+ system_paasta_config.get_mark_for_deployment_default_time_before_first_diagnosis()
1844
+ )
1845
+
1846
+ with progressbar.ProgressBar(maxval=total_instances) as bar:
1847
+ instance_done_futures = []
1848
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
1849
+ for cluster, instance_configs in instance_configs_per_cluster.items():
1850
+ for instance_config in instance_configs:
1851
+ instance_done_futures.append(
1852
+ asyncio.ensure_future(
1853
+ wait_until_instance_is_done(
1854
+ executor,
1855
+ service,
1856
+ instance_config.get_instance(),
1857
+ cluster,
1858
+ target_version,
1859
+ instance_config,
1860
+ polling_interval=polling_interval,
1861
+ diagnosis_interval=diagnosis_interval,
1862
+ time_before_first_diagnosis=time_before_first_diagnosis,
1863
+ should_ping_for_unhealthy_pods=instance_config.get_should_ping_for_unhealthy_pods(
1864
+ system_paasta_config.get_mark_for_deployment_should_ping_for_unhealthy_pods()
1865
+ ),
1866
+ notify_fn=notify_fn,
1867
+ ),
1868
+ )
1869
+ )
1870
+
1871
+ remaining_instances: Dict[str, Set[str]] = {
1872
+ cluster: {ic.get_instance() for ic in instance_configs}
1873
+ for cluster, instance_configs in instance_configs_per_cluster.items()
1874
+ }
1875
+ finished_instances = 0
1876
+
1877
+ async def periodically_update_progressbar() -> None:
1878
+ while True:
1879
+ await asyncio.sleep(60)
1880
+ bar.update(finished_instances)
1881
+ print()
1882
+
1883
+ periodically_update_progressbar_task = asyncio.create_task(
1884
+ periodically_update_progressbar()
1885
+ )
1886
+
1887
+ try:
1888
+ for coro in asyncio.as_completed(
1889
+ instance_done_futures, timeout=timeout
1890
+ ):
1891
+ cluster, instance = await coro
1892
+ finished_instances += 1
1893
+ bar.update(finished_instances)
1894
+ if progress is not None:
1895
+ progress.percent = bar.percentage
1896
+ remaining_instances[cluster].remove(instance)
1897
+ progress.waiting_on = remaining_instances
1898
+ except asyncio.TimeoutError:
1899
+ _log(
1900
+ service=service,
1901
+ component="deploy",
1902
+ line=compose_timeout_message(
1903
+ remaining_instances,
1904
+ timeout,
1905
+ deploy_group,
1906
+ service,
1907
+ target_version,
1908
+ ),
1909
+ level="event",
1910
+ )
1911
+ raise TimeoutError
1912
+ except asyncio.CancelledError:
1913
+ # Wait for all the tasks to finish before closing out the ThreadPoolExecutor, to avoid RuntimeError('cannot schedule new futures after shutdown')
1914
+ for coro in instance_done_futures:
1915
+ coro.cancel()
1916
+ try:
1917
+ await coro
1918
+ except asyncio.CancelledError:
1919
+ pass
1920
+ raise
1921
+ else:
1922
+ sys.stdout.flush()
1923
+ if progress is not None:
1924
+ progress.percent = 100.0
1925
+ progress.waiting_on = None
1926
+ return 0
1927
+ finally:
1928
+ periodically_update_progressbar_task.cancel()
1929
+
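wait_for_deployment fans out one wait_until_instance_is_done task per instance and drains them with asyncio.as_completed(..., timeout=...), which yields awaitables in completion order and raises asyncio.TimeoutError once the deadline passes with work still outstanding. A minimal sketch of that behaviour, independent of paasta (the coroutine names here are invented):

    import asyncio

    async def finish_after(name: str, delay: float) -> str:
        await asyncio.sleep(delay)
        return name

    async def main() -> None:
        tasks = [
            asyncio.ensure_future(finish_after("fast", 0.1)),
            asyncio.ensure_future(finish_after("slow", 5.0)),
        ]
        try:
            for coro in asyncio.as_completed(tasks, timeout=1.0):
                print("finished:", await coro)
        except asyncio.TimeoutError:
            print("timed out with work still outstanding")
        finally:
            # Mirror the cleanup above: cancel whatever is still running.
            for task in tasks:
                task.cancel()

    asyncio.run(main())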
1930
+
1931
+ def compose_timeout_message(
1932
+ remaining_instances: Mapping[str, Collection[str]],
1933
+ timeout: float,
1934
+ deploy_group: str,
1935
+ service: str,
1936
+ version: DeploymentVersion,
1937
+ ) -> str:
1938
+ paasta_status = []
1939
+ paasta_logs = []
1940
+ for cluster, instances in sorted(remaining_instances.items()):
1941
+ if instances:
1942
+ joined_instances = ",".join(instances)
1943
+ paasta_status.append(
1944
+ "paasta status -c {cluster} -s {service} -i {instances}".format(
1945
+ cluster=cluster, service=service, instances=joined_instances
1946
+ )
1947
+ )
1948
+ paasta_logs.append(
1949
+ "paasta logs -c {cluster} -s {service} -i {instances} -C deploy -l 1000".format(
1950
+ cluster=cluster, service=service, instances=joined_instances
1951
+ )
1952
+ )
1953
+
1954
+ return (
1955
+ "\n\nTimed out after {timeout} seconds, waiting for {service} "
1956
+ "in {deploy_group} to be deployed by PaaSTA.\n"
1957
+ "This probably means the deploy hasn't succeeded. The new service "
1958
+ "might not be healthy or one or more clusters could be having issues.\n\n"
1959
+ "To debug, follow steps in {stuck_bounce_runbook}, "
1960
+ "or try running the following to see the status of instances we tried to deploy:\n\n"
1961
+ " {status_commands}\n\n {logs_commands}"
1962
+ "\n\nIf the service is known to be slow to start you may wish to "
1963
+ "increase the timeout on this step.\n"
1964
+ "To wait a little longer run:\n\n"
1965
+ " paasta wait-for-deployment -s {service} -l {deploy_group} -c {git_sha}{image_arg}".format(
1966
+ timeout=timeout,
1967
+ deploy_group=deploy_group,
1968
+ service=service,
1969
+ git_sha=version.sha,
1970
+ image_arg=f" --image-version {version.image_version}"
1971
+ if version.image_version
1972
+ else "",
1973
+ status_commands="\n ".join(paasta_status),
1974
+ logs_commands="\n ".join(paasta_logs),
1975
+ stuck_bounce_runbook=os.environ.get(
1976
+ "STUCK_BOUNCE_RUNBOOK",
1977
+ DEFAULT_STUCK_BOUNCE_RUNBOOK,
1978
+ ),
1979
+ )
1980
+ )
1981
+
1982
+
1983
+ class NoSuchCluster(Exception):
1984
+ """To be raised by wait_for_deployment() when a service has a
1985
+ kubernetes config for a cluster that is not listed in /etc/paasta/api_endpoints.json.
1986
+ """
1987
+
1988
+ pass