paasta-tools 1.21.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348) hide show
  1. k8s_itests/__init__.py +0 -0
  2. k8s_itests/test_autoscaling.py +23 -0
  3. k8s_itests/utils.py +38 -0
  4. paasta_tools/__init__.py +20 -0
  5. paasta_tools/adhoc_tools.py +142 -0
  6. paasta_tools/api/__init__.py +13 -0
  7. paasta_tools/api/api.py +330 -0
  8. paasta_tools/api/api_docs/swagger.json +2323 -0
  9. paasta_tools/api/client.py +106 -0
  10. paasta_tools/api/settings.py +33 -0
  11. paasta_tools/api/tweens/__init__.py +6 -0
  12. paasta_tools/api/tweens/auth.py +125 -0
  13. paasta_tools/api/tweens/profiling.py +108 -0
  14. paasta_tools/api/tweens/request_logger.py +124 -0
  15. paasta_tools/api/views/__init__.py +13 -0
  16. paasta_tools/api/views/autoscaler.py +100 -0
  17. paasta_tools/api/views/exception.py +45 -0
  18. paasta_tools/api/views/flink.py +73 -0
  19. paasta_tools/api/views/instance.py +395 -0
  20. paasta_tools/api/views/pause_autoscaler.py +71 -0
  21. paasta_tools/api/views/remote_run.py +113 -0
  22. paasta_tools/api/views/resources.py +76 -0
  23. paasta_tools/api/views/service.py +35 -0
  24. paasta_tools/api/views/version.py +25 -0
  25. paasta_tools/apply_external_resources.py +79 -0
  26. paasta_tools/async_utils.py +109 -0
  27. paasta_tools/autoscaling/__init__.py +0 -0
  28. paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
  29. paasta_tools/autoscaling/forecasting.py +106 -0
  30. paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
  31. paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
  32. paasta_tools/autoscaling/utils.py +52 -0
  33. paasta_tools/bounce_lib.py +184 -0
  34. paasta_tools/broadcast_log_to_services.py +62 -0
  35. paasta_tools/cassandracluster_tools.py +210 -0
  36. paasta_tools/check_autoscaler_max_instances.py +212 -0
  37. paasta_tools/check_cassandracluster_services_replication.py +35 -0
  38. paasta_tools/check_flink_services_health.py +203 -0
  39. paasta_tools/check_kubernetes_api.py +57 -0
  40. paasta_tools/check_kubernetes_services_replication.py +141 -0
  41. paasta_tools/check_oom_events.py +244 -0
  42. paasta_tools/check_services_replication_tools.py +324 -0
  43. paasta_tools/check_spark_jobs.py +234 -0
  44. paasta_tools/cleanup_kubernetes_cr.py +138 -0
  45. paasta_tools/cleanup_kubernetes_crd.py +145 -0
  46. paasta_tools/cleanup_kubernetes_jobs.py +344 -0
  47. paasta_tools/cleanup_tron_namespaces.py +96 -0
  48. paasta_tools/cli/__init__.py +13 -0
  49. paasta_tools/cli/authentication.py +85 -0
  50. paasta_tools/cli/cli.py +260 -0
  51. paasta_tools/cli/cmds/__init__.py +13 -0
  52. paasta_tools/cli/cmds/autoscale.py +143 -0
  53. paasta_tools/cli/cmds/check.py +334 -0
  54. paasta_tools/cli/cmds/cook_image.py +147 -0
  55. paasta_tools/cli/cmds/get_docker_image.py +76 -0
  56. paasta_tools/cli/cmds/get_image_version.py +172 -0
  57. paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
  58. paasta_tools/cli/cmds/info.py +155 -0
  59. paasta_tools/cli/cmds/itest.py +117 -0
  60. paasta_tools/cli/cmds/list.py +66 -0
  61. paasta_tools/cli/cmds/list_clusters.py +42 -0
  62. paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
  63. paasta_tools/cli/cmds/list_namespaces.py +84 -0
  64. paasta_tools/cli/cmds/local_run.py +1396 -0
  65. paasta_tools/cli/cmds/logs.py +1601 -0
  66. paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
  67. paasta_tools/cli/cmds/mesh_status.py +174 -0
  68. paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
  69. paasta_tools/cli/cmds/push_to_registry.py +275 -0
  70. paasta_tools/cli/cmds/remote_run.py +252 -0
  71. paasta_tools/cli/cmds/rollback.py +347 -0
  72. paasta_tools/cli/cmds/secret.py +549 -0
  73. paasta_tools/cli/cmds/security_check.py +59 -0
  74. paasta_tools/cli/cmds/spark_run.py +1400 -0
  75. paasta_tools/cli/cmds/start_stop_restart.py +401 -0
  76. paasta_tools/cli/cmds/status.py +2302 -0
  77. paasta_tools/cli/cmds/validate.py +1012 -0
  78. paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
  79. paasta_tools/cli/fsm/__init__.py +13 -0
  80. paasta_tools/cli/fsm/autosuggest.py +82 -0
  81. paasta_tools/cli/fsm/template/README.md +8 -0
  82. paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
  83. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
  84. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
  85. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
  86. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
  87. paasta_tools/cli/fsm_cmd.py +121 -0
  88. paasta_tools/cli/paasta_tabcomplete.sh +23 -0
  89. paasta_tools/cli/schemas/adhoc_schema.json +199 -0
  90. paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
  91. paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
  92. paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
  93. paasta_tools/cli/schemas/deploy_schema.json +173 -0
  94. paasta_tools/cli/schemas/eks_schema.json +970 -0
  95. paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
  96. paasta_tools/cli/schemas/rollback_schema.json +160 -0
  97. paasta_tools/cli/schemas/service_schema.json +25 -0
  98. paasta_tools/cli/schemas/smartstack_schema.json +322 -0
  99. paasta_tools/cli/schemas/tron_schema.json +699 -0
  100. paasta_tools/cli/utils.py +1118 -0
  101. paasta_tools/clusterman.py +21 -0
  102. paasta_tools/config_utils.py +385 -0
  103. paasta_tools/contrib/__init__.py +0 -0
  104. paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
  105. paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
  106. paasta_tools/contrib/check_orphans.py +306 -0
  107. paasta_tools/contrib/create_dynamodb_table.py +35 -0
  108. paasta_tools/contrib/create_paasta_playground.py +105 -0
  109. paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
  110. paasta_tools/contrib/get_running_task_allocation.py +346 -0
  111. paasta_tools/contrib/habitat_fixer.py +86 -0
  112. paasta_tools/contrib/ide_helper.py +316 -0
  113. paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
  114. paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
  115. paasta_tools/contrib/kill_bad_containers.py +109 -0
  116. paasta_tools/contrib/mass-deploy-tag.sh +44 -0
  117. paasta_tools/contrib/mock_patch_checker.py +86 -0
  118. paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
  119. paasta_tools/contrib/render_template.py +129 -0
  120. paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
  121. paasta_tools/contrib/service_shard_remove.py +157 -0
  122. paasta_tools/contrib/service_shard_update.py +373 -0
  123. paasta_tools/contrib/shared_ip_check.py +77 -0
  124. paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
  125. paasta_tools/delete_kubernetes_deployments.py +89 -0
  126. paasta_tools/deployment_utils.py +44 -0
  127. paasta_tools/docker_wrapper.py +234 -0
  128. paasta_tools/docker_wrapper_imports.py +13 -0
  129. paasta_tools/drain_lib.py +351 -0
  130. paasta_tools/dump_locally_running_services.py +71 -0
  131. paasta_tools/eks_tools.py +119 -0
  132. paasta_tools/envoy_tools.py +373 -0
  133. paasta_tools/firewall.py +504 -0
  134. paasta_tools/firewall_logging.py +154 -0
  135. paasta_tools/firewall_update.py +172 -0
  136. paasta_tools/flink_tools.py +345 -0
  137. paasta_tools/flinkeks_tools.py +90 -0
  138. paasta_tools/frameworks/__init__.py +0 -0
  139. paasta_tools/frameworks/adhoc_scheduler.py +71 -0
  140. paasta_tools/frameworks/constraints.py +87 -0
  141. paasta_tools/frameworks/native_scheduler.py +652 -0
  142. paasta_tools/frameworks/native_service_config.py +301 -0
  143. paasta_tools/frameworks/task_store.py +245 -0
  144. paasta_tools/generate_all_deployments +9 -0
  145. paasta_tools/generate_authenticating_services.py +94 -0
  146. paasta_tools/generate_deployments_for_service.py +255 -0
  147. paasta_tools/generate_services_file.py +114 -0
  148. paasta_tools/generate_services_yaml.py +30 -0
  149. paasta_tools/hacheck.py +76 -0
  150. paasta_tools/instance/__init__.py +0 -0
  151. paasta_tools/instance/hpa_metrics_parser.py +122 -0
  152. paasta_tools/instance/kubernetes.py +1362 -0
  153. paasta_tools/iptables.py +240 -0
  154. paasta_tools/kafkacluster_tools.py +143 -0
  155. paasta_tools/kubernetes/__init__.py +0 -0
  156. paasta_tools/kubernetes/application/__init__.py +0 -0
  157. paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
  158. paasta_tools/kubernetes/application/tools.py +90 -0
  159. paasta_tools/kubernetes/bin/__init__.py +0 -0
  160. paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
  161. paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
  162. paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
  163. paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
  164. paasta_tools/kubernetes/remote_run.py +558 -0
  165. paasta_tools/kubernetes_tools.py +4679 -0
  166. paasta_tools/list_kubernetes_service_instances.py +128 -0
  167. paasta_tools/list_tron_namespaces.py +60 -0
  168. paasta_tools/long_running_service_tools.py +678 -0
  169. paasta_tools/mac_address.py +44 -0
  170. paasta_tools/marathon_dashboard.py +0 -0
  171. paasta_tools/mesos/__init__.py +0 -0
  172. paasta_tools/mesos/cfg.py +46 -0
  173. paasta_tools/mesos/cluster.py +60 -0
  174. paasta_tools/mesos/exceptions.py +59 -0
  175. paasta_tools/mesos/framework.py +77 -0
  176. paasta_tools/mesos/log.py +48 -0
  177. paasta_tools/mesos/master.py +306 -0
  178. paasta_tools/mesos/mesos_file.py +169 -0
  179. paasta_tools/mesos/parallel.py +52 -0
  180. paasta_tools/mesos/slave.py +115 -0
  181. paasta_tools/mesos/task.py +94 -0
  182. paasta_tools/mesos/util.py +69 -0
  183. paasta_tools/mesos/zookeeper.py +37 -0
  184. paasta_tools/mesos_maintenance.py +848 -0
  185. paasta_tools/mesos_tools.py +1051 -0
  186. paasta_tools/metrics/__init__.py +0 -0
  187. paasta_tools/metrics/metastatus_lib.py +1110 -0
  188. paasta_tools/metrics/metrics_lib.py +217 -0
  189. paasta_tools/monitoring/__init__.py +13 -0
  190. paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
  191. paasta_tools/monitoring_tools.py +652 -0
  192. paasta_tools/monkrelaycluster_tools.py +146 -0
  193. paasta_tools/nrtsearchservice_tools.py +143 -0
  194. paasta_tools/nrtsearchserviceeks_tools.py +68 -0
  195. paasta_tools/oom_logger.py +321 -0
  196. paasta_tools/paasta_deploy_tron_jobs +3 -0
  197. paasta_tools/paasta_execute_docker_command.py +123 -0
  198. paasta_tools/paasta_native_serviceinit.py +21 -0
  199. paasta_tools/paasta_service_config_loader.py +201 -0
  200. paasta_tools/paastaapi/__init__.py +29 -0
  201. paasta_tools/paastaapi/api/__init__.py +3 -0
  202. paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
  203. paasta_tools/paastaapi/api/default_api.py +569 -0
  204. paasta_tools/paastaapi/api/remote_run_api.py +604 -0
  205. paasta_tools/paastaapi/api/resources_api.py +157 -0
  206. paasta_tools/paastaapi/api/service_api.py +1736 -0
  207. paasta_tools/paastaapi/api_client.py +818 -0
  208. paasta_tools/paastaapi/apis/__init__.py +22 -0
  209. paasta_tools/paastaapi/configuration.py +455 -0
  210. paasta_tools/paastaapi/exceptions.py +137 -0
  211. paasta_tools/paastaapi/model/__init__.py +5 -0
  212. paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
  213. paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
  214. paasta_tools/paastaapi/model/deploy_queue.py +178 -0
  215. paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
  216. paasta_tools/paastaapi/model/envoy_backend.py +185 -0
  217. paasta_tools/paastaapi/model/envoy_location.py +184 -0
  218. paasta_tools/paastaapi/model/envoy_status.py +181 -0
  219. paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
  220. paasta_tools/paastaapi/model/flink_config.py +173 -0
  221. paasta_tools/paastaapi/model/flink_job.py +186 -0
  222. paasta_tools/paastaapi/model/flink_job_details.py +192 -0
  223. paasta_tools/paastaapi/model/flink_jobs.py +175 -0
  224. paasta_tools/paastaapi/model/float_and_error.py +173 -0
  225. paasta_tools/paastaapi/model/hpa_metric.py +176 -0
  226. paasta_tools/paastaapi/model/inline_object.py +170 -0
  227. paasta_tools/paastaapi/model/inline_response200.py +170 -0
  228. paasta_tools/paastaapi/model/inline_response2001.py +170 -0
  229. paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
  230. paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
  231. paasta_tools/paastaapi/model/instance_status.py +220 -0
  232. paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
  233. paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
  234. paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
  235. paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
  236. paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
  237. paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
  238. paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
  239. paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
  240. paasta_tools/paastaapi/model/instance_tasks.py +182 -0
  241. paasta_tools/paastaapi/model/integer_and_error.py +173 -0
  242. paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
  243. paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
  244. paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
  245. paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
  246. paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
  247. paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
  248. paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
  249. paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
  250. paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
  251. paasta_tools/paastaapi/model/remote_run_start.py +185 -0
  252. paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
  253. paasta_tools/paastaapi/model/remote_run_token.py +173 -0
  254. paasta_tools/paastaapi/model/resource.py +187 -0
  255. paasta_tools/paastaapi/model/resource_item.py +187 -0
  256. paasta_tools/paastaapi/model/resource_value.py +176 -0
  257. paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
  258. paasta_tools/paastaapi/model/smartstack_location.py +181 -0
  259. paasta_tools/paastaapi/model/smartstack_status.py +181 -0
  260. paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
  261. paasta_tools/paastaapi/model_utils.py +1879 -0
  262. paasta_tools/paastaapi/models/__init__.py +62 -0
  263. paasta_tools/paastaapi/rest.py +287 -0
  264. paasta_tools/prune_completed_pods.py +220 -0
  265. paasta_tools/puppet_service_tools.py +59 -0
  266. paasta_tools/py.typed +1 -0
  267. paasta_tools/remote_git.py +127 -0
  268. paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
  269. paasta_tools/run-paasta-api-playground.py +51 -0
  270. paasta_tools/secret_providers/__init__.py +66 -0
  271. paasta_tools/secret_providers/vault.py +214 -0
  272. paasta_tools/secret_tools.py +277 -0
  273. paasta_tools/setup_istio_mesh.py +353 -0
  274. paasta_tools/setup_kubernetes_cr.py +412 -0
  275. paasta_tools/setup_kubernetes_crd.py +138 -0
  276. paasta_tools/setup_kubernetes_internal_crd.py +154 -0
  277. paasta_tools/setup_kubernetes_job.py +353 -0
  278. paasta_tools/setup_prometheus_adapter_config.py +1028 -0
  279. paasta_tools/setup_tron_namespace.py +248 -0
  280. paasta_tools/slack.py +75 -0
  281. paasta_tools/smartstack_tools.py +676 -0
  282. paasta_tools/spark_tools.py +283 -0
  283. paasta_tools/synapse_srv_namespaces_fact.py +42 -0
  284. paasta_tools/tron/__init__.py +0 -0
  285. paasta_tools/tron/client.py +158 -0
  286. paasta_tools/tron/tron_command_context.py +194 -0
  287. paasta_tools/tron/tron_timeutils.py +101 -0
  288. paasta_tools/tron_tools.py +1448 -0
  289. paasta_tools/utils.py +4307 -0
  290. paasta_tools/yaml_tools.py +44 -0
  291. paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
  292. paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
  293. paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
  294. paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
  295. paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
  296. paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
  297. paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
  298. paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
  299. paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
  300. paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
  301. paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
  302. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
  303. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
  304. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
  305. paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
  306. paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
  307. paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
  308. paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
  309. paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
  310. paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
  311. paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
  312. paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
  313. paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
  314. paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
  315. paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
  316. paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
  317. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
  318. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
  319. paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
  320. paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
  321. paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
  322. paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
  323. paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
  324. paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
  325. paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
  326. paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
  327. paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
  328. paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
  329. paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
  330. paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
  331. paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
  332. paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
  333. paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
  334. paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
  335. paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
  336. paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
  337. paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
  338. paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
  339. paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
  340. paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
  341. paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
  342. paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
  343. paasta_tools-1.21.3.dist-info/LICENSE +201 -0
  344. paasta_tools-1.21.3.dist-info/METADATA +74 -0
  345. paasta_tools-1.21.3.dist-info/RECORD +348 -0
  346. paasta_tools-1.21.3.dist-info/WHEEL +5 -0
  347. paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
  348. paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1110 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2015-2016 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import copy
16
+ import itertools
17
+ import math
18
+ import re
19
+ from collections import Counter
20
+ from collections import namedtuple
21
+ from typing import Any
22
+ from typing import Callable
23
+ from typing import Mapping
24
+ from typing import NamedTuple
25
+ from typing import Sequence
26
+ from typing import Tuple
27
+ from typing import TypeVar
28
+
29
+ import a_sync
30
+ from humanize import naturalsize
31
+ from kubernetes.client import V1Node
32
+ from kubernetes.client import V1Pod
33
+ from mypy_extensions import TypedDict
34
+ from typing_extensions import Counter as _Counter
35
+
36
+ from paasta_tools.kubernetes_tools import get_all_nodes_cached
37
+ from paasta_tools.kubernetes_tools import get_all_pods_cached
38
+ from paasta_tools.kubernetes_tools import get_pod_status
39
+ from paasta_tools.kubernetes_tools import is_node_ready
40
+ from paasta_tools.kubernetes_tools import KubeClient
41
+ from paasta_tools.kubernetes_tools import list_all_deployments
42
+ from paasta_tools.kubernetes_tools import paasta_prefixed
43
+ from paasta_tools.kubernetes_tools import PodStatus
44
+ from paasta_tools.mesos.master import MesosMetrics
45
+ from paasta_tools.mesos.master import MesosState
46
+ from paasta_tools.mesos_maintenance import MAINTENANCE_ROLE
47
+ from paasta_tools.mesos_tools import get_all_tasks_from_state
48
+ from paasta_tools.mesos_tools import get_mesos_quorum
49
+ from paasta_tools.mesos_tools import get_number_of_mesos_masters
50
+ from paasta_tools.mesos_tools import get_zookeeper_host_path
51
+ from paasta_tools.mesos_tools import is_task_terminal
52
+ from paasta_tools.mesos_tools import MesosResources
53
+ from paasta_tools.mesos_tools import MesosTask
54
+ from paasta_tools.utils import PaastaColors
55
+ from paasta_tools.utils import print_with_indent
56
+
57
+
58
# Fallback resource-request strings used by ResourceParser when a container
# spec omits an explicit request (see ResourceParser.cpus/mem/disk below).
# Values are Kubernetes quantity strings parsed by suffixed_number_value.
DEFAULT_KUBERNETES_CPU_REQUEST = "100m"
DEFAULT_KUBERNETES_MEMORY_REQUEST = "200M"
DEFAULT_KUBERNETES_DISK_REQUEST = "0"
61
+
62
+
63
class ResourceInfo(namedtuple("ResourceInfo", ["cpus", "mem", "disk", "gpus"])):
    """Immutable bundle of resource totals; ``gpus`` defaults to 0 when omitted."""

    def __new__(cls, cpus, mem, disk, gpus=0):
        # Delegate with explicit keywords so the default for gpus applies.
        return super().__new__(cls, cpus=cpus, mem=mem, disk=disk, gpus=gpus)
66
+
67
+
68
class HealthCheckResult(NamedTuple):
    """Outcome of a single health assertion: a human-readable message plus a pass/fail flag."""

    message: str
    healthy: bool
71
+
72
+
73
class ResourceUtilization(NamedTuple):
    """Point-in-time usage snapshot for one resource metric."""

    # Name of the resource being measured (e.g. a cpu/mem/disk label).
    metric: str
    total: int
    free: int
77
+
78
+
79
def get_num_masters() -> int:
    """Return the number of mesos masters registered under the zookeeper path."""
    zk = get_zookeeper_host_path()
    return get_number_of_mesos_masters(zk.host, zk.path)
85
+
86
+
87
def get_mesos_cpu_status(
    metrics: MesosMetrics, mesos_state: MesosState
) -> Tuple[int, int, int]:
    """Analyze mesos CPU metrics, counting maintenance reservations as used.

    :param metrics: mesos metrics dictionary.
    :param mesos_state: mesos state dictionary.
    :returns: Tuple of total, used, and available CPUs.
    """
    total = metrics["master/cpus_total"]
    # CPUs reserved for maintenance on each slave count toward "used".
    used = metrics["master/cpus_used"] + sum(
        reserved_maintenence_resources(slave["reserved_resources"])["cpus"]
        for slave in mesos_state["slaves"]
    )
    return total, used, total - used
105
+
106
+
107
def get_kube_cpu_status(
    nodes: Sequence[V1Node],
) -> Tuple[float, float, float]:
    """Aggregate CPU capacity/allocatable across Kubernetes nodes.

    :param nodes: list of Kubernetes nodes.
    :returns: Tuple of total, used, and available CPUs.
    """
    # 0.0 start values keep the result a float even for an empty node list.
    total = sum(
        (suffixed_number_value(node.status.capacity["cpu"]) for node in nodes), 0.0
    )
    available = sum(
        (suffixed_number_value(node.status.allocatable["cpu"]) for node in nodes), 0.0
    )
    return total, total - available, available
124
+
125
+
126
def get_mesos_memory_status(
    metrics: MesosMetrics, mesos_state: MesosState
) -> Tuple[int, int, int]:
    """Analyze mesos memory metrics, counting maintenance reservations as used.

    :param metrics: mesos metrics dictionary.
    :param mesos_state: mesos state dictionary.
    :returns: Tuple of total, used, and available memory in Mi.
    """
    total = metrics["master/mem_total"]
    # Memory reserved for maintenance on each slave counts toward "used".
    used = metrics["master/mem_used"] + sum(
        reserved_maintenence_resources(slave["reserved_resources"])["mem"]
        for slave in mesos_state["slaves"]
    )
    return total, used, total - used
144
+
145
+
146
def get_kube_memory_status(
    nodes: Sequence[V1Node],
) -> Tuple[float, float, float]:
    """Aggregate memory capacity/allocatable across Kubernetes nodes.

    :param nodes: list of Kubernetes nodes.
    :returns: Tuple of total, used, and available memory in Mi.
    """
    total = sum(
        (suffixed_number_value(node.status.capacity["memory"]) for node in nodes), 0.0
    )
    available = sum(
        (suffixed_number_value(node.status.allocatable["memory"]) for node in nodes),
        0.0,
    )
    # Convert bytes to Mi; floor division on floats mirrors the reported totals.
    total //= 1024 * 1024
    available //= 1024 * 1024
    return total, total - available, available
164
+
165
+
166
def get_mesos_disk_status(
    metrics: MesosMetrics, mesos_state: MesosState
) -> Tuple[int, int, int]:
    """Analyze mesos disk metrics, counting maintenance reservations as used.

    :param metrics: mesos metrics dictionary.
    :param mesos_state: mesos state dictionary.
    :returns: Tuple of total, used, and available disk space in Mi.
    """
    total = metrics["master/disk_total"]
    # Disk reserved for maintenance on each slave counts toward "used".
    used = metrics["master/disk_used"] + sum(
        reserved_maintenence_resources(slave["reserved_resources"])["disk"]
        for slave in mesos_state["slaves"]
    )
    return total, used, total - used
184
+
185
+
186
def get_kube_disk_status(
    nodes: Sequence[V1Node],
) -> Tuple[float, float, float]:
    """Aggregate ephemeral-storage capacity/allocatable across Kubernetes nodes.

    :param nodes: list of Kubernetes nodes.
    :returns: Tuple of total, used, and available disk space in Mi.
    """
    total = sum(
        (
            suffixed_number_value(node.status.capacity["ephemeral-storage"])
            for node in nodes
        ),
        0.0,
    )
    available = sum(
        (
            suffixed_number_value(node.status.allocatable["ephemeral-storage"])
            for node in nodes
        ),
        0.0,
    )
    # Convert bytes to Mi; floor division on floats mirrors the reported totals.
    total //= 1024 * 1024
    available //= 1024 * 1024
    return total, total - available, available
205
+
206
+
207
def get_mesos_gpu_status(
    metrics: MesosMetrics, mesos_state: MesosState
) -> Tuple[int, int, int]:
    """Analyze mesos GPU metrics, counting maintenance reservations as used.

    :param metrics: mesos metrics dictionary.
    :param mesos_state: mesos state dictionary.
    :returns: Tuple of total, used, and available GPUs.
    """
    total = metrics["master/gpus_total"]
    # GPUs reserved for maintenance on each slave count toward "used".
    used = metrics["master/gpus_used"] + sum(
        reserved_maintenence_resources(slave["reserved_resources"])["gpus"]
        for slave in mesos_state["slaves"]
    )
    return total, used, total - used
224
+
225
+
226
def get_kube_gpu_status(
    nodes: Sequence[V1Node],
) -> Tuple[float, float, float]:
    """Aggregate nvidia.com/gpu capacity/allocatable across Kubernetes nodes.

    :param nodes: list of Kubernetes nodes.
    :returns: Tuple of total, used, and available GPUs.
    """
    # Nodes without GPUs omit the key entirely, hence the "0" fallback.
    total = sum(
        (
            suffixed_number_value(node.status.capacity.get("nvidia.com/gpu", "0"))
            for node in nodes
        ),
        0.0,
    )
    available = sum(
        (
            suffixed_number_value(node.status.allocatable.get("nvidia.com/gpu", "0"))
            for node in nodes
        ),
        0.0,
    )
    return total, total - available, available
245
+
246
+
247
def filter_mesos_state_metrics(dictionary: Mapping[str, Any]) -> Mapping[str, Any]:
    """Keep only the mesos resource entries this module reports on."""
    wanted = {"cpus", "mem", "disk", "gpus"}
    return {k: v for k, v in dictionary.items() if k in wanted}
250
+
251
+
252
def filter_kube_resources(dictionary: Mapping[str, str]) -> Mapping[str, str]:
    """Keep only the Kubernetes resource entries this module reports on."""
    wanted = {"cpu", "memory", "ephemeral-storage", "nvidia.com/gpu"}
    return {k: v for k, v in dictionary.items() if k in wanted}
255
+
256
+
257
class ResourceParser:
    """Convert a container's resource-request mapping into numeric values,
    substituting the module defaults when a key (or the mapping) is absent."""

    @staticmethod
    def cpus(resources):
        requests = resources if resources else {}
        value = requests.get("cpu", DEFAULT_KUBERNETES_CPU_REQUEST)
        return suffixed_number_value(value)

    @staticmethod
    def mem(resources):
        requests = resources if resources else {}
        value = requests.get("memory", DEFAULT_KUBERNETES_MEMORY_REQUEST)
        return suffixed_number_value(value)

    @staticmethod
    def disk(resources):
        requests = resources if resources else {}
        value = requests.get("ephemeral-storage", DEFAULT_KUBERNETES_DISK_REQUEST)
        return suffixed_number_value(value)
278
+
279
+
280
def allocated_node_resources(pods: Sequence[V1Pod]) -> Mapping[str, float]:
    """Total the per-container resource requests across all given pods."""
    cpus = mem = disk = 0
    for pod in pods:
        containers = pod.spec.containers
        cpus += sum(ResourceParser.cpus(c.resources.requests) for c in containers)
        mem += sum(ResourceParser.mem(c.resources.requests) for c in containers)
        disk += sum(ResourceParser.disk(c.resources.requests) for c in containers)
    return {"cpu": cpus, "memory": mem, "ephemeral-storage": disk}
293
+
294
+
295
def healthcheck_result_for_resource_utilization(
    resource_utilization: ResourceUtilization, threshold: int
) -> HealthCheckResult:
    """Check one ResourceUtilization tuple against a percentage threshold.

    :param resource_utilization: the resource_utilization tuple to check
    :returns: a HealthCheckResult
    """
    used = resource_utilization.total - resource_utilization.free
    try:
        utilization = percent_used(resource_utilization.total, used)
    except ZeroDivisionError:
        # A zero total means nothing can be "used"; report 0% rather than crash.
        utilization = 0
    message = (
        f"{resource_utilization.metric}: "
        f"{float(used):.2f}/{resource_utilization.total:.2f}({utilization:.2f}%) used. "
        f"Threshold ({threshold:.2f}%)"
    )
    return HealthCheckResult(message=message, healthy=utilization <= threshold)
320
+
321
+
322
def quorum_ok(masters: int, quorum: int) -> bool:
    """True when enough masters are alive to satisfy the quorum size."""
    return not masters < quorum
324
+
325
+
326
def check_threshold(percent_used: float, threshold: int) -> bool:
    """True while the remaining free percentage still exceeds the threshold."""
    free_percent = 100 - percent_used
    return free_percent > threshold
328
+
329
+
330
def percent_used(total: float, used: float) -> float:
    """Return *used* as a percentage of *total*, rounded to two decimals.

    Raises ZeroDivisionError when *total* is zero; callers handle that.
    """
    ratio = used / float(total)
    return round(100.0 * ratio, 2)
332
+
333
+
334
def assert_cpu_health(
    cpu_status: Tuple[float, float, float], threshold: int = 10
) -> HealthCheckResult:
    """Check cluster CPU usage against a free-percentage threshold.

    :param cpu_status: (total, used, available) CPU counts
    :param threshold: minimum percentage of CPUs that must remain free
    :returns: a HealthCheckResult
    """
    total, used, _available = cpu_status
    try:
        perc_used = percent_used(total, used)
    except ZeroDivisionError:
        # No CPU capacity reported at all: treat as a read failure.
        return HealthCheckResult(
            message="Error reading total available cpu from mesos!", healthy=False
        )

    if not check_threshold(perc_used, threshold):
        return HealthCheckResult(
            message="CRITICAL: Less than %d%% CPUs available. (Currently using %.2f%% of %d)"
            % (threshold, perc_used, total),
            healthy=False,
        )
    return HealthCheckResult(
        message="CPUs: %.2f / %d in use (%s)"
        % (used, total, PaastaColors.green("%.2f%%" % perc_used)),
        healthy=True,
    )
357
+
358
+
359
def assert_memory_health(
    memory_status: Tuple[float, float, float], threshold: int = 10
) -> HealthCheckResult:
    """Check cluster memory usage against a free-percentage threshold.

    :param memory_status: (total, used, _) figures; scaled down by 1024 for
        display (presumably MB -> GB — the message labels them GB)
    :param threshold: minimum percentage of memory that must remain free
    :returns: a HealthCheckResult
    """
    raw_total, raw_used, _ = memory_status
    total = raw_total / 1024
    used = raw_used / 1024

    try:
        perc_used = percent_used(total, used)
    except ZeroDivisionError:
        return HealthCheckResult(
            message="Error reading total available memory from mesos!", healthy=False
        )

    if not check_threshold(perc_used, threshold):
        return HealthCheckResult(
            message="CRITICAL: Less than %d%% memory available. (Currently using %.2f%% of %.2fGB)"
            % (threshold, perc_used, total),
            healthy=False,
        )
    return HealthCheckResult(
        message="Memory: %0.2f / %0.2fGB in use (%s)"
        % (used, total, PaastaColors.green("%.2f%%" % perc_used)),
        healthy=True,
    )
388
+
389
+
390
def assert_disk_health(
    disk_status: Tuple[float, float, float], threshold: int = 10
) -> HealthCheckResult:
    """Check cluster disk usage against a free-percentage threshold.

    :param disk_status: (total, used, _) figures; scaled down by 1024 for
        display (presumably MB -> GB — the message labels them GB)
    :param threshold: minimum percentage of disk that must remain free
    :returns: a HealthCheckResult
    """
    raw_total, raw_used, _ = disk_status
    total = raw_total / 1024
    used = raw_used / 1024

    try:
        perc_used = percent_used(total, used)
    except ZeroDivisionError:
        return HealthCheckResult(
            message="Error reading total available disk from mesos!", healthy=False
        )

    if not check_threshold(perc_used, threshold):
        return HealthCheckResult(
            message="CRITICAL: Less than %d%% disk available. (Currently using %.2f%%)"
            % (threshold, perc_used),
            healthy=False,
        )
    return HealthCheckResult(
        message="Disk: %0.2f / %0.2fGB in use (%s)"
        % (used, total, PaastaColors.green("%.2f%%" % perc_used)),
        healthy=True,
    )
419
+
420
+
421
def assert_gpu_health(
    gpu_status: Tuple[float, float, float], threshold: int = 0
) -> HealthCheckResult:
    """Check cluster GPU usage against a free-percentage threshold.

    :param gpu_status: (total, used, available) GPU counts
    :param threshold: minimum percentage of GPUs that must remain free
    :returns: a HealthCheckResult
    """
    total, used, _available = gpu_status

    if math.isclose(total, 0):
        # assume that no gpus is healthy since most machines don't have them
        return HealthCheckResult(message="No GPUs found!", healthy=True)

    perc_used = percent_used(total, used)
    if not check_threshold(perc_used, threshold):
        return HealthCheckResult(
            message="CRITICAL: Less than %d%% GPUs available. (Currently using %.2f%% of %d)"
            % (threshold, perc_used, total),
            healthy=False,
        )
    # only whole gpus can be used
    return HealthCheckResult(
        message="GPUs: %d / %d in use (%s)"
        % (used, total, PaastaColors.green("%.2f%%" % perc_used)),
        healthy=True,
    )
445
+
446
+
447
def assert_mesos_tasks_running(
    metrics: MesosMetrics,
) -> HealthCheckResult:
    """Report mesos task counts from master metrics; always healthy."""
    counts = (
        metrics["master/tasks_running"],
        metrics["master/tasks_staging"],
        metrics["master/tasks_starting"],
    )
    return HealthCheckResult(
        message="Tasks: running: %d staging: %d starting: %d" % counts,
        healthy=True,
    )
458
+
459
+
460
def assert_kube_pods_running(
    kube_client: KubeClient, namespace: str
) -> HealthCheckResult:
    """Count pod phases in *namespace*; healthy iff at least one pod runs."""
    pods = get_all_pods_cached(kube_client, namespace)
    statuses = [get_pod_status(pod) for pod in pods]
    running = statuses.count(PodStatus.RUNNING)
    pending = statuses.count(PodStatus.PENDING)
    failed = statuses.count(PodStatus.FAILED)
    return HealthCheckResult(
        message=f"Pods: running: {running} pending: {pending} failed: {failed}",
        healthy=running > 0,
    )
474
+
475
+
476
def get_mesos_slaves_health_status(
    metrics: MesosMetrics,
) -> Tuple[int, int]:
    """Return (active, inactive) slave counts from mesos master metrics."""
    active = metrics["master/slaves_active"]
    inactive = metrics["master/slaves_inactive"]
    return active, inactive
480
+
481
+
482
def get_kube_nodes_health_status(
    nodes: Sequence[V1Node],
) -> Tuple[int, int]:
    """Return (ready, not_ready) counts for the given Kubernetes nodes."""
    ready = 0
    not_ready = 0
    for node in nodes:
        if is_node_ready(node):
            ready += 1
        else:
            not_ready += 1
    return ready, not_ready
487
+
488
+
489
def assert_nodes_health(
    nodes_health_status: Tuple[int, int],
) -> HealthCheckResult:
    """Report node counts; healthy iff at least one node is active."""
    active, inactive = nodes_health_status
    return HealthCheckResult(
        message="Nodes: active: %d inactive: %d" % (active, inactive),
        healthy=active > 0,
    )
497
+
498
+
499
def assert_quorum_size() -> HealthCheckResult:
    """Compare the number of mesos masters to the configured quorum."""
    masters = get_num_masters()
    quorum = a_sync.block(get_mesos_quorum)
    if not quorum_ok(masters, quorum):
        return HealthCheckResult(
            message="CRITICAL: Number of masters (%d) less than configured quorum(%d)."
            % (masters, quorum),
            healthy=False,
        )
    return HealthCheckResult(
        message="Quorum: masters: %d configured quorum: %d " % (masters, quorum),
        healthy=True,
    )
512
+
513
+
514
# Grouping key: ordered (attribute, value) pairs, hashable for use as dict keys.
_KeyFuncRetT = Sequence[Tuple[str, str]]


class _SlaveT(TypedDict):
    # Minimal shape of a mesos slave entry as read from the master state.
    id: str
    resources: MesosResources
    reserved_resources: MesosResources
    attributes: Mapping[str, str]


# A generic "node" in the helpers below is either a mesos slave dict or a
# Kubernetes V1Node.
_GenericNodeT = TypeVar("_GenericNodeT", _SlaveT, V1Node)

# Maps a node to the grouping key used to bucket nodes.
_GenericNodeGroupingFunctionT = Callable[[_GenericNodeT], _KeyFuncRetT]

# Predicate deciding whether a node participates in a calculation.
_GenericNodeFilterFunctionT = Callable[[_GenericNodeT], bool]

# Orders a list of nodes prior to itertools.groupby-style grouping.
_GenericNodeSortFunctionT = Callable[[Sequence[_GenericNodeT]], Sequence[_GenericNodeT]]
531
+
532
+
533
def key_func_for_attribute(
    attribute: str,
) -> Callable[[_SlaveT], str]:
    """Return a closure that given a slave, will return the value of a specific
    attribute.

    :param attribute: the attribute to inspect in the slave
    :returns: a closure, which takes a slave and returns the value of an attribute
    """

    def key_func(slave):
        # Slaves lacking the attribute fall into a shared "unknown" bucket.
        return slave["attributes"].get(attribute, "unknown")

    return key_func
547
+
548
+
549
def key_func_for_attribute_multi(
    attributes: Sequence[str],
) -> _GenericNodeGroupingFunctionT:
    """Return a closure that given a slave, will return the value of a list of
    attributes, compiled into a hashable tuple.

    :param attributes: the attributes to inspect in the slave
    :returns: a closure, which takes a slave and returns the value of those attributes
    """

    def lookup(slave, attribute):
        # "hostname" lives at the top level of the slave dict, not under
        # "attributes"; everything else falls back to "unknown".
        if attribute != "hostname":
            return slave["attributes"].get(attribute, "unknown")
        return slave["hostname"]

    def key_func(slave):
        return tuple((attr, lookup(slave, attr)) for attr in attributes)

    return key_func
569
+
570
+
571
def key_func_for_attribute_multi_kube(
    attributes: Sequence[str],
) -> Callable[[V1Node], _KeyFuncRetT]:
    """Return a closure that given a node, will return the value of a list of
    attributes, compiled into a hashable tuple.

    :param attributes: the attributes to inspect in the node labels
    :returns: a closure, which takes a node and returns the value of those attributes
    """

    def lookup(node, attribute):
        # Attributes are stored as paasta-prefixed node labels.
        labels = node.metadata.labels
        return labels.get(paasta_prefixed(attribute), "unknown")

    def key_func(node):
        return tuple((attr, lookup(node, attr)) for attr in attributes)

    return key_func
588
+
589
+
590
def sort_func_for_attributes(
    attributes: Sequence[str],
) -> _GenericNodeSortFunctionT:
    """Return a function that sorts slaves successively by each attribute.

    Because each pass uses a stable sort, the final ordering is keyed
    primarily on the last attribute in the list.
    """

    def sort(slaves):
        ordered = slaves
        for attribute in attributes:
            ordered = sorted(ordered, key=key_func_for_attribute(attribute))
        return ordered

    return sort
599
+
600
+
601
def group_slaves_by_key_func(
    key_func: _GenericNodeGroupingFunctionT,
    slaves: Sequence[_GenericNodeT],
    sort_func: _GenericNodeSortFunctionT = None,
) -> Mapping[_KeyFuncRetT, Sequence[_GenericNodeT]]:
    """Bucket slaves by the value returned from *key_func*.

    :param key_func: a function which consumes a slave and returns a value
    :param slaves: a list of slaves
    :param sort_func: optional custom ordering; by default slaves are sorted
        by key_func (groupby requires equal keys to be adjacent)
    :returns: a dict of key: [slaves]
    """
    if sort_func is not None:
        ordered = sort_func(slaves)
    else:
        ordered = sorted(slaves, key=key_func)

    grouped = {}
    for key, members in itertools.groupby(ordered, key=key_func):
        grouped[key] = list(members)
    return grouped
622
+
623
+
624
class ResourceUtilizationDict(TypedDict):
    """Aggregate resource accounting for a set of slaves/nodes."""

    # Unallocated capacity remaining across the group.
    free: ResourceInfo
    # Total capacity across the group.
    total: ResourceInfo
    # Number of slaves/nodes the figures were computed over.
    slave_count: int
628
+
629
+
630
def calculate_resource_utilization_for_slaves(
    slaves: Sequence[_SlaveT], tasks: Sequence[MesosTask]
) -> ResourceUtilizationDict:
    """Given a list of slaves and a list of tasks, calculate the total available
    resource available in that list of slaves, and the resources consumed by tasks
    running on those slaves.

    :param slaves: a list of slaves to calculate resource usage for
    :param tasks: the list of tasks running in the mesos cluster
    :returns: a dict, containing keys for "free" and "total" resources. Each of these keys
    is a ResourceInfo tuple, exposing a number for cpu, disk and mem.
    """
    # Total capacity: sum of each slave's (filtered) resource vector.
    resource_total_dict: _Counter[str] = Counter()
    for slave in slaves:
        filtered_resources = filter_mesos_state_metrics(slave["resources"])
        resource_total_dict.update(Counter(filtered_resources))
    # Free capacity starts at total and is decremented below; deepcopy so the
    # total counter is not mutated by the subtractions.
    resource_free_dict = copy.deepcopy(resource_total_dict)
    for task in tasks:
        task_resources = task["resources"]
        resource_free_dict.subtract(Counter(filter_mesos_state_metrics(task_resources)))
    # Resources reserved for the maintenance role are not usable either.
    for slave in slaves:
        filtered_resources = filter_mesos_state_metrics(
            reserved_maintenence_resources(slave["reserved_resources"])
        )
        resource_free_dict.subtract(Counter(filtered_resources))
    return {
        "free": ResourceInfo(
            cpus=resource_free_dict["cpus"],
            disk=resource_free_dict["disk"],
            mem=resource_free_dict["mem"],
            gpus=resource_free_dict.get("gpus", 0),
        ),
        "total": ResourceInfo(
            cpus=resource_total_dict["cpus"],
            disk=resource_total_dict["disk"],
            mem=resource_total_dict["mem"],
            gpus=resource_total_dict.get("gpus", 0),
        ),
        "slave_count": len(slaves),
    }
670
+
671
+
672
# Multipliers for SI ("k", "M", "G", ...) and IEC binary ("Ki", "Mi", ...)
# suffixes as used by Kubernetes resource quantities; "m" is milli (1/1000),
# e.g. CPU millicores.
_IEC_NUMBER_SUFFIXES = {
    "k": 1000,
    "m": 1000**-1,
    "M": 1000**2,
    "G": 1000**3,
    "T": 1000**4,
    "P": 1000**5,
    "Ki": 1024,
    "Mi": 1024**2,
    "Gi": 1024**3,
    "Ti": 1024**4,
    "Pi": 1024**5,
}


def suffixed_number_value(s: str) -> float:
    """Parse a Kubernetes-style quantity string into a float.

    :param s: a non-negative number with an optional SI/IEC suffix,
        e.g. "100", "0.5", "200m", "2Gi"
    :returns: the numeric value scaled by the suffix multiplier; an
        unrecognized suffix is ignored and the bare number is returned.
    """
    # Accept an optional decimal fraction: the previous \d+-only pattern
    # silently truncated quantities such as "0.5" or "1.5Gi" at the dot.
    pattern = r"(?P<number>\d+(?:\.\d+)?)(?P<suff>\w*)"
    match = re.match(pattern, s)
    number, suff = match.group("number"), match.group("suff")

    multiplier = _IEC_NUMBER_SUFFIXES.get(suff, 1)
    return float(number) * multiplier
696
+
697
+
698
def suffixed_number_dict_values(d: Mapping[Any, str]) -> Mapping[Any, float]:
    """Convert every suffixed-quantity string value in *d* to a float."""
    converted = {}
    for key, value in d.items():
        converted[key] = suffixed_number_value(value)
    return converted
700
+
701
+
702
def calculate_resource_utilization_for_kube_nodes(
    nodes: Sequence[V1Node],
    pods_by_node: Mapping[str, Sequence[V1Pod]],
) -> ResourceUtilizationDict:
    """Given a list of Kubernetes nodes, calculate the total available
    resource available and the resources consumed in that list of nodes.

    :param nodes: a list of Kubernetes nodes to calculate resource usage for
    :param pods_by_node: pods keyed by the node name they are scheduled on
    :returns: a dict, containing keys for "free" and "total" resources. Each of these keys
    is a ResourceInfo tuple, exposing a number for cpu, disk and mem.
    """
    resource_total_dict: _Counter[str] = Counter()
    resource_free_dict: _Counter[str] = Counter()
    for node in nodes:
        # Allocatable capacity, with quantity strings (e.g. "2Gi") parsed to floats.
        allocatable_resources = suffixed_number_dict_values(
            filter_kube_resources(node.status.allocatable)
        )
        resource_total_dict.update(Counter(allocatable_resources))
        # Sum of container requests for the pods scheduled on this node.
        allocated_resources = allocated_node_resources(pods_by_node[node.metadata.name])
        resource_free_dict.update(
            Counter(
                {
                    "cpu": allocatable_resources["cpu"] - allocated_resources["cpu"],
                    "ephemeral-storage": allocatable_resources["ephemeral-storage"]
                    - allocated_resources["ephemeral-storage"],
                    "memory": allocatable_resources["memory"]
                    - allocated_resources["memory"],
                }
            )
        )
    return {
        "free": ResourceInfo(
            cpus=resource_free_dict["cpu"],
            # Divided by 1024**2 — presumably bytes -> MiB to match the mesos
            # code paths; TODO confirm the unit of the parsed quantities.
            disk=resource_free_dict["ephemeral-storage"] / (1024**2),
            mem=resource_free_dict["memory"] / (1024**2),
            gpus=resource_free_dict.get("nvidia.com/gpu", 0),
        ),
        "total": ResourceInfo(
            cpus=resource_total_dict["cpu"],
            disk=resource_total_dict["ephemeral-storage"] / (1024**2),
            mem=resource_total_dict["memory"] / (1024**2),
            gpus=resource_total_dict.get("nvidia.com/gpu", 0),
        ),
        "slave_count": len(nodes),
    }
747
+
748
+
749
def filter_tasks_for_slaves(
    slaves: Sequence[_SlaveT], tasks: Sequence[MesosTask]
) -> Sequence[MesosTask]:
    """Given a list of slaves and a list of tasks, return a filtered
    list of tasks, where those returned belong to slaves in the list of
    slaves.

    :param slaves: the list of slaves which the tasks provided should be
        running on.
    :param tasks: the tasks to filter
    :returns: a list of tasks, identical to that provided by the tasks param,
        but with only those where the task is running on one of the provided
        slaves included.
    """
    # A set gives O(1) membership tests; the previous list made this loop
    # O(len(slaves)) per task.
    slave_ids = {slave["id"] for slave in slaves}
    return [task for task in tasks if task["slave_id"] in slave_ids]
764
+
765
+
766
def make_filter_slave_func(
    attribute: str, values: Sequence[str]
) -> _GenericNodeFilterFunctionT:
    """Return a predicate matching slaves whose *attribute* is one of *values*."""

    def filter_func(slave):
        current = slave["attributes"].get(attribute, None)
        return current in values

    return filter_func
773
+
774
+
775
def filter_slaves(
    slaves: Sequence[_GenericNodeT], filters: Sequence[_GenericNodeFilterFunctionT]
) -> Sequence[_GenericNodeT]:
    """Filter slaves by attributes.

    :param slaves: list of slaves to filter
    :param filters: list of functions that take a slave and return whether the
        slave should be included; None means "no filtering"
    :returns: list of slaves that return true for all the filters
    """
    if filters is None:
        return slaves
    kept = []
    for slave in slaves:
        if all(f(slave) for f in filters):
            kept.append(slave)
    return kept
788
+
789
+
790
def get_resource_utilization_by_grouping(
    grouping_func: _GenericNodeGroupingFunctionT,
    mesos_state: MesosState,
    # NOTE(review): mutable [] default — safe only because filter_slaves never
    # mutates it; consider () instead.
    filters: Sequence[_GenericNodeFilterFunctionT] = [],
    sort_func: _GenericNodeSortFunctionT = None,
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
    """Given a function used to group slaves and mesos state, calculate
    resource utilization for each value of a given attribute.

    :grouping_func: a function that given a slave, will return the value of an
    attribute to group by.
    :param mesos_state: the mesos state
    :param filters: filters to apply to the slaves in the calculation, with
    filtering preformed by filter_slaves
    :param sort_func: a function that given a list of slaves, will return the
    sorted list of slaves.
    :returns: a dict of {attribute_value: resource_usage}, where resource usage
    is the dict returned by ``calculate_resource_utilization_for_slaves`` for
    slaves grouped by attribute value.
    """
    slaves: Sequence[_SlaveT] = mesos_state.get("slaves", [])
    slaves = filter_slaves(slaves, filters)
    # Checks the raw state, so an empty *filtered* list does not raise here.
    if not has_registered_slaves(mesos_state):
        raise ValueError("There are no slaves registered in the mesos state.")

    # Terminal tasks no longer consume resources; drop them before accounting.
    tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    non_terminal_tasks = [task for task in tasks if not is_task_terminal(task)]
    slave_groupings = group_slaves_by_key_func(grouping_func, slaves, sort_func)

    return {
        attribute_value: calculate_resource_utilization_for_slaves(
            slaves=slaves, tasks=filter_tasks_for_slaves(slaves, non_terminal_tasks)
        )
        for attribute_value, slaves in slave_groupings.items()
    }
825
+
826
+
827
def get_resource_utilization_by_grouping_kube(
    grouping_func: _GenericNodeGroupingFunctionT,
    kube_client: KubeClient,
    *,
    namespace: str,
    # NOTE(review): mutable [] default — safe only because filter_slaves never
    # mutates it; consider () instead.
    filters: Sequence[_GenericNodeFilterFunctionT] = [],
    sort_func: _GenericNodeSortFunctionT = None,
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
    """Given a function used to group nodes, calculate resource utilization
    for each value of a given attribute.

    :grouping_func: a function that given a node, will return the value of an
    attribute to group by.
    :param kube_client: the Kubernetes client
    :param namespace: namespace whose pods are included in the accounting
    :param filters: filters to apply to the nodes in the calculation, with
    filtering preformed by filter_slaves
    :param sort_func: a function that given a list of nodes, will return the
    sorted list of nodes.
    :returns: a dict of {attribute_value: resource_usage}, where resource usage
    is the dict returned by ``calculate_resource_utilization_for_kube_nodes`` for
    nodes grouped by attribute value.
    """
    nodes = get_all_nodes_cached(kube_client)
    nodes = filter_slaves(nodes, filters)
    # Unlike the mesos variant, this check runs after filtering.
    if len(nodes) == 0:
        raise ValueError("There are no nodes registered in the Kubernetes.")

    node_groupings = group_slaves_by_key_func(grouping_func, nodes, sort_func)

    pods = get_all_pods_cached(kube_client, namespace)

    # Pre-bucket pods by node name so each grouping only scans its own pods.
    pods_by_node = {}
    for node in nodes:
        pods_by_node[node.metadata.name] = [
            pod for pod in pods if pod.spec.node_name == node.metadata.name
        ]
    return {
        attribute_value: calculate_resource_utilization_for_kube_nodes(
            nodes, pods_by_node
        )
        for attribute_value, nodes in node_groupings.items()
    }
869
+
870
+
871
def resource_utillizations_from_resource_info(
    total: ResourceInfo, free: ResourceInfo
) -> Sequence[ResourceUtilization]:
    """
    Given two ResourceInfo tuples, one for total and one for free,
    create a ResourceUtilization tuple for each metric in the ResourceInfo.
    :param total: total capacity per metric
    :param free: free capacity per metric
    :returns: a list with one ResourceUtilization per ResourceInfo field
    """
    utilizations = []
    for index, field in enumerate(ResourceInfo._fields):
        utilizations.append(
            ResourceUtilization(metric=field, total=total[index], free=free[index])
        )
    return utilizations
885
+
886
+
887
def has_registered_slaves(
    mesos_state: MesosState,
) -> bool:
    """Return a boolean indicating if there are any slaves registered
    to the master according to the mesos state.
    :param mesos_state: the mesos state from the master
    :returns: a boolean, indicating if there are > 0 slaves
    """
    slaves = mesos_state.get("slaves", [])
    return len(slaves) > 0
896
+
897
+
898
def get_mesos_resource_utilization_health(
    mesos_metrics: MesosMetrics, mesos_state: MesosState
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against mesos metrics.
    :param mesos_metrics: a dict exposing the mesos metrics described in
    https://mesos.apache.org/documentation/latest/monitoring/
    :param mesos_state: the mesos state from the master
    :returns: a list of HealthCheckResult tuples
    """
    # One check per resource class, plus task and slave counts.
    return [
        assert_cpu_health(get_mesos_cpu_status(mesos_metrics, mesos_state)),
        assert_memory_health(get_mesos_memory_status(mesos_metrics, mesos_state)),
        assert_disk_health(get_mesos_disk_status(mesos_metrics, mesos_state)),
        assert_gpu_health(get_mesos_gpu_status(mesos_metrics, mesos_state)),
        assert_mesos_tasks_running(mesos_metrics),
        assert_nodes_health(get_mesos_slaves_health_status(mesos_metrics)),
    ]
914
+
915
+
916
def get_kube_resource_utilization_health(
    kube_client: KubeClient,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against Kubernetes.
    :param kube_client: the Kubernetes client
    :returns: a list of HealthCheckResult tuples
    """

    # Fetch the node list once and feed it to every check.
    nodes = get_all_nodes_cached(kube_client)

    return [
        assert_cpu_health(get_kube_cpu_status(nodes)),
        assert_memory_health(get_kube_memory_status(nodes)),
        assert_disk_health(get_kube_disk_status(nodes)),
        assert_gpu_health(get_kube_gpu_status(nodes)),
        assert_nodes_health(get_kube_nodes_health_status(nodes)),
    ]
933
+
934
+
935
def get_mesos_state_status(
    mesos_state: MesosState,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against mesos state.
    :param mesos_state: a dict exposing the mesos state described in
    https://mesos.apache.org/documentation/latest/endpoints/master/state.json/
    :returns: a list of HealthCheckResult tuples
    """
    # Currently only the quorum check; the parameter is kept for interface
    # symmetry with the other get_*_status helpers.
    return [
        assert_quorum_size(),
    ]
946
+
947
+
948
def run_healthchecks_with_param(
    param: Any,
    healthcheck_functions: Sequence[Callable[..., HealthCheckResult]],
    format_options: Mapping[str, Any] = {},
) -> Sequence[HealthCheckResult]:
    """Invoke each healthcheck with *param* and optional keyword options.

    NOTE: the shared {} default is never mutated here.
    """
    results = []
    for healthcheck in healthcheck_functions:
        results.append(healthcheck(param, **format_options))
    return results
956
+
957
+
958
def assert_kube_deployments(
    kube_client: KubeClient, namespace: str
) -> HealthCheckResult:
    """Report the number of Kubernetes deployments; always healthy."""
    deployments = list_all_deployments(kube_client, namespace)
    num_deployments = len(deployments)
    return HealthCheckResult(
        message=f"Kubernetes deployments: {num_deployments:>3}", healthy=True
    )
965
+
966
+
967
def get_kube_status(
    kube_client: KubeClient, namespace: str
) -> Sequence[HealthCheckResult]:
    """Gather information about Kubernetes.
    :param kube_client: the Kubernetes client
    :param namespace: namespace to inspect
    :return: a list of HealthCheckResult tuples
    """
    checks = [assert_kube_deployments, assert_kube_pods_running]
    return run_healthchecks_with_param([kube_client, namespace], checks)
977
+
978
+
979
def critical_events_in_outputs(healthcheck_outputs):
    """Given a list of HealthCheckResults return those which are unhealthy."""
    unhealthy = []
    for result in healthcheck_outputs:
        if result.healthy is False:
            unhealthy.append(result)
    return unhealthy
986
+
987
+
988
def generate_summary_for_check(name, ok):
    """Given a check name and a boolean indicating if the service is OK, return
    a formatted message.
    """
    if ok is True:
        status = PaastaColors.green("OK")
    else:
        status = PaastaColors.red("CRITICAL")
    return f"{name} Status: {status}"
995
+
996
+
997
def status_for_results(healthcheck_results):
    """Given a list of HealthCheckResult tuples, return the ok status
    for each one.
    :param healthcheck_results: a list of HealthCheckResult tuples
    :returns: a list of booleans.
    """
    statuses = []
    for result in healthcheck_results:
        statuses.append(result.healthy)
    return statuses
1004
+
1005
+
1006
def print_results_for_healthchecks(summary, ok, results, verbose, indent=2):
    """Print *summary*, then per-check detail depending on verbosity.

    With verbose >= 1 every result is printed (failures in red); otherwise
    only failures are printed, and only when *ok* is False.
    """
    print(summary)
    if verbose >= 1:
        for result in results:
            if result.healthy:
                text = result.message
            else:
                text = PaastaColors.red(result.message)
            print_with_indent(text, indent)
    elif not ok:
        for result in critical_events_in_outputs(results):
            print_with_indent(PaastaColors.red(result.message), indent)
1018
+
1019
+
1020
def healthcheck_result_resource_utilization_pair_for_resource_utilization(
    utilization, threshold
):
    """Given a ResourceUtilization, produce a tuple of (HealthCheckResult, ResourceUtilization),
    where that HealthCheckResult describes the 'health' of a given utilization.
    :param utilization: a ResourceUtilization tuple
    :param threshold: a threshold which decides the health of the given ResourceUtilization
    :returns: a tuple of (HealthCheckResult, ResourceUtilization)
    """
    health = healthcheck_result_for_resource_utilization(utilization, threshold)
    return (health, utilization)
1033
+
1034
+
1035
def format_table_column_for_healthcheck_resource_utilization_pair(
    healthcheck_utilization_pair,
):
    """Given a tuple of (HealthCheckResult, ResourceUtilization), return a
    string representation of the ResourceUtilization such that it is formatted
    according to the value of HealthCheckResult.healthy.

    :param healthcheck_utilization_pair: a tuple of (HealthCheckResult, ResourceUtilization)
    :returns: a string representing the ResourceUtilization.
    """
    # Green when the paired healthcheck passed, red otherwise.
    color_func = (
        PaastaColors.green
        if healthcheck_utilization_pair[0].healthy
        else PaastaColors.red
    )
    utilization = (
        healthcheck_utilization_pair[1].total - healthcheck_utilization_pair[1].free
    )
    # Zero-capacity resource: report it as fully used instead of dividing by 0.
    if int(healthcheck_utilization_pair[1].total) == 0:
        utilization_perc = 100
    else:
        utilization_perc = (
            utilization / float(healthcheck_utilization_pair[1].total) * 100
        )
    if healthcheck_utilization_pair[1].metric not in ["cpus", "gpus"]:
        # Non-count metrics are humanized as byte sizes; the 1024*1024 factor
        # implies the values are MiB-scaled — TODO confirm units.
        return color_func(
            "{}/{} ({:.2f}%)".format(
                naturalsize(utilization * 1024 * 1024, gnu=True),
                naturalsize(
                    healthcheck_utilization_pair[1].total * 1024 * 1024, gnu=True
                ),
                utilization_perc,
            )
        )
    else:
        # cpus/gpus are plain counts.
        return color_func(
            "{:.2f}/{:.0f} ({:.2f}%)".format(
                utilization, healthcheck_utilization_pair[1].total, utilization_perc
            )
        )
1075
+
1076
+
1077
def format_row_for_resource_utilization_healthchecks(healthcheck_utilization_pairs):
    """Given a list of (HealthCheckResult, ResourceUtilization) tuples, return a list with each of those
    tuples represented by a formatted string.

    :param healthcheck_utilization_pairs: a list of (HealthCheckResult, ResourceUtilization) tuples.
    :returns: a list containing a string representation of each (HealthCheckResult, ResourceUtilization) tuple.
    """
    formatted = []
    for pair in healthcheck_utilization_pairs:
        formatted.append(
            format_table_column_for_healthcheck_resource_utilization_pair(pair)
        )
    return formatted
1088
+
1089
+
1090
def get_table_rows_for_resource_info_dict(
    attribute_values, healthcheck_utilization_pairs
):
    """Build a single table row: the identifying attribute cells followed by
    one formatted column per utilization pair.

    :param attribute_values: leading cells identifying the row
    :param healthcheck_utilization_pairs: a list of 2-tuples, where each tuple has the elements
        (HealthCheckResult, ResourceUtilization)
    :returns: a list of strings, representing a row in a table to be formatted.
    """
    resource_columns = format_row_for_resource_utilization_healthchecks(
        healthcheck_utilization_pairs
    )
    return attribute_values + resource_columns
1105
+
1106
+
1107
def reserved_maintenence_resources(
    resources: MesosResources,
):
    """Return the resources reserved under the maintenance role, defaulting
    to an all-zero vector when none are reserved."""
    zero_usage = {"cpus": 0, "mem": 0, "disk": 0, "gpus": 0}
    return resources.get(MAINTENANCE_ROLE, zero_usage)